In [None]:
# Import the required libraries
import bs4
import urllib3
import csv
import numpy as np
urllib3.disable_warnings()
http = urllib3.PoolManager()

# nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.probability import FreqDist

#### Sanitizing and Sorting Output

In [None]:
def sanitize_input(data):
    """Flatten whitespace control characters in ``data``.

    Form feeds, tabs and newlines are each turned into a single space,
    while carriage returns are removed entirely.

    Args:
        data: The text to clean (a ``str``).

    Returns:
        A new string with the substitutions applied.
    """
    # First two arguments pair up character -> replacement; the third
    # argument lists characters that are deleted outright.
    table = str.maketrans('\f\t\n', '   ', '\r')
    return data.translate(table)

def remove_html_tags(text):
    """Strip anything that looks like an HTML tag from ``text``.

    Every shortest ``<...>`` run is deleted. Because ``.`` does not match
    a newline here, a tag that spans several lines is left in place.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with the matched tags removed.
    """
    # Kept as a function-local import, exactly like the original.
    import re
    tag_pattern = '<.*?>'
    return re.sub(tag_pattern, '', text)

def tokenize_content(content):
    """Split ``content`` into lower-cased tokens, minus stop words.

    Args:
        content: The text to tokenize.

    Returns:
        list: Tokens from ``word_tokenize(content.lower())`` that are
        neither English stop words nor single punctuation characters.
    """
    # Exclusion set = NLTK English stop words + every character in
    # string.punctuation.
    excluded = set(stopwords.words('english'))
    excluded.update(punctuation)

    tokens = word_tokenize(content.lower())
    return list(filter(lambda token: token not in excluded, tokens))

def word_freq(content):
    """Build a frequency distribution of the meaningful words in ``content``.

    Args:
        content: The text to analyse.

    Returns:
        FreqDist: Counts of the tokens produced by ``tokenize_content``.
    """
    return FreqDist(tokenize_content(content))

def filterWords(allWords):
    """Drop filler words and purely numeric tokens from ``allWords``.

    Matching against the filler list is exact and case-sensitive.

    Args:
        allWords: Iterable of string tokens.

    Returns:
        list: The remaining tokens, in their original order.
    """
    removewords = (
        'p', 'div', 'while', 'total', 'been', 'e', 'our', 'like', 'new',
        'which', 'help', 's', 'all', 'some', 'if', 'what', 'about', 'only',
        'on', 'then', 'will', 'no', 'at', 'a', 'for', 'us', 'not', 'etc',
        'we', 'that', 'it', 'the', 'of', 'as', 'an', 'may', 'have', 'has',
        'this', 'other', 'from', 'with', 'its', 'be', 'in', 'is', 'am',
        'now', 'you', 'some', 'was', 'can', 'are', 'but', 'they', 'he',
        'she', 'where', 'when', 'and', 'or', 'them', 'how', 'by', 'to',
    )
    kept = []
    for token in allWords:
        if token in removewords:
            continue
        # Tokens made up only of digits carry no meaning here.
        if str.isdigit(token):
            continue
        kept.append(token)
    return kept
def filtercharacter(string):
    """Replace selected punctuation characters in ``string`` with spaces.

    Args:
        string: The text to clean; it is walked one element at a time.

    Returns:
        str: The text with every listed character swapped for a single
        space, so the overall length is unchanged.
    """
    removechar = ['"', "'", '?', ',', '‘', '’', '-', '(', ')', ':', '—', '/', '<', '>']
    return ''.join(' ' if ch in removechar else ch for ch in string)

#### Link of Pages with list of JDs

In [None]:
def url_query(job, pages=2):
    """Build the Indeed India search-result URLs for a job title.

    Args:
        job: Job title to search for. Surrounding whitespace is ignored.
        pages: Number of result pages to generate URLs for. The default
            of 2 (the first page plus one follow-up page) matches the
            previously hard-coded behaviour. Values below 1 still yield
            the first page.

    Returns:
        list of str: The first-page URL, followed by one URL per extra
        page carrying ``&start=10``, ``&start=20``, ...
    """
    from urllib.parse import quote_plus

    base_url = 'https://www.indeed.co.in/jobs?l=India&q='
    # quote_plus maps spaces to '+' (as before) and additionally
    # percent-encodes characters such as '+', '&' and '#', which would
    # otherwise be misread as part of the query-string syntax.
    job_url_default = base_url + quote_plus(job.strip())

    Url_List = [job_url_default]
    job_url_page = job_url_default + '&start='
    for i in range(1, pages):
        # The start offset advances in steps of 10 per page.
        Url_List.append(job_url_page + str(10 * i))
    return Url_List

#### Extract JD from given List

In [None]:
def JDfrompage(page_url):
    """Collect the job-posting URLs listed on one search-result page.

    Args:
        page_url: URL of a search-result page.

    Returns:
        list of str: One ``viewjob`` URL for each result card that carries
        a ``data-jk`` attribute. The list is empty when the page does not
        answer with HTTP 200.
    """
    r = http.request('GET', page_url)
    if r.status != 200:
        # Same policy as get_JDs: a failed request contributes nothing
        # rather than being parsed as if it were a result page.
        return []

    soup = bs4.BeautifulSoup(r.data, "lxml")
    all_div = soup.find_all('div', {"class": "jobsearch-SerpJobCard"})

    # .get() returns None for a card without the attribute; skipping
    # those avoids a TypeError when building the URL.
    job_keys = (div.get('data-jk') for div in all_div)
    return ['https://www.indeed.co.in/viewjob?jk=' + jk for jk in job_keys if jk]

#### All JD of given job

In [None]:
def get_JDs(JD_URL_List):
    """Download job postings and return their cleaned description text.

    Each URL is printed as it is fetched. Pages that do not answer with
    HTTP 200, or that lack a non-empty
    ``div.jobsearch-JobComponent-description`` element, are skipped.

    Args:
        JD_URL_List: Iterable of job-posting URLs.

    Returns:
        list of str: One entry per usable page, taken from the first
        child of the description element, with control characters
        normalised and HTML tags stripped.
    """
    res = []
    for url in JD_URL_List:
        print(url)
        r = http.request('GET', url)
        if r.status != 200:
            continue

        soup = bs4.BeautifulSoup(r.data, "lxml")
        JD = soup.find('div', {"class": "jobsearch-JobComponent-description"})
        if JD is None or not JD.contents:
            # A 200 response can still lack the description container, or
            # have it empty. Skip it instead of aborting the whole scrape
            # with AttributeError / IndexError.
            continue

        JD_content = JD.contents[0].encode('utf-8').decode()
        JD_content = sanitize_input(JD_content)
        JD_content = remove_html_tags(JD_content)
        res.append(JD_content)
    return res

#### Creating list of all jobs and Skills

In [None]:
# Job titles to search for; the scraping loop runs once per entry.
Jobtype = [
    "Android Developer",
    "Web Developer",
    "Consultant",
    "Finance",
    "Data Scientist",
    "Business Analyst",
    "Designer",
    "Mechanical Engineer",
    "Electrical Engineer",
    "Civil Engineer",
    "Chemical Engineer",
    "Software Developer",
    "Game Developer",
]

# Terms looked up inside each job description.
# NOTE(review): despite the name, these entries look like city names
# rather than skills - confirm this is intended.
Skills = [
    'Mumbai',
    'Delhi',
    'Bangalore',
    'Kochi',
    'Noida',
]

In [None]:
# Maps each job title to the list of job-description texts scraped for it.
JobAndJDs = {}
for job in Jobtype:
    # Gather the posting links from every search-result page of this job.
    JD_URL_List = []
    for url in url_query(job):
        JD_URL_List.extend(JDfrompage(url))

    all_JD = get_JDs(JD_URL_List)
    JobAndJDs[job] = all_JD
    print(job, ":- Total JDs = ", len(all_JD), sep="")

In [None]:
def skillinJd(jd, skills):
    """Flag which of ``skills`` occur in the job description ``jd``.

    The lookup uses ``jd.count``, so matching is case-sensitive.

    Args:
        jd: The job-description text.
        skills: Sequence of terms to look for.

    Returns:
        numpy.ndarray: One float per entry of ``skills`` - ``1.0`` when
        the term occurs at least once in ``jd``, otherwise ``0.0``.
    """
    flags = [1.0 if jd.count(term) > 0 else 0.0 for term in skills]
    return np.array(flags)

In [None]:
# For every career, the fraction of its job descriptions that mention each
# entry of Skills (one float per entry, in the same order as Skills).
CareerSkillMatrix = {}
for career in JobAndJDs.keys():
    SkillMatrix = np.zeros(len(Skills))
    for jd in JobAndJDs[career]:
        SkillMatrix = np.add(SkillMatrix, skillinJd(jd, Skills))
    # Normalise by this career's own JD count. The original indexed with an
    # undefined name here. A career with no JDs keeps its all-zero vector
    # instead of dividing by zero.
    if JobAndJDs[career]:
        SkillMatrix = np.divide(SkillMatrix, len(JobAndJDs[career]))
    CareerSkillMatrix[career] = SkillMatrix

In [None]:
# All_Resume = {}
# x = ("Machine learning is an application of artificial intelligence (AI) that provides systems the ability to automatically learn and improve from experience without being explicitly programmed. Machine learning focuses on the development of computer programs that can access data and use it learn for themselves. The process of learning begins with observations or data, such as examples, direct experience, or instruction,"
#      "in order to look for patterns in data and make better decisions in the future based on the examples that we provide. The primary aim is to allow the computers learn automatically without human intervention or assistance and adjust actions accordingly. Some machine learning methods Machine learning algorithms are often categorized as supervised or unsupervised. Supervised machine learning algorithms can apply what ha"
#      "s been learned in the past to new data using labeled examples to predict future events. Starting from the analysis of a known training dataset, the learning algorithm produces an inferred function to make predictions about the output values. The system is able to provide targets for any new input after sufficient training. The learning algorithm can also compare its output with the correct, intended output and find "
#      "errors in order to modify the model accordingly. In contrast, unsupervised machine learning algorithms are used when the information used to train is neither classified nor labeled. Unsupervised learning studies how systems can infer a function to describe a hidden structure from unlabeled data. The system doesn’t figure out the right output, but it explores the data and can draw inferences from datasets to describe"
#      "hidden structures from unlabeled data. Semi-supervised machine learning algorithms fall somewhere in between supervised and unsupervised learning, since they use both labeled and unlabeled data for training – typically a small amount of labeled data and a large amount of unlabeled data. The systems that use this method are able to considerably improve learning accuracy. Usually, semi-supervised learning is chosen when"
#      " the acquired labeled data requires skilled and relevant resources in order to train it / learn from it. Otherwise, acquiringunlabeled data generally doesn’t require additional resources. Reinforcement machine learning algorithms is a learning method that interacts with its environment by producing actions and discovers errors or rewards. Trial and error search and delayed reward are the most relevant characteristics of"
#      "reinforcement learning. This method allows machines and software agents to automatically determine the ideal behavior within a specific context in order to maximize its performance. Simple reward feedback is required for the agent to learn which action is best; this is known as the reinforcement signal."
#      "Machine learning enables analysis of massive quantities of data. While it generally delivers faster, more accurate results in order to identify profitable opportunities or dangerous risks, it may also require additional time and resources to train it properly. Combining machine learning with AI and cognitive technologies can make it even more effective in processing large volumes of information.")
# y = ("Machine learning Vaniyambadi, Tamil Nadu Secure a responsible career opportunity to fully utilize my training and skills, while making a significant contribution to the success of the company"
#      "Work Experience Machine learning traing Present first position Education BCA in art Concordia Higher Secondary School Vaniyambadi, Tamil Nadu 2016 to 2019 S.S.L.C in State Board Concordia Higher Secondary School Vaniyambadi, Tamil Nadu"
#      "2014 Skills C++ (Less than 1 year), CSS (Less than 1 year), HTML (Less than 1 year), JAVA (Less than 1 year), JAVASCRIPT (Less than 1 year) Additional Information Technical Skill • Hardware And Networking • Typing Lower • C, C++, JAVA, VISUAL PROGRAMMING, JAVASCRIPT, HTML, CSS"
#      "• OS-LINUX BASIC OPERTORS")
# All_Resume['Machine Learning'] = [x,y]
# All_Resume['Deep Learning'] = [x,y]
# Total_Resume = 4
# Start with no resumes loaded and a matching count of zero.
# NOTE(review): the commented-out sample above suggests All_Resume maps a
# label to a list of resume texts - confirm against later cells.
All_Resume, Total_Resume = {}, 0