# jobGrab - Project to compile job search results

## Imports

In [4]:
# Imports

from lxml import html
import requests
import bs4 as bs
import urllib.request
import regex as re
import json
from datetime import date
from datetime import timedelta
import xlsxwriter as xw
# from bitly_api import bitly_api

## Helper Functions

In [6]:
def removeNL(string): # Helper function to remove new line
    """Return ``string`` with leading and trailing line breaks removed.

    Every line-feed and carriage-return character at either end of the text
    is dropped, however many there are. Line breaks inside the text and any
    other whitespace are left untouched.
    """
    # str.strip removes the whole run at each end, so no pass limit is needed
    return string.strip("\n\r")

In [7]:
def createSoup(URL): # Create HTML parsed BS object from URL
    """Download ``URL`` and return its body parsed by BeautifulSoup.

    The page is fetched with ``urllib.request.urlopen`` and parsed with the
    'html.parser' backend. Nothing is caught here, so any error raised while
    opening or reading the URL propagates to the caller.
    """
    # The context manager closes the HTTP response even if read() fails
    with urllib.request.urlopen(URL) as response:
        source = response.read()
    return bs.BeautifulSoup(source, 'html.parser')

In [8]:
def initializeJobsToIgnore(): # Creates array of jobs to ignore
    """Read "src/jobsToIgnore.txt" and return one entry per line.

    Each line is returned without its line ending and without its first and
    last character (presumably surrounding quote characters; confirm against
    whatever writes the file). A final line that has no trailing newline is
    handled the same way as every other line.
    """
    # "with" guarantees the file is closed even if reading fails
    with open("src/jobsToIgnore.txt", "r") as file:
        rawLines = file.readlines()

    # Drop the line ending first so the [1:-1] slice only removes the
    # wrapping characters and never a real character of the entry
    return [line.rstrip("\r\n")[1:-1] for line in rawLines]

In [9]:
def checkJobsToIgnore(job, jobsToIgnore=None):
    """Return True when ``job`` should be kept, False when it is ignored.

    A job is ignored when its title concatenated with its company appears in
    ``jobsToIgnore``.

    job          -- object exposing ``title`` and ``company`` attributes; a
                    company of None is treated as an empty string
    jobsToIgnore -- collection of "title + company" strings. When omitted,
                    the list is loaded with initializeJobsToIgnore(), which
                    re-reads the ignore file on every such call.
    """
    if jobsToIgnore is None:
        # The job classes in this file call this function with the job only
        jobsToIgnore = initializeJobsToIgnore()

    # A missing title or company must not break the concatenation
    ignoreKey = (job.title or "") + (job.company or "")
    return ignoreKey not in jobsToIgnore

## Indeed Scraping

In [6]:
class IndeedJob:
    """One Indeed search result, built from the results page and its detail page.

    The constructor always sets id, posting, platform, title, company and
    valid. Only when valid is True does it also set datePosted, location,
    detailsURL, detailedSoup, description and applyLink.
    """

    def __init__(self, ID, soup):
        """Locate the posting whose "data-jk" attribute equals ``ID`` in ``soup`` and scrape it."""
        self.id = ID
        self.posting = soup.find(attrs={"data-jk": ID})
        self.platform = "Indeed"

        self.setTitle()
        self.setCompany()

        # checkJobsToIgnore is declared with the ignore list as its second
        # parameter, so the list is loaded and passed explicitly
        self.valid = checkJobsToIgnore(self, initializeJobsToIgnore())
        if not self.valid:
            return # Ignored jobs skip the detail-page download

        self.setDatePosted()
        self.setLocation()
        self.setDetailedSoup()
        self.setDescription()
        self.setApply()


    def setTitle(self):
        """Store the text of the posting's <h2 class="title"> without outer newlines."""
        self.title = removeNL(self.posting.find("h2", class_ = "title").text)


    def setDatePosted(self):
        """Convert the posting's relative date label into an mm/dd/yy string.

        A label that starts with a number ("3 days ago", "30+ days ago") is
        read as that many days before today. Any other label, including
        "Today" and "Just posted", is stored as today's date.
        """
        rawDate = self.posting.find("span", class_ = "date").text.strip()
        today = date.today()

        # Read every leading digit so the day count is never truncated
        leadingDigits = re.match(r"\d+", rawDate)
        if leadingDigits is not None:
            postedOn = today - timedelta(days=int(leadingDigits.group()))
        else:
            postedOn = today

        self.datePosted = postedOn.strftime("%m/%d/%y")


    def setLocation(self):
        """Store the text of the first element whose class contains "location"."""
        def checkLocation(class_):
            return class_ is not None and "location" in class_
        self.location = self.posting.find(class_ = checkLocation).text


    def setCompany(self):
        """Store the text of the posting's <span class="company"> without outer newlines."""
        self.company = removeNL(self.posting.find("span", class_ = "company").text)


    def setDetailedSoup(self):# Soup for the separate page for viewing job description
        """Build the view-job URL from the id and download that page."""
        self.detailsURL = "https://ca.indeed.com/viewjob" + "?jk=" + self.id
        self.detailedSoup = createSoup(self.detailsURL)


    def setDescription(self):
        """Join the <p> and <li> texts of the description div into one string.

        Paragraphs are followed by a blank line and list items by a single
        newline. The description is an empty string when the detail page has
        no element with id "jobDescriptionText".
        """
        descriptionTextDiv = self.detailedSoup.find("div", {"id" : "jobDescriptionText"})
        if descriptionTextDiv is None:
            self.description = ""
            return

        pieces = []
        for element in descriptionTextDiv.find_all(['p', 'li']):
            # Compare by value: "is" on a string only tests object identity
            separator = "\n\n" if element.name == 'p' else "\n"
            pieces.append(element.text + separator)
        self.description = "".join(pieces)


    def setApply(self):
        """Store the external apply link, falling back to the detail page URL.

        The fallback is used whenever the apply container, its inner div, or
        the anchor inside it is missing, or the anchor has no href.
        """
        self.applyLink = self.detailsURL

        applyLinkDiv = self.detailedSoup.find("div", {"id" : "viewJobButtonLinkContainer"})
        if applyLinkDiv is None:
            return

        innerDiv = applyLinkDiv.find("div", class_ = "icl-u-lg-hide")
        anchor = innerDiv.find('a') if innerDiv is not None else None
        href = anchor.get('href') if anchor is not None else None
        if href is not None:
            self.applyLink = href

In [7]:
def getIndeedJobs(searchTerm):
    """Search ca.indeed.com for ``searchTerm`` and return the kept IndeedJob objects.

    Job ids are read from the inline <script> elements of the results page,
    where they appear as jobKeysWithInfo['<16 hex characters>']. One
    IndeedJob is built per distinct id that also has a posting on the page,
    and only jobs whose ``valid`` flag is True are returned. An empty list is
    returned when the page contains no job ids.
    """
    from urllib.parse import quote_plus

    # quote_plus turns spaces into "+" and escapes any other reserved character
    soup = createSoup("https://ca.indeed.com/jobs?q=" + quote_plus(searchTerm) + "&l=Canada&sort=date")

    # This is where jobIDs are stored
    jobIDPattern = re.compile(r"jobKeysWithInfo\['([A-Fa-f0-9]{16})'\]")

    jobIDs = []
    for script in soup.find_all("script", {"src":False}):
        for jobID in jobIDPattern.findall(script.text):
            if jobID not in jobIDs: # Keep page order, drop repeated ids
                jobIDs.append(jobID)

    # Create array of jobs
    jobs = []
    for jobID in jobIDs:
        # IndeedJob reads the posting immediately, so skip ids without one
        if soup.find(attrs={"data-jk": jobID}) is None:
            continue

        job = IndeedJob(jobID, soup)
        if job.valid:
            jobs.append(job)

    return jobs

## Monster Scraping

In [64]:
class MonsterJob:
    """One Monster search result, built from its results-page section and detail page.

    The constructor sets id, posting, platform, title, titleSoup, company,
    datePosted, location, detailsURL, detailedSoup, description and
    applyLink. Unlike the other job classes in this file, no ignore-list
    check is run here and no ``valid`` attribute is set.
    """

    def __init__(self, ID, posting):
        """Scrape the job described by the results-page element ``posting``."""
        self.id = ID
        self.posting = posting
        self.platform = "Monster"

        self.setTitle()
        self.setCompany()

        self.setDatePosted()
        self.setLocation()
        self.setDetailedSoup()
        self.setDescription()
        self.setApply()


    def setTitle(self):
        """Store the <h2 class="title"> element and its text without outer newlines."""
        self.titleSoup = self.posting.find("h2", class_ = "title")
        self.title = removeNL(self.titleSoup.text)


    def setDatePosted(self):
        """Convert the posting's <time> label into an mm/dd/yy string.

        A label that starts with a number is read as that many days before
        today; any other label is stored as today's date.
        """
        rawDate = self.posting.find("time").text
        today = date.today()

        # Read every leading digit so the day count is never truncated
        leadingDigits = re.match(r"\d+", rawDate)
        if leadingDigits is not None:
            postedOn = today - timedelta(days=int(leadingDigits.group()))
        else:
            postedOn = today

        self.datePosted = postedOn.strftime("%m/%d/%y")


    def setLocation(self):
        """Store the location name, stripped of outer newlines and title-cased."""
        locationDiv = self.posting.find("div", class_ = "location")
        self.location = removeNL(locationDiv.find("span", class_ = "name").text)
        self.location = self.location.title() # Converts to proper case


    def setCompany(self):
        """Store the company name without outer newlines."""
        companyDiv = self.posting.find("div", class_ = "company")
        self.company = removeNL(companyDiv.find("span", class_ = "name").text)


    def setDetailedSoup(self):# Soup for the separate page for viewing job description
        """Download the detail page linked from the title; store None on failure."""
        self.detailsURL = self.titleSoup.find("a").get("href")
        try: # Causes error if non ASCII characters are in the link
            self.detailedSoup = createSoup(self.detailsURL)
        except Exception:
            # Best effort: a failed download leaves the job without details.
            # "except Exception" still lets KeyboardInterrupt/SystemExit through.
            self.detailedSoup = None


    def setDescription(self):
        """Join the <p> and <li> texts of the detail page into one string.

        The description is None when the detail page could not be loaded.
        Paragraphs are followed by a blank line and list items by a single
        newline.
        """
        if self.detailedSoup is None:
            self.description = None
            return

        textElements = self.detailedSoup.find_all(['p', 'li'])

        # Slices to only get necessary text: the first 14 matches and the
        # last one are dropped
        pieces = []
        for element in textElements[14:-1]:
            # Compare by value: "is" on a string only tests object identity
            separator = "\n\n" if element.name == 'p' else "\n"
            pieces.append(element.text + separator)
        self.description = "".join(pieces)


    def setApply(self):
        """Extract the apply URL embedded in the detail page, or store None.

        The URL is taken from between "applyOnlineUrl" and "applyType" in the
        page source. If it is wrapped in an ad.doubleclick.net redirect, only
        the part after the redirect's "?" is kept. None is stored when there
        is no detail page or the page carries no such URL.
        """
        if self.detailedSoup is None:
            self.applyLink = None
            return

        applyLinkHTML = str(self.detailedSoup)
        if "applyOnlineUrl" not in applyLinkHTML:
            self.applyLink = None
            return

        applyLinkRawPattern = re.compile(r"^(.+?)applyOnlineUrl\":\"(.+?)\",\"applyType(.+?)$", re.MULTILINE | re.DOTALL) # Regex for where the URL is stored

        applyLinkMatch = re.search(applyLinkRawPattern, applyLinkHTML)
        if applyLinkMatch is None:
            # The marker is present but not in the expected layout
            self.applyLink = None
            return

        # Removes u002F characters; the backslash left behind by each escape
        # is turned into "/" further down
        applyLinkRaw = applyLinkMatch.group(2).replace("u002F", "")

        # If there is an ad link, remove the ad part
        applyLinkAdPattern = re.compile(r"^https:(.+?)ad.doubleclick.net(.+?)\?(.+?)$", re.MULTILINE | re.DOTALL)
        applyLinkAdRaw = re.search(applyLinkAdPattern, applyLinkRaw)

        # Set the final apply links
        if applyLinkAdRaw is None:
            self.applyLink = applyLinkRaw
        else:
            self.applyLink = applyLinkAdRaw.group(3)

        self.applyLink = self.applyLink.replace("\\", "/")

        if len(self.applyLink) >= 250:
            # NOTE(review): shortenLink is not defined in the visible part of
            # this file; confirm it is provided elsewhere before relying on it
            self.applyLink = shortenLink(self.applyLink)

  if (element.name is 'p'):


In [46]:
def getMonsterJobs(searchTerm):
    """Search monster.ca for ``searchTerm`` and return a list of MonsterJob objects.

    One MonsterJob is created for every <section> element of the results
    page that carries a "data-jobid" attribute, in page order.
    """
    # The Monster query uses a hyphen wherever the search term has a space
    hyphenatedTerm = "-".join(searchTerm.split(" "))

    resultsPage = createSoup("https://www.monster.ca/jobs/search/?q=" + hyphenatedTerm + "&where=Canada")

    # Sections without a job id are not postings and are skipped
    postings = []
    for section in resultsPage.find_all("section"):
        postingID = section.get("data-jobid")
        if postingID is None:
            continue
        postings.append(MonsterJob(postingID, section))

    return postings

## LinkedIn Scraping

In [10]:
class LinkedInJob:
    """One LinkedIn search result, built from its results-page card and detail page.

    The constructor always sets id, posting, platform, title, company and
    valid. Only when valid is True does it also set detailsURL,
    detailedSoup, datePosted, location, description and applyLink.
    """

    def __init__(self, ID, posting):
        """Scrape the job described by the results-page element ``posting``."""
        self.id = ID
        self.posting = posting
        self.platform = "LinkedIn"

        self.setTitle()
        self.setCompany()

        # checkJobsToIgnore is declared with the ignore list as its second
        # parameter, so the list is loaded and passed explicitly
        self.valid = checkJobsToIgnore(self, initializeJobsToIgnore())
        if not self.valid:
            return # Ignored jobs skip the detail-page download

        self.setDetailedSoup()
        self.setDatePosted()
        self.setLocation()
        self.setDescription()
        self.setApply()


    def setTitle(self):
        """Store the text of the card's <span class="screen-reader-text">."""
        self.title = self.posting.find("span", class_ = "screen-reader-text").text


    def setDatePosted(self):
        """Convert the card's relative date label into an mm/dd/yy string.

        Labels mentioning hours, minutes or seconds, and labels that do not
        start with a number, are stored as today's date. Otherwise the
        leading number is scaled by its unit: days as is, weeks as 7 days,
        months as 30 days and years as 365 days (the last two are
        approximations). An unrecognised unit is treated as days.
        """
        def checkDateClass(class_):
            return class_ is not None and "listdate" in class_

        rawDate = self.posting.find("time", class_ = checkDateClass).text.strip()
        today = date.today()

        # Group 1 is the count, group 2 the unit word that follows it
        ageMatch = re.match(r"(\d+)[^A-Za-z]*([A-Za-z]*)", rawDate)

        if "hour" in rawDate or ageMatch is None:
            daysAgo = 0
        else:
            count = int(ageMatch.group(1))
            unit = ageMatch.group(2).lower()
            if unit.startswith(("minute", "second")):
                daysAgo = 0
            elif unit.startswith("week"):
                daysAgo = 7 * count
            elif unit.startswith("month"):
                daysAgo = 30 * count
            elif unit.startswith("year"):
                daysAgo = 365 * count
            else:
                daysAgo = count

        self.datePosted = (today - timedelta(days=daysAgo)).strftime("%m/%d/%y")


    def setLocation(self):
        """Store the text of the card's <span class="job-result-card__location">."""
        self.location = self.posting.find("span", class_ = "job-result-card__location").text


    def setCompany(self):
        """Store the company name from the card's subtitle <h4>, or None if absent."""
        def checkCompanyClass(class_):
            return class_ is not None and "result-card__subtitle" in class_

        companyDiv = self.posting.find("h4", class_ = checkCompanyClass)
        if companyDiv is not None:
            self.company = companyDiv.text
        else:
            self.company = None


    def setDetailedSoup(self):# Soup for the separate page for viewing job description
        """Download the detail page linked from the card."""
        self.detailsURL = self.posting.find("a", class_ = "result-card__full-card-link").get("href")
        self.detailedSoup = createSoup(self.detailsURL)


    def setDescription(self):
        """Join the <p> and <li> texts of the description section into one string.

        Paragraphs are followed by a blank line and list items by a single
        newline. The description is None when the detail page has no
        <section class="description">.
        """
        descriptionSection = self.detailedSoup.find("section", class_ = "description")
        if descriptionSection is None:
            self.description = None
            return

        pieces = []
        for element in descriptionSection.find_all(['p', 'li']):
            # Compare by value: "is" on a string only tests object identity
            separator = "\n\n" if element.name == 'p' else "\n"
            pieces.append(element.text + separator)
        self.description = "".join(pieces)


    def setApply(self):
        """Store the shortened external apply link, or the detail page URL if none."""
        applyDiv = self.detailedSoup.find("a", class_ = "apply-button apply-button--link")
        if applyDiv is not None:
            # NOTE(review): shortenLink is not defined in the visible part of
            # this file; confirm it is provided elsewhere before relying on it
            self.applyLink = shortenLink(applyDiv.get("href")) # Will enact when in production to prevent wasted API calls
        else:
            self.applyLink = self.detailsURL
        


In [11]:
def getLinkedInJobs(searchTerm):
    """Search LinkedIn for ``searchTerm`` and return the kept LinkedInJob objects.

    One LinkedInJob is created for every <li> whose "data-entity-urn"
    attribute contains "jobPosting" and that carries a "data-id" attribute.
    Only jobs whose ``valid`` flag is True are returned. An empty list is
    returned when the results page cannot be downloaded.
    """
    from urllib.parse import quote

    # Percent-encode the whole term: spaces become %20 and other reserved
    # characters are escaped as well
    filteredTerm = quote(searchTerm, safe="")

    URL = "https://www.linkedin.com/jobs/search/?geoId=101174742&keywords=" + filteredTerm + "&location=Canada&sortBy=DD&f_TP=1%2C2&redirect=false&position=1&pageNum=0"

    try: # Perhaps handle differently later
        soup = createSoup(URL)
    except Exception:
        # Best effort: an unreachable results page yields no jobs.
        # "except Exception" still lets KeyboardInterrupt/SystemExit through.
        return []

    def postingCheck(entityUrn):
        return entityUrn is not None and "jobPosting" in entityUrn
    jobPostings = soup.find_all("li", {"data-entity-urn" : postingCheck}) #jobPostings follow this

    # Creating jobs array
    jobs = []
    for posting in jobPostings:
        jobID = posting.get("data-id")
        if jobID is None:
            continue

        job = LinkedInJob(jobID, posting)
        if job.valid:
            jobs.append(job)

    return jobs

## Getting List of Jobs

In [12]:
# Imports

import string
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [13]:
# Query sent to every job board below
searchTerm = "Engineering Intern"
# Each call downloads that site's results and returns a list of job objects
indeedJobs =  getIndeedJobs(searchTerm)
monsterJobs = getMonsterJobs(searchTerm)
linkedInJobs = getLinkedInJobs(searchTerm)

In [14]:
# Single combined list, ordered Indeed, then Monster, then LinkedIn
jobs = indeedJobs + monsterJobs + linkedInJobs

In [15]:
def removeSimilarJobs(jobs):  # Takes in a jobs array and returns the array with duplicate jobs removed
    """Remove near-duplicate postings from ``jobs`` in place and return the list.

    Descriptions are lower-cased, stripped of punctuation and English stop
    words, turned into word-count vectors and compared pairwise. A job is
    removed when its cosine similarity to any job earlier in the list is
    above 0.9, so the first job of each group of duplicates is the one kept.

    Jobs whose description is None are treated as having an empty
    description, which never counts as similar to anything. Lists with fewer
    than two jobs, or whose descriptions contain no usable words, are
    returned unchanged.
    """
    stopwordSet = set(stopwords) # Set membership is O(1) per word

    def cleanString(description): # Helper function - Normalizes text
        if description is None:
            return ""

        description = ''.join([char for char in description if char not in string.punctuation])
        description = description.lower()

        return ' '.join([word for word in description.split() if word not in stopwordSet])

    if len(jobs) < 2:
        return jobs # Nothing to compare

    cleanDescriptions = [cleanString(job.description) for job in jobs]

    try:
        vectors = CountVectorizer().fit_transform(cleanDescriptions)
    except ValueError:
        # CountVectorizer raises this when the vocabulary is empty, i.e. no
        # description contains a word it can count
        return jobs

    # One call yields the whole job-by-job similarity matrix (values 0-1)
    similarities = cosine_similarity(vectors)

    indicesToRemove = set()
    for i in range(len(jobs)):
        for j in range(i + 1, len(jobs)):
            if similarities[i][j] > 0.9:
                indicesToRemove.add(j)

    # Slice assignment mutates the caller's list, as the original did
    jobs[:] = [job for index, job in enumerate(jobs) if index not in indicesToRemove]

    return jobs

In [16]:
# Drop postings whose descriptions are near-identical to an earlier one
jobs = removeSimilarJobs(jobs)

## Writing to Excel File

In [17]:
# One spreadsheet row per job; the field order matches the table columns
# (title, company, date posted, location, description, apply link, platform)
data = [
    [
        job.title,
        job.company,
        job.datePosted,
        job.location,
        job.description,
        job.applyLink,
        job.platform,
    ]
    for job in jobs
]

In [20]:
# Destination of the macro-enabled workbook
fileSavePath = 'C:/Users/Rahul Behal/Desktop/jobs.xlsm'

workbook = xw.Workbook(fileSavePath)
worksheet = workbook.add_worksheet()

# Table geometry: one header row plus one row per job, seven columns (A-G)
tableLength = str(len(jobs) + 1)
tableRange = 'A1:G' + tableLength
tableDataRange = 'A2:G' + tableLength

columnHeaders = ['Position Title',
                 'Company',
                 'Date Posted',
                 'Location',
                 'Description',
                 'Apply Link',
                 'Platform']

worksheet.add_table(tableRange, {'data': data,
                                 'columns': [{'header' : header} for header in columnHeaders],
                                 'style' : "Table Style Light 8"
                                 })

# Conditional formatting based on platform: each row is tinted according
# to the value in its Platform column (G)
platformColours = [('=$G2="LinkedIn"', '#FFB3B3'),
                   ('=$G2="Indeed"', '#B3FFCC'),
                   ('=$G2="Monster"', '#CC80FF')]

for criteria, colour in platformColours:
    worksheet.conditional_format(tableDataRange, {'type':     'formula',
                                                  'criteria': criteria,
                                                  'format':   workbook.add_format({'bg_color' : colour})})

# Column formatting
cFormat = workbook.add_format({'align' : 'right', 'num_format' : 'mm/dd/yy'}) # Column C (Date)
eFormat = workbook.add_format({'text_wrap' : True, 'valign' : 'top'}) # Column E (Description)

# Width and optional cell format for columns A through G, in order
columnLayout = [(50, None),
                (30, None),
                (12, cFormat),
                (15, None),
                (70, eFormat),
                (11, None),
                (10, None)]

for columnIndex, (width, cellFormat) in enumerate(columnLayout):
    worksheet.set_column(columnIndex, columnIndex, width, cellFormat)

# Adding VBA macro button for closing, placed two rows below the table
workbook.add_vba_project('./vbaProject.bin')

buttonCell = 'E' + str(int(tableLength) + 2)
worksheet.insert_button(buttonCell, {'macro':   'addJobsToIgnore.addJobsToIgnore',
                                     'caption': 'Done',
                                     'width':   200,
                                     'height':  100})

workbook.close()