In [1]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import html2text
from datetime import date
from datetime import time
from datetime import datetime
from datetime import timedelta
import csv
import re
from fuzzywuzzy import fuzz
import threading
import concurrent.futures

In [2]:
import sys
import logging
# route log records to stdout so they appear in the notebook output;
# INFO level means the logger.debug() calls in this file stay silent
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
logger = logging.getLogger("jobExecutor")

In [3]:
# define a job object
class Job(object):
    """A single job posting collected from a job board.

    The class-level values below are defaults; the scraper functions
    overwrite them per instance after construction.
    """
    name = ""
    url = ""
    description = ""
    company = ""
    location = ""
    posted = ""        # replaced with a datetime.date by the scrapers
    closes = ""        # replaced with a datetime.date by the scrapers
    stage = "New"
    fuzzyWuzzy = 0     # fuzz.ratio score of the title against the search term

    def __init__(self, name):
        """Create a job with the given title; all other fields keep their defaults."""
        self.name = name
        logger.debug(f"Job {self.name} created")

    # mostly just for quick debugging
    def __str__(self):
        return f'Name: {self.name}\n URL: {self.url}'

    @staticmethod
    def _formatDate(value):
        """Format a date as MM/DD/YYYY; tolerate unset or non-date values.

        The defaults for posted/closes are empty strings, which have no
        strftime(), so fall back to the plain string (or "" when unset)
        instead of raising AttributeError.
        """
        if hasattr(value, "strftime"):
            return value.strftime("%m/%d/%Y")
        return str(value) if value else ""

    def getDataloader(self):
        """Return this job as one CSV row.

        Returns:
            [name, url, description, posted, closes, stage] with the two
            dates rendered as MM/DD/YYYY strings ("" when not set).
        """
        return [self.name,
                self.url,
                self.description,
                self._formatDate(self.posted),
                self._formatDate(self.closes),
                self.stage]


In [4]:
# create a subclass for indeed jobs
class IndeedJob(Job):
    """Job found on Indeed; can fetch its own full description."""

    def retrieveDetails(self, timeout=30):
        """Download the posting at self.url and store its description.

        Best-effort: any failure is logged and self.description is left
        unchanged, so one bad posting never stops the others.

        Args:
            timeout: seconds to wait for the server before giving up, so a
                stalled connection cannot block a worker thread forever.
        """
        logger.debug(f"retrieving details from {self.url}")
        try:
            response = requests.get(self.url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            div = soup.find('div', attrs={'class':'jobsearch-JobComponent-description'})
            if div is None:
                # page layout changed or we were served something else
                logger.warning(f"IndeedJob.retrieveDetails: no description found\n{self}")
                return
            self.description = html2text.html2text(div.prettify())
        except Exception as e:
            logger.warning(f"IndeedJob.retrieveDetails exception: {e}\n{self}")
            
# create a subclass for career builder jobs
class CareerBuilderJob(Job):
    """Job found on CareerBuilder; can fetch its description, company and location."""

    def retrieveDetails(self, timeout=30):
        """Download the posting at self.url and fill in the remaining fields.

        Sets self.description, self.company and self.location. Best-effort:
        a field whose markup is missing is logged and left unchanged, and
        any other failure is logged rather than raised.

        Args:
            timeout: seconds to wait for the server before giving up, so a
                stalled connection cannot block a worker thread forever.
        """
        logger.debug(f"retrieving details from {self.url}")
        try:
            # follow the link for the rest of the details
            response = requests.get(self.url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # the description text sits two divs deep inside #jdp_description
            desc = soup.find('div',  attrs={'id':'jdp_description'})
            inner = None
            if desc is not None and desc.div is not None:
                inner = desc.div.div
            if inner is None:
                logger.warning(f"CareerBuilderJob.retrieveDetails: no description found\n{self}")
            else:
                self.description = html2text.html2text(inner.prettify())

            # the company and location are buried in unidentified spans
            details = soup.find('div', class_="data-details")
            spans = details.find_all('span') if details is not None else []
            if len(spans) >= 2:
                self.company = spans[0].text
                self.location = spans[1].text
            else:
                logger.warning(f"CareerBuilderJob.retrieveDetails: no company/location found\n{self}")
        except Exception as e:
            logger.warning(f"CareerBuilderJob.retrieveDetails exception: {e}\n{self}")
            
# create a subclass for monster jobs
class MonsterJob(Job):
    """Job found on Monster; can fetch its own full description."""

    def retrieveDetails(self, timeout=30):
        """Download the posting at self.url and store its description.

        Best-effort: any failure is logged and self.description is left
        unchanged, so one bad posting never stops the others.

        Args:
            timeout: seconds to wait for the server before giving up, so a
                stalled connection cannot block a worker thread forever.
        """
        logger.debug(f"retrieving details from {self.url}")
        try:
            response = requests.get(self.url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            div = soup.find('div', attrs={'name':'sanitizedHtml'})
            if div is None:
                # page layout changed or we were served something else
                logger.warning(f"MonsterJob.retrieveDetails: no description found\n{self}")
                return
            self.description = html2text.html2text(div.prettify())
        except Exception as e:
            logger.warning(f"MonsterJob.retrieveDetails exception: {e}\n{self}")

            
# create a subclass for glass door jobs
class GlassDoorJob(Job):
    """Job found on Glassdoor; can fetch its own full description."""

    def retrieveDetails(self, timeout=30):
        """Download the posting at self.url and store its description.

        The request is sent with a browser-like user-agent header.
        Best-effort: any failure is logged and self.description is left
        unchanged, so one bad posting never stops the others.

        Args:
            timeout: seconds to wait for the server before giving up, so a
                stalled connection cannot block a worker thread forever.
        """
        logger.debug(f"retrieving details from {self.url}")
        try:
            hdrs = {'user-agent': 'Mozilla/5.0'}
            response = requests.get(self.url, headers=hdrs, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            div = soup.find('div', attrs={'id':'JobDescriptionContainer'})
            if div is None:
                # page layout changed or we were served something else
                logger.warning(f"GlassDoorJob.retrieveDetails: no description found\n{self}")
                return
            self.description = html2text.html2text(div.prettify())
        except Exception as e:
            logger.warning(f"GlassDoorJob.retrieveDetails exception: {e}\n{self}")
            


In [5]:
# get the jobs from indeed
def getIndeedJobs(what, maxAge=1, timeout=30):
    """Scrape one page of Indeed search results for remote jobs.

    Args:
        what: search phrase; each job title is also fuzzy-matched against it.
        maxAge: value sent as the ``fromage`` query parameter.
        timeout: seconds to wait for Indeed before giving up.

    Returns:
        A list of IndeedJob objects with url, company, location, posted,
        closes (posted + 2 weeks) and fuzzyWuzzy set. Descriptions are not
        fetched here; call retrieveDetails() on each job for that.

    Raises:
        requests.RequestException: if the search page cannot be fetched
            (including when the timeout expires).
    """
    from urllib.parse import quote_plus

    # encode the phrase so spaces, '&', '#', ... cannot corrupt the query string
    request = f"https://www.indeed.com/jobs?q={quote_plus(what)}&l=remote&fromage={maxAge}"
    baseURL = "https://www.indeed.com"
    iJobs = []

    logger.debug(f"getting jobs from {request}")

    # create the request and soup objects
    page = requests.get(request, timeout=timeout)
    soup = BeautifulSoup(page.text, "html.parser")

    # for every job card
    for card in soup.find_all('div', attrs={'data-tn-component':'organicJob'}):
        # find the job details
        a = card.find('a', attrs={'data-tn-element':'jobTitle'})
        c = card.find('span', attrs={'class':'company'})
        l = card.find('span', attrs={'class':'location'})
        d = card.find('span', attrs={'class':'date'})

        # without a title link there is nothing to record
        if a is None or not a.get('href'):
            logger.info("indeed: skipping a card without a title link")
            continue
        title = a.text.strip()

        # figure out the dates: a leading number is the age in days,
        # "just posted"/"today" means zero, anything else is unusable
        ageText = d.text.strip() if d is not None else ""
        ds = re.findall(r"^\d+", ageText)
        if ds:
            age = int(ds[0])
        elif ageText.upper() in ("JUST POSTED", "TODAY"):
            age = 0
        else:
            logger.info(f"too old: [{ageText}] {title}")
            continue
        posted = date.today() - timedelta(days=age)
        closes = posted + timedelta(weeks=2)

        # get the fuzzy wuzzy score for refining
        f = fuzz.ratio(what, title)
        logger.debug(f"indeed: [{f}] {title}")

        # create a job object
        j = IndeedJob(title)
        j.url = f"{baseURL}{a['href']}"
        j.company = c.text.strip() if c is not None else ""
        j.location = l.text.strip() if l is not None else ""
        j.posted = posted
        j.closes = closes
        j.fuzzyWuzzy = f
        # the rest of the details are fetched later in another thread,
        # so just add the job object to our array of jobs
        iJobs.append(j)

    return iJobs

In [6]:
# get the jobs from career builder
def getCareerBuilderJobs(what, maxAge=1, timeout=30):
    """Scrape one page of CareerBuilder work-from-home search results.

    Args:
        what: search phrase; each job title is also fuzzy-matched against it.
        maxAge: value sent as the ``posted`` query parameter; results older
            than this many days are dropped as well.
        timeout: seconds to wait for CareerBuilder before giving up.

    Returns:
        A list of CareerBuilderJob objects with url, posted, closes
        (posted + 2 weeks) and fuzzyWuzzy set. Description, company and
        location are not fetched here; call retrieveDetails() for those.

    Raises:
        requests.RequestException: if the search page cannot be fetched
            (including when the timeout expires).
    """
    from urllib.parse import quote_plus

    # encode the phrase so spaces, '&', '#', ... cannot corrupt the query string
    request = f"https://www.careerbuilder.com/jobs?posted={maxAge}&pay=&cat1=&radius=&emp=&cb_apply=false&keywords={quote_plus(what)}&location=&cb_workhome=true"
    baseURL = "https://www.careerbuilder.com"
    cJobs = []

    logger.debug(f"getting jobs from {request}")

    # create the request and soup objects
    page = requests.get(request, timeout=timeout)
    soup = BeautifulSoup(page.text, "html.parser")

    for card in soup.find_all('div', class_="data-results-content-parent"):
        # find the particulars
        n = card.find('div', class_="data-results-title")
        a = card.find('a', class_="data-results-content")
        d = card.find('div', class_="data-results-publish-time")

        # without a title and a link there is nothing to record
        if n is None or a is None or not a.get('href'):
            logger.info("career builder: skipping a card without a title or link")
            continue
        title = n.text.strip()

        # a leading number is the age in days, "today" means zero,
        # anything else is unusable
        ageText = d.text.strip() if d is not None else ""
        ds = re.findall(r"^\d+", ageText)
        if ds:
            age = int(ds[0])
        elif ageText.upper() == "TODAY":
            age = 0
        else:
            logger.info(f"too old: [{ageText}] {title}")
            continue # who knows how old it is?!
        posted = date.today() - timedelta(days=age)
        closes = posted + timedelta(weeks=2)

        # career builder includes a lot of jobs regardless of age, so ignore those
        if (age > maxAge):
            logger.info(f"ignoring because it's too old: [{ageText}] {title}")
            continue

        # get the fuzzy wuzzy score for refining
        f = fuzz.ratio(what, title)
        logger.debug(f"career builder: [{f}] {title}")

        # create the job
        j = CareerBuilderJob(title)
        j.url = f"{baseURL}{a['href']}"
        j.posted = posted
        j.closes = closes
        j.fuzzyWuzzy = f
        # the rest of the details are fetched later in another thread,
        # so just add the job object to our array of jobs
        cJobs.append(j)

    return cJobs

In [7]:
# get the jobs from monster
def getMonsterJobs(what, maxAge=1, timeout=30):
    """Scrape one page of Monster search results.

    Args:
        what: search phrase; titles scoring below 60 on fuzz.ratio against
            it are discarded.
        maxAge: value sent as the ``tm`` query parameter.
        timeout: seconds to wait for Monster before giving up.

    Returns:
        A list of MonsterJob objects with url, company, location, posted,
        closes (posted + 2 weeks) and fuzzyWuzzy set. Descriptions are not
        fetched here; call retrieveDetails() on each job for that.

    Raises:
        requests.RequestException: if the search page cannot be fetched
            (including when the timeout expires).
    """
    from urllib.parse import quote_plus

    # encode the phrase so spaces, '&', '#', ... cannot corrupt the query string
    request = f"https://www.monster.com/jobs/search/?q={quote_plus(what)}&tm={maxAge}"
    mJobs = []

    logger.debug(f"getting jobs from {request}")

    # create the request and soup objects
    page = requests.get(request, timeout=timeout)
    soup = BeautifulSoup(page.text, "html.parser")

    for card in soup.find_all('div', class_="flex-row"):
        # extract the card details
        n = card.h2
        a = n.a if n is not None else None
        c = card.find('div', class_="company")
        l = card.find('div', class_="location")
        d = card.find('time')

        # "flex-row" is a generic class, so skip anything that is not a
        # job card (no heading or no link inside it)
        if a is None or not a.get('href'):
            logger.debug("monster: skipping a row without a title link")
            continue
        title = n.text.strip()

        # a leading number is the age in days, "posted today" means zero,
        # anything else is unusable
        ageText = d.text.strip() if d is not None else ""
        ds = re.findall(r"^\d+", ageText)
        if ds:
            age = int(ds[0])
        elif ageText.upper() == "POSTED TODAY":
            age = 0
        else:
            logger.debug(f"too old: [{ageText}] {title}")
            continue # who knows how old it is?!
        posted = date.today() - timedelta(days=age)
        closes = posted + timedelta(weeks=2)

        # monster returns a lot of garbage,
        # use fuzzywuzzy to ignore anything that isn't reasonably close
        f = fuzz.ratio(what, title)
        logger.debug(f"monster: [{f}] {title}")
        if (f < 60):
            logger.info (f"monster: fuzz<60: {title}")
            continue

        # create the job (monster links are used as-is, no base URL is prepended)
        j = MonsterJob(title)
        j.url = a['href']
        j.company = c.text.strip() if c is not None else ""
        j.location = l.text.strip() if l is not None else ""
        j.posted = posted
        j.closes = closes
        j.fuzzyWuzzy = f
        # the rest of the details are fetched later in another thread,
        # so just add the job object to our array of jobs
        mJobs.append(j)

    return mJobs

In [8]:
# get the jobs from glass door
def getGlassDoorJobs(what, maxAge=1, timeout=30):
    """Scrape one page of Glassdoor remote-job search results.

    Args:
        what: search phrase; titles scoring below 60 on fuzz.ratio against
            it are discarded.
        maxAge: value sent as the ``fromAge`` query parameter.
        timeout: seconds to wait for Glassdoor before giving up.

    Returns:
        A list of GlassDoorJob objects with url, company, location, posted,
        closes (posted + 2 weeks) and fuzzyWuzzy set. Descriptions are not
        fetched here; call retrieveDetails() on each job for that.

    Raises:
        requests.RequestException: if the search page cannot be fetched
            (including when the timeout expires).
    """
    # the URL wants the phrase hyphenated; keep `what` itself untouched so
    # the fuzzy score below is computed like in the other scrapers
    slug = re.sub(" ", "-", what)
    # KO<start>,<end> marks where the keyword sits in the slug, so the end
    # must follow the phrase length instead of being fixed at 24
    # NOTE(review): inferred from the original "KO0,24" matching the
    # 24-character default phrase — confirm against Glassdoor's URL scheme
    request = f"https://www.glassdoor.com/Job/{slug}-jobs-SRCH_KO0,{len(slug)}.htm?fromAge={maxAge}&remoteWorkType=1"
    baseURL = "https://www.glassdoor.com"
    headers = {'user-agent': 'Mozilla/5.0'}
    gJobs = []

    logger.debug(f"getting jobs from {request}")

    # create the request and soup objects
    page = requests.get(request, headers=headers, timeout=timeout)
    soup = BeautifulSoup(page.text, "html.parser")

    for card in soup.find_all('li', class_="react-job-listing"):
        # extract the card details
        n = card.find('a', class_="jobTitle")
        c = card.find('div', class_="jobHeader")
        l = card.find('span', class_='loc')
        d = card.find('div', attrs={'data-test':'job-age'})

        # without a title link there is nothing to record
        if n is None or not n.get('href'):
            logger.debug("glassd.: skipping a listing without a title link")
            continue
        title = n.text.strip()

        # "24h" means posted today; otherwise a leading number is the age
        # in days, and anything else is unusable
        ageText = d.text.strip() if d is not None else ""
        age = 0
        if (ageText != "24h"):
            ds = re.findall(r"^\d+", ageText)
            if not ds:
                logger.debug(f"too old: [{ageText}] {title}")
                continue # who knows how old it is?!
            age = int(ds[0])
        posted = date.today() - timedelta(days=age)
        closes = posted + timedelta(weeks=2)

        # glass door returns a lot of garbage,
        # use fuzzywuzzy to ignore anything that isn't reasonably close
        f = fuzz.ratio(what, title)
        logger.debug(f"glassd.: [{f}] {title}")
        if (f < 60):
            logger.info (f"glassd: fuzz<60: {title}")
            continue

        # create the job
        j = GlassDoorJob(title)
        j.company = c.text.strip() if c is not None else ""
        j.location = l.text.strip() if l is not None else ""
        j.url = f"{baseURL}{n['href']}"
        j.posted = posted
        j.closes = closes
        j.fuzzyWuzzy = f
        # the rest of the details are fetched later in another thread,
        # so just add the job object to our array of jobs
        gJobs.append(j)

    return gJobs

In [9]:
if __name__ == "__main__":
    # Driver: scrape all four boards in parallel, fetch every job's details
    # in parallel, then write the results to a CSV file for dataloader.

    searchTerm = "salesforce administrator"
    starttime = datetime.now()
    allJobs = []

    logger.info("getting all the jobs")

    # Insert tasks into the queue and let them run
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # map each future to the scraper's name so failures can be reported
        futures = {}
        for scraper in (getIndeedJobs, getCareerBuilderJobs, getMonsterJobs, getGlassDoorJobs):
            futures[executor.submit(scraper, what=searchTerm)] = scraper.__name__
        logger.info("four threads added to the executor")

        for future in concurrent.futures.as_completed(futures):
            # one broken job board must not discard the other boards' results
            try:
                found = future.result()
            except Exception as e:
                logger.warning(f"{futures[future]} failed: {e}")
                continue
            logger.debug(f"adding {len(found)} jobs to allJobs")
            allJobs.extend(found)

    logger.info(f"retrieving details for {len(allJobs)} jobs")

    # Create another threadpool for the details;
    # leaving the with-block waits for every task to finish
    with concurrent.futures.ThreadPoolExecutor() as executor:
        logger.info(f"adding {len(allJobs)} threads to the executor")
        futures = [executor.submit(j.retrieveDetails) for j in allJobs]

    # surface anything a detail task raised instead of dropping it silently
    for future in futures:
        error = future.exception()
        if error is not None:
            logger.warning(f"retrieveDetails failed: {error}")

    endtime = datetime.now()
    logger.info(f"time to run: {endtime - starttime}")

    # create our csv file for dataloader
    outfile = 'executor.csv'
    # newline='' is what the csv module expects (descriptions contain line
    # breaks); utf-8 keeps non-ASCII text from failing on some platforms
    with open(outfile, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['Name','OriginalURL','Description','PostingDate','CloseDate','Stage'])

        # then write each collection of jobs to the file
        for j in allJobs:
            writer.writerow(j.getDataloader())

    logger.info(f"{outfile} created")
    

INFO:jobExecutor:getting all the jobs
INFO:jobExecutor:four threads added to the executor
INFO:jobExecutor:monster: fuzz<60: Team Leader, Engineering & Salesforce
INFO:jobExecutor:monster: fuzz<60: Senior Business System Analyst - Salesforce & CPQ
INFO:jobExecutor:monster: fuzz<60: Help Desk Support Specialist
INFO:jobExecutor:monster: fuzz<60: Front Desk Coordinator
INFO:jobExecutor:monster: fuzz<60: Relationship Management Officer, Wealth Administration
INFO:jobExecutor:monster: fuzz<60: Senior IT Systems Engineer
INFO:jobExecutor:monster: fuzz<60: Sr. Software Engineer
INFO:jobExecutor:monster: fuzz<60: BI Engagement Lead
INFO:jobExecutor:monster: fuzz<60: Western Regional Sales Manager - Commercial
INFO:jobExecutor:monster: fuzz<60: Commercial Associate
INFO:jobExecutor:monster: fuzz<60: Customer Service Representative
INFO:jobExecutor:monster: fuzz<60: Interconnection Project Manager
INFO:jobExecutor:monster: fuzz<60: Senior Engineer - Salesforce & CPQ
INFO:jobExecutor:monster: fu