In [122]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import html2text
from datetime import date
from datetime import time
from datetime import datetime
from datetime import timedelta
import csv
import re
from fuzzywuzzy import fuzz
import threading

In [123]:
# define a job object
class Job(object):
    name = ""
    url = ""
    description = ""
    company = ""
    location = ""
    posted = ""
    closes = ""
    stage = "New"
    
    def __init__(self, name):
        self.name = name
        
        
    # mostly just for quick debugging
    def __str__(self):
        return f'Name: {self.name}\n URL: {self.url}'
    
    def getDataloader(self):
        return [self.name,
               self.url,
               self.description,
               self.posted,
               self.closes,
               self.stage]


In [124]:
# create a subclass for indeed jobs
class IndeedJob(Job):
    def retrieveDetails(self):
        try:
            response = requests.get(self.url)
            soup = BeautifulSoup(response.text, "html.parser")
            div = soup.find('div', attrs={'class':'jobsearch-JobComponent-description'})
            self.description = html2text.html2text(div.prettify())
        except Exception as e:
            print("Indeedjob.retriveDetails exception:")
            print(e)
            print(self)
            
# create a subclass for indeed jobs
class CareerBuilderJob(Job):
    def retrieveDetails(self):
        try:
            # then follow the link for the rest of the details
            response = requests.get(self.url)
            soup = BeautifulSoup(response.text, "html.parser")
            desc = soup.find('div',  attrs={'id':'jdp_description'})
            self.description = html2text.html2text(desc.div.div.prettify())
            
            # the company and location are burried in unidentified spans
            details = soup.find('div', class_="data-details")
            spans = details.find_all('span')
            self.company = spans[0].text
            self.location = spans[1].text
        except Exception as e:
            print("CareerBuilderJob.retriveDetails exception:")
            print(e)
            print(self)
            
# create a subclass for indeed jobs
class MonsterJob(Job):
    def retrieveDetails(self):
        try:
            response = requests.get(self.url)
            soup = BeautifulSoup(response.text, "html.parser")
            div = soup.find('div', attrs={'name':'sanitizedHtml'})
            self.description = html2text.html2text(div.prettify())
        except Exception as e:
            print("MonsterJob.retriveDetails exception:")
            print(e)
            print(self)

            
# create a subclass for indeed jobs
class GlassDoorJob(Job):
    def retrieveDetails(self):
        try:
            hdrs = {'user-agent': 'Mozilla/5.0'}
            reponse = requests.get(self.url, headers=hdrs)
            soup = BeautifulSoup(reponse.text, "html.parser")
            div = soup.find('div', attrs={'id':'JobDescriptionContainer'})
            self.description = html2text.html2text(div.prettify())
        except Exception as e:
            print("GlassDoorJob.retriveDetails exception:")
            print(e)
            print(self)
            


In [125]:
# get the jobs from indeed
def getIndeedJobs(results, what, maxAge):
    request = f"https://www.indeed.com/jobs?q={what}&l=remote&fromage={maxAge}"
    baseURL = "https://www.indeed.com"
    
    # create the request and soup objects
    page = requests.get(request)
    soup = BeautifulSoup(page.text, "html.parser")
    
    # for every job card
    for card in soup.find_all('div', attrs={'data-tn-component':'organicJob'}):
        # find the job details
        a = card.find('a', attrs={'data-tn-element':'jobTitle'})
        c = card.find('span', attrs={'class':'company'})
        l = card.find('span', attrs={'class':'location'})
        d = card.find('span', attrs={'class':'date'})
        
        # figure out the dates
        age = 0
        try:
            ds = re.findall(r"^\d+", d.text.strip())
            age = int(ds[0])
        except:
            if (d.text.strip().upper() == "JUST POSTED" or d.text.strip().upper() == "TODAY"):
                pass
            else:
                continue # just ignore any jobs more than 30 days old
        posted = date.today() - timedelta(days=age)
        closes = posted + timedelta(weeks=4)
        
        # create a job object
        j = IndeedJob(a.text.strip())
        j.url = f"{baseURL}{a['href']}"
        j.company = c.text.strip()
        j.location = l.text.strip()
        j.posted = posted.strftime("%m/%d/%Y")
        j.closes = closes.strftime("%m/%d/%Y")
        # get the rest of the details in another thread
        # just add the job object to our array of jobs
        results.append(j)

In [126]:
# get the jobs from career builder
def getCareerBuilderJobs(results, what, maxAge):
    request = f"https://www.careerbuilder.com/jobs?posted={maxAge}&pay=&cat1=&radius=&emp=&cb_apply=false&keywords={what}&location=&cb_workhome=true"
    baseURL = "https://www.careerbuilder.com"
    
    # create the request and soup objects
    page = requests.get(request)
    soup = BeautifulSoup(page.text, "html.parser")

    for card in soup.find_all('div', class_="data-results-content-parent"):
        # find the particulars
        n = card.find('div', class_="data-results-title")
        a = card.find('a', class_="data-results-content")
        d = card.find('div', class_="data-results-publish-time")
        
        age = 0
        try:
            ds = re.findall(r"^\d+", d.text.strip())
            age = int(ds[0])
        except:
            if (d.text.strip().upper() == "TODAY"):
                pass
            else:
                continue # who knows how old it is?!
        posted = date.today() - timedelta(days=age)
        closes = posted + timedelta(weeks=4)
        
        # career builder includes a lot of jobs regardless of age, so ignore those
        if (age > maxAge):
            continue
        
        # create the job
        j = CareerBuilderJob(n.text.strip())
        j.url = f"{baseURL}{a['href']}"
        j.posted = posted.strftime("%m/%d/%Y")
        j.closes = closes.strftime("%m/%d/%Y")
        # get the rest of the details in another thread
        # just add the job object to our array of jobs
        results.append(j)

In [127]:
# get the jobs from monster
def getMonsterJobs(results, what, maxAge):
    request = f"https://www.monster.com/jobs/search/?q={what}&tm={maxAge}"
    baseURL = "https://www.monster.com"
    
    # create the request and soup objects
    page = requests.get(request)
    soup = BeautifulSoup(page.text, "html.parser")
    
    for card in soup.find_all('div', class_="flex-row"):
        # extract the card details
        n = card.h2
        a = card.h2.a
        c = card.find('div', class_="company")
        l = card.find('div', class_="location")
        d = card.find('time')
        
        age = 0
        try:
            ds = re.findall(r"^\d+", d.text.strip())
            age = int(ds[0])
        except:
            if (d.text.strip().upper() == "POSTED TODAY"):
                pass
            else:
                continue # who knows how old it is?!
        posted = date.today() - timedelta(days=age)
        closes = posted + timedelta(weeks=4)
        
        # monster returns a lot of garbage,
        # use fuzzywuzzy to ignore anything that isn't reasonably close
        f = fuzz.ratio(what, n.text.strip())
        if (f < 50): 
            #print(f"{f}: ignoring {n.text.strip()}")
            continue
        
        # create the job
        j = MonsterJob(n.text.strip())
        j.url = a['href']
        j.company = c.text.strip()
        j.location = l.text.strip()
        j.posted = posted.strftime("%m/%d/%Y")
        j.closes = closes.strftime("%m/%d/%Y")
        # get the rest of the details in another thread
        # just add the job object to our array of jobs
        results.append(j)

In [128]:
# get the jobs from glass door
def getGlassDoorJobs(results, what, maxAge):
    what = re.sub(" ", "-", what)
    request = f"https://www.glassdoor.com/Job/{what}-jobs-SRCH_KO0,24.htm?fromAge={maxAge}&remoteWorkType=1"
    baseURL = "https://www.glassdoor.com"
    headers = {'user-agent': 'Mozilla/5.0'}

    # create the request and soup objects
    page = requests.get(request, headers=headers)
    soup = BeautifulSoup(page.text, "html.parser")
    
    for card in soup.find_all('li', class_="react-job-listing"):
        # extract the card details
        n = card.find('a', class_="jobTitle")
        c = card.find('div', class_="jobHeader")
        l = card.find('span', class_='loc')
        d = card.find('div', attrs={'data-test':'job-age'})
        
        age = 0
        if (d.text.strip() != "24h"):
            try:
                ds = re.findall(r"^\d+", d.text.strip())
                age = int(ds[0])
            except:
                continue # who knows how old it is?!
        posted = date.today() - timedelta(days=age)
        closes = posted + timedelta(weeks=4)
        
        # glass door returns a lot of garbage,
        # use fuzzywuzzy to ignore anything that isn't reasonably close
        f = fuzz.ratio(what, n.text.strip())
        if (f < 60):
            #print(f"{f}: ignoring {n.text.strip()}")
            continue
        
        # create the job
        j = GlassDoorJob(n.text)
        j.company = c.text
        j.location = l.text
        j.url = f"{baseURL}{n['href']}"
        j.posted = posted.strftime("%m/%d/%Y")
        j.closes = closes.strftime("%m/%d/%Y")
        # get the rest of the details in another thread
        # just add the job object to our array of jobs
        results.append(j)

In [129]:
if __name__ == "__main__":

    starttime = datetime.now()
    allJobs = []
    
    print("getting jobs")
    
    # Create a pool with four worker threads
    pool = []

    # Insert tasks into the queue and let them run
    pool.append(threading.Thread(target=getIndeedJobs, args=(allJobs, "salesforce administrator", 1)))
    pool.append(threading.Thread(target=getCareerBuilderJobs, args=(allJobs, "salesforce administrator", 1)))
    pool.append(threading.Thread(target=getMonsterJobs, args=(allJobs, "salesforce administrator", 1)))
    pool.append(threading.Thread(target=getGlassDoorJobs, args=(allJobs, "salesforce administrator", 1)))
    
    # start all the threads
    for thread in pool:
        thread.start()
    
    # wait for the threads
    for thread in pool:
        thread.join()
    
    print(f"found {len(allJobs)} jobs, retrieving details")
    # create a new pool to retrieve the details of every job
    pool = []
    for j in allJobs:
        pool.append(threading.Thread(target=j.retrieveDetails))
    
    print(f"pool has {len(pool)} elements")
    # start all the threads
    for thread in pool:
        thread.start()
    
    # wait for the threads
    for thread in pool:
        thread.join()
        
    endtime = datetime.now()
    print(f"time to run: {endtime - starttime}")
    print(f"job count: {len(allJobs)}")
    
    #print(*allJobs, sep='\n')
    
    # create our csv file for dataloader
    outfile = 'threading.csv'
    with open(outfile, mode='w') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['Name','OriginalURL','Description','PostingDate','CloseDate','Stage'])

        # then write each collection of jobs to the file

        for j in allJobs:
            writer.writerow(j.getDataloader())

    print(f"{outfile} created")

getting jobs
found 23 jobs, retrieving details
pool has 23 elements
MonsterJob.retriveDetails exception:
'NoneType' object has no attribute 'prettify'
Name: Salesforce Administrator
 URL: https://job-openings.monster.com/salesforce-administrator-denver-co-us-k12-inc/de37e6ce-da5e-4f76-ba6a-1da4bb7d7647
MonsterJob.retriveDetails exception:
'NoneType' object has no attribute 'prettify'
Name: Salesforce Administrator
 URL: https://job-openings.monster.com/salesforce-administrator-colorado-springs-co-us-cherwell-software/c6717e08-0358-4f33-8151-79ca9d6fe285
time to run: 0:00:04.298315
job count: 23
threading.csv created
