In [1]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import html2text
from datetime import date
from datetime import time
from datetime import datetime
from datetime import timedelta
import csv
import re
from fuzzywuzzy import fuzz

In [2]:
# define a job object
class Job:
    """A single scraped job posting, shaped for the dataloader CSV export."""

    # class-level defaults; the scrapers overwrite these per instance
    name = ""
    url = ""
    description = ""
    company = ""
    location = ""
    stage = "New"

    def __init__(self, name):
        """Create a job called `name`, posted today and closing in 7 days."""
        self.name = name
        dateFormat = "%m/%d/%Y"
        today = date.today()
        nextWeek = today + timedelta(days=7)
        # both dates are stored as formatted strings, not date objects
        self.posted = today.strftime(dateFormat)
        self.closes = nextWeek.strftime(dateFormat)

    def __str__(self):
        """Short one-line summary, mostly for quick debugging."""
        return f'Name: {self.name}, Posted: {self.posted}, Closes: {self.closes}'

    def getDataloader(self):
        """Return the job as a list of six CSV fields.

        Order: name, url, description, posted, closes, stage.
        Company and location are not part of the row.
        """
        row = [self.name, self.url, self.description]
        row.extend([self.posted, self.closes, self.stage])
        return row

In [3]:
# get the jobs from indeed
def getIndeedJobs(what, where="remote", maxAge=1):
    """Scrape one page of Indeed search results into Job objects.

    Args:
        what: search keywords, sent as the `q` query parameter.
        where: location text, sent as `l` (defaults to "remote").
        maxAge: value sent as the `fromage` query parameter.

    Returns:
        A list of Job objects, one per result card that has a title link
        and a recognizable posting age. Company, location and description
        stay at their Job defaults when the page does not provide them.

    Raises:
        requests.RequestException: if the search page cannot be fetched.
    """
    baseURL = "https://www.indeed.com"
    timeout = 30  # seconds; without it a stalled server hangs the notebook
    jobs = []

    # let requests build the query string so spaces, '&' etc. are escaped
    page = requests.get(f"{baseURL}/jobs",
                        params={'q': what, 'l': where, 'fromage': maxAge},
                        timeout=timeout)
    soup = BeautifulSoup(page.text, "html.parser")

    # for every job card
    for card in soup.find_all('div', attrs={'data-tn-component':'organicJob'}):
        # find the job details; any of these may be None if the markup differs
        a = card.find('a', attrs={'data-tn-element':'jobTitle'})
        c = card.find('span', attrs={'class':'company'})
        l = card.find('span', attrs={'class':'location'})
        d = card.find('span', attrs={'class':'date'})

        # without a title link or a date there is nothing usable on the card
        if a is None or d is None or not a.get('href'):
            continue

        # figure out the dates: a leading number is the age in days,
        # "Just posted"/"Today" mean zero, anything else is skipped
        ageText = d.text.strip()
        m = re.match(r"\d+", ageText)
        if m:
            age = int(m.group())
        elif ageText.upper() in ("JUST POSTED", "TODAY"):
            age = 0
        else:
            continue
        posted = date.today() - timedelta(days=age)
        closes = posted + timedelta(weeks=4)

        # create a job object
        j = Job(a.text.strip())
        j.url = f"{baseURL}{a['href']}"
        if c is not None:
            j.company = c.text.strip()
        if l is not None:
            j.location = l.text.strip()
        j.posted = posted.strftime("%m/%d/%Y")
        j.closes = closes.strftime("%m/%d/%Y")

        # the full description is on another page, so follow the URL to that.
        # This is best-effort: a failure here keeps the job, minus description.
        try:
            p2 = requests.get(j.url, timeout=timeout)
            s2 = BeautifulSoup(p2.text, "html.parser")
            desc = s2.find('div', attrs={'class':'jobsearch-JobComponent-description'})
        except requests.RequestException:
            desc = None  # network failure: report it below
        if desc is None:
            print(f"unable to get description from {j.url}")
        else:
            j.description = html2text.html2text(desc.prettify())

        # finally add the job object to our array of jobs
        jobs.append(j)

    return jobs

In [4]:
# get the jobs from career builder
def getCareerBuilderJobs(what, where="", maxAge=1):
    """Scrape one page of CareerBuilder search results into Job objects.

    Args:
        what: search keywords, sent as the `keywords` query parameter.
        where: location text, sent as `location` (defaults to empty).
        maxAge: sent as `posted`, and also used locally to drop cards
            whose parsed age is greater than this value.

    Returns:
        A list of Job objects. Cards without a title, link or recognizable
        age are skipped. Description, company and location stay at their
        Job defaults when the detail page does not provide them.

    Raises:
        requests.RequestException: if the search page cannot be fetched.
    """
    baseURL = "https://www.careerbuilder.com"
    timeout = 30  # seconds; without it a stalled server hangs the notebook
    jobs = []

    # same parameters, in the same order, as the original hand-built URL;
    # passing them as a dict lets requests escape the user-supplied text
    params = {
        'posted': maxAge,
        'pay': '',
        'cat1': '',
        'radius': '',
        'emp': '',
        'cb_apply': 'false',
        'keywords': what,
        'location': where,
        'cb_workhome': 'true',
    }
    page = requests.get(f"{baseURL}/jobs", params=params, timeout=timeout)
    soup = BeautifulSoup(page.text, "html.parser")

    for card in soup.find_all('div', class_="data-results-content-parent"):
        # find the particulars; any of these may be None
        n = card.find('div', class_="data-results-title")
        a = card.find('a', class_="data-results-content")
        d = card.find('div', class_="data-results-publish-time")
        if n is None or a is None or d is None or not a.get('href'):
            continue

        # NOTE(review): the leading number is treated as days whatever unit
        # follows it in the text -- confirm against the site's wording
        ageText = d.text.strip()
        m = re.match(r"\d+", ageText)
        if m:
            age = int(m.group())
        elif ageText.upper() == "TODAY":
            age = 0
        else:
            continue # who knows how old it is?!

        # career builder includes a lot of jobs regardless of age, so ignore those
        if age > maxAge:
            continue

        posted = date.today() - timedelta(days=age)
        closes = posted + timedelta(weeks=4)

        # create the job
        j = Job(n.text.strip())
        j.url = f"{baseURL}{a['href']}"
        j.posted = posted.strftime("%m/%d/%Y")
        j.closes = closes.strftime("%m/%d/%Y")

        # then follow the link for the rest of the details (best-effort)
        try:
            p2 = requests.get(j.url, timeout=timeout)
        except requests.RequestException:
            print(f"unable to get description from {j.url}")
            jobs.append(j)
            continue
        s2 = BeautifulSoup(p2.text, "html.parser")

        # the description body sits two <div> levels inside #jdp_description
        body = s2.find('div', attrs={'id':'jdp_description'})
        inner = body.div if body is not None else None
        inner = inner.div if inner is not None else None
        if inner is None:
            print(f"unable to get description from {j.url}")
        else:
            j.description = html2text.html2text(inner.prettify())

        # the company and location are buried in unidentified spans:
        # first span is taken as the company, second as the location
        details = s2.find('div', class_="data-details")
        spans = details.find_all('span') if details is not None else []
        if len(spans) > 0:
            j.company = spans[0].text
        if len(spans) > 1:
            j.location = spans[1].text

        # finally add the job object to our array of jobs
        jobs.append(j)

    return jobs

In [5]:
# get the jobs from monster
def getMonsterJobs(what, maxAge=1):
    """Scrape one page of Monster search results into Job objects.

    Args:
        what: search keywords, sent as the `q` query parameter and also
            fuzzy-matched against each job title.
        maxAge: value sent as the `tm` query parameter.

    Returns:
        A list of Job objects. Cards without a title link or recognizable
        age are skipped, as are titles whose fuzz.ratio against `what` is
        below 40. Company, location and description stay at their Job
        defaults when they cannot be found.

    Raises:
        requests.RequestException: if the search page cannot be fetched.
    """
    baseURL = "https://www.monster.com"
    timeout = 30  # seconds; without it a stalled server hangs the notebook
    jobs = []

    # let requests build the query string so the keywords are escaped
    page = requests.get(f"{baseURL}/jobs/search/",
                        params={'q': what, 'tm': maxAge},
                        timeout=timeout)
    soup = BeautifulSoup(page.text, "html.parser")

    for card in soup.find_all('div', class_="flex-row"):
        # extract the card details; not every flex-row div has an <h2>
        n = card.h2
        a = n.a if n is not None else None
        c = card.find('div', class_="company")
        l = card.find('div', class_="location")
        d = card.find('time')
        if a is None or d is None or not a.get('href'):
            continue

        ageText = d.text.strip()
        m = re.match(r"\d+", ageText)
        if m:
            age = int(m.group())
        elif ageText.upper() == "POSTED TODAY":
            age = 0
        else:
            continue # who knows how old it is?!
        posted = date.today() - timedelta(days=age)
        closes = posted + timedelta(weeks=4)

        # monster returns a lot of garbage,
        # use fuzzywuzzy to ignore anything that isn't reasonably close
        title = n.text.strip()
        if fuzz.ratio(what, title) < 40:
            continue

        # create the job; the card's href is used as the URL as-is
        j = Job(title)
        j.url = a['href']
        if c is not None:
            j.company = c.text.strip()
        if l is not None:
            j.location = l.text.strip()
        j.posted = posted.strftime("%m/%d/%Y")
        j.closes = closes.strftime("%m/%d/%Y")

        # then follow the link for the full description (best-effort).
        # Only network errors are caught, so interrupting the kernel
        # still stops the loop instead of being swallowed.
        try:
            p2 = requests.get(j.url, timeout=timeout)
            s2 = BeautifulSoup(p2.text, "html.parser")
            desc = s2.find('div', attrs={'name':'sanitizedHtml'})
        except requests.RequestException:
            desc = None  # network failure: report it below
        if desc is None:
            print(f"unable to get description from {j.url}")
        else:
            j.description = html2text.html2text(desc.prettify())

        # finally add the job object to our array of jobs
        jobs.append(j)

    return jobs

In [6]:
# get the jobs from glass door
def getGlassDoorJobs(what, maxAge=1):
    """Scrape one page of Glassdoor search results into Job objects.

    Args:
        what: search keywords. Spaces become hyphens for the URL slug; the
            unmodified text is fuzzy-matched against each job title.
        maxAge: value sent as the `fromAge` query parameter.

    Returns:
        A list of Job objects. Cards without a title link or recognizable
        age are skipped, as are titles whose fuzz.ratio against `what` is
        below 60. Company, location and description stay at their Job
        defaults when they cannot be found.

    Raises:
        requests.RequestException: if the search page cannot be fetched.
    """
    # keep `what` intact for the fuzzy match below; only the URL gets the slug
    slug = re.sub(" ", "-", what)
    # The original URL hard-coded "KO0,24", and 24 == len("salesforce-administrator"),
    # so the number is presumably the end offset of the keyword within the slug.
    # It is computed here so other queries work -- TODO confirm against the site.
    request = f"https://www.glassdoor.com/Job/{slug}-jobs-SRCH_KO0,{len(slug)}.htm?fromAge={maxAge}&remoteWorkType=1"
    baseURL = "https://www.glassdoor.com"
    headers = {'user-agent': 'Mozilla/5.0'}
    timeout = 30  # seconds; without it a stalled server hangs the notebook
    jobs = []

    # create the request and soup objects
    page = requests.get(request, headers=headers, timeout=timeout)
    soup = BeautifulSoup(page.text, "html.parser")

    for card in soup.find_all('li', class_="react-job-listing"):
        # extract the card details; any of these may be None
        n = card.find('a', class_="jobTitle")
        c = card.find('div', class_="jobHeader")
        l = card.find('span', class_='loc')
        d = card.find('div', attrs={'data-test':'job-age'})
        if n is None or d is None or not n.get('href'):
            continue

        # "24h" means posted within the last day; otherwise the leading
        # number is the age in days. It must be checked first, because
        # "24h" itself starts with digits.
        ageText = d.text.strip()
        if ageText == "24h":
            age = 0
        else:
            m = re.match(r"\d+", ageText)
            if m is None:
                continue # who knows how old it is?!
            age = int(m.group())
        posted = date.today() - timedelta(days=age)
        closes = posted + timedelta(weeks=4)

        # glass door returns a lot of garbage,
        # use fuzzywuzzy to ignore anything that isn't reasonably close
        title = n.text.strip()
        if fuzz.ratio(what, title) < 60:
            continue

        # create the job (text is stripped, matching the other scrapers)
        j = Job(title)
        if c is not None:
            j.company = c.text.strip()
        if l is not None:
            j.location = l.text.strip()
        j.url = f"{baseURL}{n['href']}"
        j.posted = posted.strftime("%m/%d/%Y")
        j.closes = closes.strftime("%m/%d/%Y")

        # then follow the link for the full description (best-effort)
        try:
            p2 = requests.get(j.url, headers=headers, timeout=timeout)
            s2 = BeautifulSoup(p2.text, "html.parser")
            desc = s2.find('div', attrs={'id':'JobDescriptionContainer'})
        except requests.RequestException:
            desc = None  # network failure: report it below
        if desc is None:
            print(f"unable to get description from {j.url}")
        else:
            j.description = html2text.html2text(desc.prettify())

        # finally add the job object to our array of jobs
        jobs.append(j)

    return jobs

In [7]:
starttime = datetime.now()

# create a master list, then run the same query against each site in turn,
# reporting the running total after every site
allJobs = []
for siteName, fetchJobs in [("indeed", getIndeedJobs),
                            ("career builder", getCareerBuilderJobs),
                            ("monster", getMonsterJobs),
                            ("glass door", getGlassDoorJobs)]:
    print(f"fetching {siteName}...")
    allJobs.extend(fetchJobs("salesforce administrator"))
    print(f"job count: {len(allJobs)}")

endtime = datetime.now()
print(f"running time: {endtime-starttime}")

fetching indeed...
job count: 10
fetching career builder...
job count: 12
fetching monster...
unable to get description from https://job-openings.monster.com/development-communications-assistant-boston-ma-us-randstad/221458657
unable to get description from https://job-openings.monster.com/salesforce-administrator-denver-co-us-k12-inc/de37e6ce-da5e-4f76-ba6a-1da4bb7d7647
unable to get description from https://job-openings.monster.com/salesforce-administrator-colorado-springs-co-us-cherwell-software/c6717e08-0358-4f33-8151-79ca9d6fe285
job count: 30
fetching glass door...
job count: 34
running time: 0:00:23.181201


In [8]:
# one summary line per job, rendered through Job.__str__
print('\n'.join(str(job) for job in allJobs))

Name: Part-Time Salesforce Administrator Consultant (Remote Positi..., Posted: 11/25/2020, Closes: 12/23/2020
Name: Salesforce Administrator, Posted: 11/24/2020, Closes: 12/22/2020
Name: SalesForce Administrator, Posted: 11/25/2020, Closes: 12/23/2020
Name: Sales Support Administrator, Posted: 11/25/2020, Closes: 12/23/2020
Name: Salesforce Administrator, Posted: 11/24/2020, Closes: 12/22/2020
Name: Salesforce Developer - Cognisight/GRIPA, Posted: 11/25/2020, Closes: 12/23/2020
Name: Service Contract Administrator, Posted: 11/24/2020, Closes: 12/22/2020
Name: Sr Sales Executive | Healthcare, Posted: 11/24/2020, Closes: 12/22/2020
Name: COMMERCIAL LOAN CLOSING ADMINISTRATOR, Posted: 11/25/2020, Closes: 12/23/2020
Name: Sales Representative - K-12 Education Technology (Southeast/..., Posted: 11/25/2020, Closes: 12/23/2020
Name: Salesforce Administrator - U.S. Telecommute, Posted: 11/24/2020, Closes: 12/22/2020
Name: Field Systems Analyst, Posted: 11/25/2020, Closes: 12/23/2020
Name: Sale

In [10]:
# create our csv file for dataloader
outfile = 'scraper.csv'
# newline='' hands line-ending control to the csv module, as its docs require;
# without it Windows gets blank rows and the multi-line descriptions are mangled.
# The explicit encoding keeps non-ASCII description text from failing on
# platforms whose default encoding is not UTF-8.
with open(outfile, mode='w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    # header order must match the list returned by Job.getDataloader()
    writer.writerow(['Name','OriginalURL','Description','PostingDate','CloseDate','Stage'])

    # then write each collected job to the file, one row per job
    for j in allJobs:
        writer.writerow(j.getDataloader())

print(f"{outfile} created")
# elapsed time is measured from `starttime` set in the fetch cell, so it
# includes any idle time between running that cell and this one
endtime = datetime.now()
print(endtime - starttime)

scraper.csv created
0:46:58.440508
