In [1]:
import requests
import bs4
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import html2text
from datetime import date
from datetime import time
from datetime import datetime
from datetime import timedelta
import csv
import re
from fuzzywuzzy import fuzz
import threading
import concurrent.futures

In [2]:
# logging configuration shared by every cell below
# severity order: DEBUG < INFO < WARNING < ERROR < CRITICAL
import logging
import sys

# write log records to stdout and drop anything below INFO
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
# the logger name starts with a tab, which shows up between the level and the
# name in every emitted record
logger = logging.getLogger('\tjobExecutor')

In [3]:
# a single job posting plus the details scraped for it
class Job(object):
    """One job posting found on a job board.

    The creator is expected to fill in url, posted, closes, company, config
    and fuzzyWuzzy after construction; retrieveDetails() then fills in
    location and description from the posting page.

    Attributes:
        name: job title.
        url: link to the full posting.
        description: posting body, converted to text with html2text.
        company: employer name.
        location: location text taken from the posting page.
        posted, closes: dates; both default to the day the instance is created.
        stage: value written to the CSV 'Stage' column; starts as "New".
        fuzzyWuzzy: similarity score assigned by whoever created the job.
        config: configuration dict; retrieveDetails reads 'headers',
            'locationSelector', 'detailSelector' and the optional 'timeout'.
        daysOld: age of the posting in days (0 unless the creator sets it).
    """

    # immutable defaults can safely live on the class
    name = ""
    url = ""
    description = ""
    company = ""
    location = ""
    stage = "New"
    fuzzyWuzzy = 0
    daysOld = 0

    # seconds to wait for the posting page when config has no 'timeout' entry
    DEFAULT_TIMEOUT = 30

    def __init__(self, name):
        """Create a job called `name` with per-instance defaults."""
        self.name = name
        # Set per instance rather than on the class: a class-level dict would be
        # shared by every Job, and class-level dates would be frozen at the
        # moment the class statement ran.
        self.posted = date.today()
        self.closes = date.today()
        self.config = {}
        logger.debug(f'\tJob:\t"{self.name}" created')

    # mostly just for quick debugging
    def __str__(self):
        return f'Name: {self.name}\n URL: {self.url}'

    # get the remaining details from the job's url field
    def retrieveDetails(self):
        """Download the posting at self.url and fill in location and description.

        A selector that matches nothing is logged as a warning and the
        corresponding attribute keeps its previous value, so one missing
        element does not discard the other.

        Raises:
            KeyError: if config lacks 'headers' or one of the selectors.
            requests.RequestException: if the page cannot be fetched.
        """
        headers = self.config['headers']
        logger.debug(f"\tretrieveDetails:\tgetting {self.url}\nusing headers {headers}")
        # without a timeout a stalled server would block this worker forever
        page = requests.get(self.url,
                            headers=headers,
                            timeout=self.config.get('timeout', self.DEFAULT_TIMEOUT))
        soup = BeautifulSoup(page.text, "html.parser")

        # parse out the remaining details
        logger.debug(f"\tretrieveDetails:\tlooking for '{self.config['locationSelector']}' and '{self.config['detailSelector']}'")

        # select_one returns None when nothing matches
        locationNode = soup.select_one(self.config['locationSelector'])
        if locationNode is None:
            logger.warning(f"\tretrieveDetails:\tno location found at {self.url}")
        else:
            self.location = locationNode.text.strip()

        detailNode = soup.select_one(self.config['detailSelector'])
        if detailNode is None:
            logger.warning(f"\tretrieveDetails:\tno description found at {self.url}")
        else:
            self.description = html2text.html2text(detailNode.prettify())
        return

    # returns a list for a CSV file
    def getDataloader(self):
        """Return this job as a CSV row.

        Column order: name, url, description, posted, closes, stage, with
        both dates formatted as MM/DD/YYYY.
        """
        return [self.name,
                self.url,
                self.description,
                self.posted.strftime("%m/%d/%Y"),
                self.closes.strftime("%m/%d/%Y"),
                self.stage]


In [4]:
# scrape the search results of one job board, as described by a config
class JobScraper(object):
    """Collects Job objects from one job board's search-result page.

    The keyword arguments given to the constructor become self.config.
    Keys read by this class:
        searchTerm, searchURL, maxAge, headers, baseURL, minFuzzScore,
        jobCardSelector, nameSelector, hrefSelector, dateSelector,
        companySelector; optional: regex, timeout.
    """

    # age used when the relative date cannot be parsed
    UNKNOWN_AGE = 9999
    # labels that mean "posted within the last day"
    SAME_DAY_LABELS = ('TODAY', 'JUST POSTED', 'POSTED TODAY', '24H')
    # seconds to wait for the search page when config has no 'timeout' entry
    DEFAULT_TIMEOUT = 30

    def __init__(self, **kwargs):
        """Store the keyword arguments as this scraper's configuration."""
        self.config = kwargs

    def _buildSearchURL(self):
        """Return searchURL formatted with the (optionally rewritten) search term and maxAge."""
        searchTerm = self.config['searchTerm']
        regex = self.config.get('regex')
        if regex:
            # regex is a (pattern, replacement) pair; fall back to the raw term
            # when it is malformed, but say so instead of hiding it
            try:
                searchTerm = re.sub(regex[0], regex[1], searchTerm)
            except (re.error, TypeError, IndexError) as err:
                logger.warning(f"\tgetJobs:\tcould not apply regex {regex!r}: {err}")
        return self.config['searchURL'].format(searchTerm, self.config['maxAge'])

    @staticmethod
    def _parseDaysOld(postedRelative):
        """Convert a relative date label into a number of days, or None.

        Same-day labels, and counts expressed in hours or minutes
        (e.g. "5H", "3 hours ago"), give 0. Otherwise the first number in
        the label is returned ("3 DAYS AGO" -> 3, "30+ DAYS AGO" -> 30).
        None means the label contains no number.
        """
        label = postedRelative.strip().upper()
        if label in JobScraper.SAME_DAY_LABELS:
            return 0
        found = re.search(r"(\d+)\s*\+?\s*([A-Z]*)", label)
        if found is None:
            return None
        if found.group(2).startswith(('H', 'MIN')):
            return 0
        return int(found.group(1))

    def _cardText(self, card, selectorKey):
        """Return the stripped text of the element config[selectorKey] selects in card, or None."""
        node = card.select_one(self.config[selectorKey])
        return None if node is None else node.text.strip()

    def getJobs(self):
        """Download the search page and return the Jobs that pass both filters.

        A card is dropped when it has no title or link, when its title's
        fuzz.ratio against searchTerm is below minFuzzScore, or when it is
        older than maxAge days (an unparseable age counts as too old).

        Returns:
            list of Job with name, url, company, posted, closes (posted + 14
            days), fuzzyWuzzy, daysOld and config filled in.

        Raises:
            KeyError: if a required config key is missing.
            requests.RequestException: if the search page cannot be fetched.
        """
        # get the page
        searchURL = self._buildSearchURL()
        logger.debug(f"\tgetJobs:\t {searchURL}")
        # without a timeout a stalled server would block this worker forever
        page = requests.get(searchURL,
                            headers=self.config['headers'],
                            timeout=self.config.get('timeout', self.DEFAULT_TIMEOUT))
        soup = BeautifulSoup(page.text, "html.parser")

        # get all the jobs
        jobs = []
        for card in soup.select(self.config['jobCardSelector']):
            name = self._cardText(card, 'nameSelector')
            link = card.select_one(self.config['hrefSelector'])
            path = None if link is None else link.get('href')

            # a card without a title or link cannot become a job; skip it
            # rather than letting one odd card abort the whole site
            if name is None or path is None:
                logger.warning(f"\tgetJobs:\tskipping a card without a title or link on {searchURL}")
                continue

            href = self.config['baseURL'] + path
            postedRelative = (self._cardText(card, 'dateSelector') or "").upper()
            company = self._cardText(card, 'companySelector') or ""

            # skip it if the fuzz score is too low
            fuzzy = fuzz.ratio(self.config['searchTerm'], name)
            if fuzzy < self.config['minFuzzScore']:
                logger.info(f"\tgetJobs:\tignoring [{fuzzy}]\t{name}")
                continue

            logger.debug(f"\tgetJobs:\ttrying to parse {postedRelative}")
            daysOld = self._parseDaysOld(postedRelative)
            if daysOld is None:
                logger.error(f"\tgetJobs:\tunable to parse {postedRelative}")
                daysOld = self.UNKNOWN_AGE

            # skip it if it's older than the max age
            if daysOld > self.config['maxAge']:
                logger.info(f"\tgetJobs:\tignoring {daysOld} days old - {name}")
                continue

            j = Job(name)
            j.fuzzyWuzzy = fuzzy
            j.url = href
            j.daysOld = daysOld
            j.posted = date.today() - timedelta(days=daysOld)
            j.closes = j.posted + timedelta(days=14)
            j.company = company
            j.config = self.config
            jobs.append(j)

        return jobs
    
    

In [5]:
# one configuration per job board to search; every entry carries the same keys:
#   searchTerm / maxAge        -> substituted into searchURL as {0} / {1}
#   regex                      -> (pattern, replacement) applied to searchTerm first
#   minFuzzScore               -> minimum title similarity for a card to be kept
#   baseURL                    -> prefix added to each card's href
#   headers                    -> HTTP headers sent with every request
#   jobCard/name/href/date/companySelector -> CSS selectors used on the result page
#   location/detailSelector    -> CSS selectors used on the posting page
configs = [
    # indeed
    dict(
        searchTerm="salesforce administrator",
        maxAge=2,
        searchURL="https://www.indeed.com/jobs?q={0}&l=remote&fromage={1}",
        minFuzzScore=60,
        regex=(" ", "%20"),
        baseURL="https://www.indeed.com",
        headers={'user-agent': 'Mozilla/5.0'},
        jobCardSelector=".result",
        nameSelector=".jobtitle",
        hrefSelector=".jobtitle",
        dateSelector=".date",
        companySelector="span.company",
        locationSelector=".icl-u-xs-mt--xs > div > div:nth-of-type(3)",
        detailSelector="#jobDescriptionText",
    ),
    # career builder
    dict(
        searchTerm="salesforce administrator",
        maxAge=2,
        searchURL="https://www.careerbuilder.com/jobs?posted={1}&pay=&cat1=&radius=&emp=&cb_apply=false&keywords={0}&location=&cb_workhome=true",
        minFuzzScore=60,
        regex=(" ", "%20"),
        baseURL="https://www.careerbuilder.com",
        headers={'user-agent': 'Mozilla/5.0'},
        jobCardSelector=".data-results-content-parent",
        nameSelector=".data-results-title",
        hrefSelector="a.job-listing-item",
        dateSelector=".data-results-publish-time",
        companySelector=".data-details > span:nth-of-type(1)",
        locationSelector=".data-details > span:nth-of-type(3)",
        detailSelector="#jdp_description > div:nth-of-type(1) > div:nth-of-type(1)",
    ),
    # monster
    dict(
        searchTerm="salesforce administrator",
        maxAge=2,
        searchURL="https://www.monster.com/jobs/search/?q={0}&tm={1}",
        minFuzzScore=60,
        regex=(" ", "%20"),
        baseURL="",
        headers={'user-agent': 'Mozilla/5.0'},
        jobCardSelector="div.flex-row",
        nameSelector=".title",
        hrefSelector=".title > a",
        dateSelector="time",
        companySelector="div.company",
        locationSelector="div.location",
        detailSelector="div.job-description",
    ),
    # glass door
    # NOTE(review): the "KO0,24" segment of searchURL looks tied to the search
    # term (24 == len("salesforce administrator")) - confirm before changing
    # searchTerm for this site.
    dict(
        searchTerm="salesforce administrator",
        maxAge=2,
        searchURL="https://www.glassdoor.com/Job/{0}-jobs-SRCH_KO0,24.htm?fromAge={1}&remoteWorkType=1",
        minFuzzScore=60,
        regex=(" ", "-"),
        baseURL="https://www.glassdoor.com",
        headers={'user-agent': 'Mozilla/5.0'},
        jobCardSelector="li.jl",
        nameSelector="a.jobTitle > span",
        hrefSelector="a.jobTitle",
        dateSelector="div[data-test='job-age']",
        companySelector="a.jobLink > span",
        locationSelector="div.flex-column > div > div:nth-of-type(3)",
        detailSelector="div.desc",
    ),
]



In [6]:
# Driver: search every configured site in parallel, fetch each posting's
# details in parallel, then write all jobs to a CSV file.
if __name__ == "__main__":

    starttime = datetime.now()
    allJobs = []
    scrapers = [JobScraper(**c) for c in configs]

    logger.info(f"\tmain:\t{len(scrapers)} scrapers created")

    # Insert tasks into the queue and let them run
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(s.getJobs) for s in scrapers]
        scraperByFuture = dict(zip(futures, scrapers))

        for future in concurrent.futures.as_completed(futures):
            # result() re-raises whatever the worker raised; log it and carry
            # on so one failing site does not discard the others' results
            try:
                found = future.result()
            except Exception:
                failedURL = scraperByFuture[future].config.get('searchURL')
                logger.exception(f"\tmain:\tsearch failed for {failedURL}")
                continue
            logger.debug(f"\tmain:\tadding {len(found)} jobs to allJobs")
            allJobs.extend(found)

    logger.info(f"\tmain:\tretrieving details for {len(allJobs)} jobs")

    # Create another threadpool for all the details
    with concurrent.futures.ThreadPoolExecutor() as executor:
        logger.info(f"\tmain:\tadding {len(allJobs)} threads to the executor")
        futures = [executor.submit(j.retrieveDetails) for j in allJobs]

    # Leaving the with-block waits for every task. An exception raised inside
    # a worker stays hidden in its future unless it is asked for, so report
    # them here; the job is still written, with whatever details it has.
    for j, future in zip(allJobs, futures):
        error = future.exception()
        if error is not None:
            logger.error(f"\tmain:\tcould not retrieve details for {j.url}: {error!r}")

    endtime = datetime.now()
    logger.info(f"\tmain:\ttime to run: {endtime - starttime}")

    # create our csv file for dataloader
    outfile = 'classy.csv'
    # newline='' is what the csv module requires so that line endings and the
    # newlines embedded in descriptions are written correctly; utf-8 keeps
    # non-ASCII text from failing under a narrower platform default encoding
    with open(outfile, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['Name','OriginalURL','Description','PostingDate','CloseDate','Stage'])

        # then write each collection of jobs to the file
        writer.writerows(j.getDataloader() for j in allJobs)

    logger.info(f"\tmain:\t{outfile} created")
    

INFO:	jobExecutor:	main:	4 scrapers created
INFO:	jobExecutor:	getJobs:	ignoring [59]	Marketing Operations Administrator
INFO:	jobExecutor:	getJobs:	ignoring [45]	LMS Administrator, Program Manager
INFO:	jobExecutor:	getJobs:	ignoring [48]	Financial Force Consultant
INFO:	jobExecutor:	getJobs:	ignoring [29]	School Success Associate (South-East Region) - Always hiring
INFO:	jobExecutor:	getJobs:	ignoring [25]	Technical Program Manager - Sales & Marketing Technology (Re...
INFO:	jobExecutor:	getJobs:	ignoring [32]	Senior Enterprise Implementation Consultant - Remote, US
INFO:	jobExecutor:	getJobs:	ignoring [28]	Account Executive (Midwest Region)
INFO:	jobExecutor:	getJobs:	ignoring [34]	Territory Manager, Acute Pain
INFO:	jobExecutor:	getJobs:	ignoring [30]	Account Consultant, Digestive Health
INFO:	jobExecutor:	getJobs:	ignoring [43]	Salesforce CPQ Architect - Remote working USA
INFO:	jobExecutor:	getJobs:	ignoring [59]	Salesforce Architect
INFO:	jobExecutor:	getJobs:	ignoring [36]	Reve

FileNotFoundError: [Errno 2] No such file or directory: '~/Desktop/ClassyJobs.csv'

INFO:	jobExecutor:	main:	classy.csv created
