In [None]:
import requests
import bs4
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import html2text
from datetime import date
from datetime import time
from datetime import datetime
from datetime import timedelta
import csv
import re
from fuzzywuzzy import fuzz
import threading
import concurrent.futures
from dataclasses import dataclass
from pympler import asizeof
import copy
from stem import Signal
from stem.control import Controller
from fake_useragent import UserAgent
import psutil
import time
import random
import os


In [None]:
# setup the logging
# level = DEBUG < INFO < WARNING < ERROR < CRITICAL
import sys
import logging
# send every record at INFO level or above to stdout
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# shared logger used by the classes and functions defined below
logger = logging.getLogger('\tJobScraper')


In [None]:
# setup some defaults to use everywhere
# maximum posting age: substituted into each search URL and compared with a job's parsed days-old value
defMaxAge = 1
# minimum fuzz.ratio score (search term vs. job title) a job needs to be kept
defMinFuzzScore = 60
# (pattern, replacement) applied with re.sub to the search term before it goes into the search URL
defRegex = (" ", "%20")

# every term is searched on every configured site; main() takes them from the end of this list
searchTerms = ['salesforce administrator', 'junior salesforce', 'jr salesforce', 'entry level salesforce']

# CSV file that saveJobs() appends to
outfile = 'slots.csv'
# local SOCKS ports passed to getProxies(); main() pairs one search term with each port per batch
#torPorts = None
#torPorts = ['9050']
torPorts = ['9050', '9150', '9250', '9350']

In [None]:
# define a config object
@dataclass
class ScraperConfig:
    """Everything JobScraper and Job need to search one job site.

    All fields are required (no defaults). The *Selector fields are CSS
    selectors handed to BeautifulSoup's select()/select_one().
    """
    __slots__ = ['searchTerm', 'maxAge', 'searchURL', 
                 'minFuzzScore', 'regex', 'baseURL', 'headers', 'proxies', 
                 'jobCardSelector', 'nameSelector', 'hrefSelector', 'dateSelector', 'companySelector', 
                 'locationSelector', 'detailSelector']
    # text to search for; also the reference string for the fuzzy title match
    searchTerm: str
    # formatted into searchURL as {1}; jobs whose parsed age exceeds it are skipped
    maxAge: int
    # str.format template: {0} is the (regex-substituted) search term, {1} is maxAge
    searchURL: str
    # jobs whose fuzz.ratio score is below this are skipped
    minFuzzScore: int
    # (pattern, replacement) pair passed to re.sub on searchTerm before building the URL
    regex: tuple
    # prefix prepended to each job card's href
    baseURL: str
    # passed as headers= to requests.get
    headers: dict
    # passed as proxies= to requests.get
    proxies: dict
    # selects every job card on the search-results page
    jobCardSelector: str
    # the next four are evaluated inside each job card
    nameSelector: str
    hrefSelector: str
    dateSelector: str
    companySelector: str
    # the last two are evaluated on the individual job page
    locationSelector: str
    detailSelector: str


In [None]:
# define a job object
@dataclass
class Job:
    """One job posting found on a search-results page.

    JobScraper.getJobs() fills in the fields available on the results card;
    retrieveDetails() then loads the posting's own page to fill in
    ``location`` and ``description``.
    """
    __slots__ = ['name', 'url', 'description', 'company', 'location',
                 'posted', 'closes', 'stage', 'fuzzyWuzzy', 'config', 'daysOld']
    name: str
    url: str
    description: str
    company: str
    location: str
    # `date` is the class imported from the datetime module; with
    # `from datetime import datetime` in scope, `datetime.date` is a method.
    posted: date
    closes: date
    stage: str
    fuzzyWuzzy: int
    # quoted so defining Job does not require ScraperConfig to be evaluated first
    config: "ScraperConfig"
    daysOld: int

    # mostly just for quick debugging
    def __str__(self):
        return f'Name: {self.name}\n URL: {self.url}'

    # get the remaining details from the job's url field
    def retrieveDetails(self, minSleep=20, maxSleep=120, timeout=60):
        """Fetch the job page and fill in ``location`` and ``description``.

        Args:
            minSleep, maxSleep: inclusive bounds, in seconds, of the random
                pause taken before the request.
            timeout: seconds passed to requests.get so a stalled connection
                cannot block the calling thread forever.

        Raises:
            Whatever requests.get raises (connection errors, timeouts).
        """
        # slow the thread down by waiting a while
        sleeptime = random.randint(minSleep, maxSleep)
        logger.info(f"\tretrieveDetails:\tsleeping {sleeptime}s before getting details")
        time.sleep(sleeptime)

        # get the actual job page
        logger.debug(f"\tretrieveDetails:\tgetting {self.url}\nusing headers {self.config.headers}")
        page = requests.get(self.url, headers=self.config.headers,
                            proxies=self.config.proxies, timeout=timeout)
        soup = BeautifulSoup(page.text, "html.parser")

        # parse out the remaining details
        logger.debug(f"\tretrieveDetails:\tlooking for '{self.config.locationSelector}' and '{self.config.detailSelector}'")
        self._applyDetails(soup)
        return

    # copy location/description out of a parsed job page
    def _applyDetails(self, soup):
        """Set ``location`` and ``description`` from ``soup``.

        A selector that matches nothing leaves the field unchanged and logs a
        warning, so one missing element does not discard the whole job.
        """
        locationTag = soup.select_one(self.config.locationSelector)
        if locationTag is None:
            logger.warning(f"\tretrieveDetails:\tno location found for {self.url}")
        else:
            self.location = locationTag.text.strip()

        detailTag = soup.select_one(self.config.detailSelector)
        if detailTag is None:
            logger.warning(f"\tretrieveDetails:\tno description found for {self.url}")
        else:
            self.description = html2text.html2text(detailTag.prettify())

    # returns a list for a CSV file
    def getDataloader(self):
        """Return [name, url, description, posted, closes, stage]; dates as MM/DD/YYYY."""
        return [self.name,
                self.url,
                self.description,
                self.posted.strftime("%m/%d/%Y"),
                self.closes.strftime("%m/%d/%Y"),
                self.stage]


In [None]:
# define the job scrapers
@dataclass
class JobScraper(object):
    """Runs one search against one site, as described by ``config``."""
    __slots__ = ['config']
    # quoted so defining JobScraper does not require ScraperConfig to be evaluated first
    config: "ScraperConfig"

    @staticmethod
    def _parseDaysOld(postedRelative):
        """Turn an upper-cased relative posting label into whole days.

        Returns 0 for the known "today" labels, otherwise reads the first
        number in the label and scales it by the unit that follows it
        (hours/minutes round down to days, weeks x7, months x30, years x365,
        anything else is taken as days). Returns None when no number is found.
        """
        if postedRelative in ('TODAY', 'JUST POSTED', 'POSTED TODAY', '24H'):
            return 0

        found = re.search(r"(\d+)\s*\+?\s*([A-Z]*)", postedRelative)
        if found is None:
            return None

        count = int(found.group(1))
        unit = found.group(2)
        if unit.startswith('MIN'):
            return 0
        if unit.startswith('H'):
            return count // 24
        if unit.startswith('W'):
            return count * 7
        if unit.startswith('MO'):
            return count * 30
        if unit.startswith('Y'):
            return count * 365
        return count

    def getJobs(self, timeout=60):
        """Search the configured site and return the matching Job objects.

        Cards are skipped when the fuzzy title score is below
        ``config.minFuzzScore`` or the posting is older than ``config.maxAge``.
        Each kept job has retrieveDetails() called on it before it is added.

        Args:
            timeout: seconds passed to requests.get for the search page.

        Returns:
            A list of Job; empty when the search page cannot be retrieved.
        """
        jobs = []

        # apply the regex to the search term(s) if necessary
        try:
            searchTerm = re.sub(self.config.regex[0], self.config.regex[1], self.config.searchTerm)
        except Exception:
            # e.g. regex is None or malformed: fall back to the raw term
            searchTerm = self.config.searchTerm

        # bound before the try so the error log below can always print it,
        # even when format() itself is what fails
        searchURL = self.config.searchURL

        # get the page
        try:
            searchURL = self.config.searchURL.format(searchTerm, self.config.maxAge)
            logger.debug(f"\tgetJobs:\t {searchURL}")
            page = requests.get(searchURL, headers=self.config.headers,
                                proxies=self.config.proxies, timeout=timeout)
            soup = BeautifulSoup(page.text, "html.parser")
            cards = soup.select(self.config.jobCardSelector)
        except Exception as e:
            logger.error(f"\tgetJobs:\t unable to retrieve jobs from {searchURL}\n{e}")
            return jobs

        # get all the jobs
        for card in cards:
            try:
                name = card.select_one(self.config.nameSelector).text.strip()
                fuzzy = fuzz.ratio(self.config.searchTerm, name)
                href = self.config.baseURL + card.select_one(self.config.hrefSelector)['href']
                postedRelative = card.select_one(self.config.dateSelector).text.strip().upper()
                company = card.select_one(self.config.companySelector).text.strip()

                # skip it if the fuzz score is too low
                if (fuzzy < self.config.minFuzzScore):
                    logger.info(f"\tgetJobs:\tignoring [{fuzzy}]\t{name}")
                    continue

                logger.debug(f"\tgetJobs:\ttrying to parse {postedRelative}")
                daysOld = self._parseDaysOld(postedRelative)
                if daysOld is None:
                    # unparseable age: treat as very old so the max-age check drops it
                    logger.error(f"\tgetJobs:\tunable to parse {postedRelative}")
                    daysOld = 9999

                # skip it if it's older than the max age
                if (daysOld > self.config.maxAge):
                    logger.info(f"\tgetJobs:\tignoring {daysOld} days old - {name}")
                    continue

                # read the clock once so posted and closes cannot straddle midnight
                posted = date.today() - timedelta(days=daysOld)
                j = Job(name = name,
                        url = href,
                        description = None,
                        company = company,
                        location = None,
                        posted = posted,
                        closes = posted + timedelta(days=14),
                        stage = "new",
                        fuzzyWuzzy = fuzzy,
                        config = self.config,
                        daysOld = daysOld)

                j.retrieveDetails()
                jobs.append(j)

            except Exception as e:
                # one bad card must not stop the rest of the page
                logger.info(f"\tgetJobs:\tException: {e}")

        return jobs
    

In [None]:
# One ScraperConfig per job site. searchTerm, headers and proxies are only
# placeholders here; main() overwrites them on a deep copy for every search.
configs = [
    # indeed
    ScraperConfig(
        searchTerm="generic indeed config",
        maxAge=defMaxAge,
        searchURL="https://www.indeed.com/jobs?q={0}&l=remote&fromage={1}",
        minFuzzScore=defMinFuzzScore,
        regex=defRegex,
        baseURL="https://www.indeed.com",
        headers=None,
        proxies=None,
        jobCardSelector=".result",
        nameSelector=".jobtitle",
        hrefSelector=".jobtitle",
        dateSelector=".date",
        companySelector="span.company",
        locationSelector=".icl-u-xs-mt--xs > div > div:nth-of-type(3)",
        detailSelector="#jobDescriptionText",
    ),
    # career builder
    ScraperConfig(
        searchTerm="generic career builder config",
        maxAge=defMaxAge,
        searchURL="https://www.careerbuilder.com/jobs?posted={1}&pay=&cat1=&radius=&emp=&cb_apply=false&keywords={0}&location=&cb_workhome=true",
        minFuzzScore=defMinFuzzScore,
        regex=defRegex,
        baseURL="https://www.careerbuilder.com",
        headers=None,
        proxies=None,
        jobCardSelector=".data-results-content-parent",
        nameSelector=".data-results-title",
        hrefSelector="a.job-listing-item",
        dateSelector=".data-results-publish-time",
        companySelector=".data-details > span:nth-of-type(1)",
        locationSelector=".data-details > span:nth-of-type(3)",
        detailSelector="#jdp_description > div:nth-of-type(1) > div:nth-of-type(1)",
    ),
    # monster
    ScraperConfig(
        searchTerm="generic monster config",
        maxAge=defMaxAge,
        searchURL="https://www.monster.com/jobs/search/?q={0}&tm={1}",
        minFuzzScore=defMinFuzzScore,
        regex=defRegex,
        baseURL="",
        headers=None,
        proxies=None,
        jobCardSelector="div.flex-row",
        nameSelector=".title",
        hrefSelector=".title > a",
        dateSelector="time",
        companySelector="div.company",
        locationSelector="div.location",
        detailSelector="div.job-description",
    ),
    # glass door (joins the words of the search term with "-" instead of "%20")
    ScraperConfig(
        searchTerm="generic glass door config",
        maxAge=defMaxAge,
        searchURL="https://www.glassdoor.com/Job/{0}-jobs-SRCH_KO0,24.htm?fromAge={1}&remoteWorkType=1",
        minFuzzScore=defMinFuzzScore,
        regex=(" ", "-"),
        baseURL="https://www.glassdoor.com",
        headers=None,
        proxies=None,
        jobCardSelector="li.jl",
        nameSelector="a.jobTitle > span",
        hrefSelector="a.jobTitle",
        dateSelector="div[data-test='job-age']",
        companySelector="a.jobLink > span",
        locationSelector="div.flex-column > div > div:nth-of-type(3)",
        detailSelector="div.desc",
    ),
]


In [None]:
# returns a dictionary for the specified port
def getProxies(port=None):
    """Build the requests ``proxies`` mapping for a local SOCKS5 port.

    Args:
        port: port number (str or int) of a SOCKS5 proxy on 127.0.0.1,
            or None for a direct connection.

    Returns:
        None when ``port`` is None, otherwise a dict that routes both http
        and https traffic through ``socks5://127.0.0.1:<port>``.
    """
    # identity test: `== None` can be fooled by objects that override __eq__
    if port is None:
        return None

    # NOTE(review): with the plain socks5 scheme, requests resolves host names
    # locally; socks5h would resolve them through the proxy - confirm which is wanted.
    proxyURL = f'socks5://127.0.0.1:{port}'
    return {'http': proxyURL, 'https': proxyURL}

In [None]:
# actually do the job scraping now!
def _renewTorIdentities(ports, controlPort=9051):
    """Send NEWNYM to the Tor controller and log the IP seen through each port.

    Best effort: any failure is logged and swallowed so that jobs already
    being collected by main() are not lost.
    """
    try:
        with Controller.from_port(port = controlPort) as controller:
            controller.authenticate()
            controller.signal(Signal.NEWNYM)
            for port in ports:
                newIP = requests.get("https://ident.me", proxies=getProxies(port), timeout=60).text
                logger.info(f"\tmain:\tnew IP: {newIP}")
    except Exception as e:
        logger.error(f"\tmain:\tunable to get new IPs: {e}")


def main():
    """Run every search term against every configured site.

    Terms are processed in batches of one term per port; within a batch each
    (term, site) pair runs JobScraper.getJobs in its own worker thread.

    Returns:
        A list with every Job returned by the scrapers.
    """
    allJobs = []

    # work on a copy so the module-level list survives and the cell can be re-run
    pending = list(searchTerms)

    # torPorts may be None or empty: use a single direct (un-proxied) slot then
    usingTor = bool(torPorts)
    ports = torPorts if usingTor else [None]

    # while there are still things to search for
    while len(pending) > 0:
        logger.debug(f"\tmain:\touter: {len(pending)} terms remaining")

        # create a thread pool for each search, site, and proxy combo
        # Insert tasks into the queue and let them run
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = []

            # pair each available port with one remaining term
            for port in ports:
                if len(pending) == 0:
                    break
                search = pending.pop()

                # deep copy so every scraper owns its own config
                for siteConfig in copy.deepcopy(configs):
                    logger.debug(f"\tmain:\tinner:\tbase {siteConfig.baseURL}, search {search}, port {port}")
                    # fill in the config and create the scraper
                    siteConfig.searchTerm = search
                    siteConfig.proxies = getProxies(port)
                    siteConfig.headers = {'User-Agent':UserAgent().random}
                    js = JobScraper(siteConfig)

                    # create a thread to fill in the job info
                    futures.append(executor.submit(js.getJobs))

            # if any of the ports are going to get reused, then get new IPs
            # NOTE(review): this is sent while the batch's threads are still
            # running - confirm that rotating mid-batch is intended.
            if usingTor and len(pending) > 0:
                _renewTorIdentities(ports)

            # as threads complete, add their jobs to the allJobs list
            for future in concurrent.futures.as_completed(futures):
                found = future.result()
                logger.info(f"\tmain:\tadding {len(found)} jobs to allJobs")

                for r in found:
                    logger.debug(f"\tmain:\tadding {r.name} to allJobs")
                    allJobs.append(r)

    return allJobs

In [None]:
def saveJobs(allJobs, filename=None):
    """Append the jobs to a CSV file, writing the header row first if needed.

    Args:
        allJobs: iterable of objects with a getDataloader() method that
            returns one CSV row as a list.
        filename: file to append to; defaults to the module-level ``outfile``.
    """
    target = outfile if filename is None else filename
    # a missing or empty file still needs its header row
    newFile = not (os.path.isfile(target) and os.path.getsize(target) > 0)

    # create our csv file for dataloader
    # newline='' lets the csv module control line endings, which keeps the
    # line breaks inside job descriptions intact; utf-8 avoids platform-
    # dependent encode errors on non-ASCII text.
    with open(target, mode='a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

        if newFile:
            writer.writerow(['Name','OriginalURL','Description','PostingDate','CloseDate','Stage'])

        # then write each collection of jobs to the file
        writer.writerows(j.getDataloader() for j in allJobs)

    logger.info(f"\tmain:\t{target} created")
    

In [None]:
# actually do the job scraping now!
# run the whole pipeline: scrape, append the results to the CSV, log the elapsed time
if __name__ == "__main__":
    # wall-clock start of the run
    starttime = datetime.now()
    allJobs = main()
    saveJobs(allJobs)
    endtime = datetime.now()
    # endtime - starttime is a timedelta, logged in its default string form
    logger.info(f"\ttime to run: {endtime - starttime}")
    