In [None]:
#Import Libraries
import time
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from html2text import html2text
from tqdm import tqdm_notebook

## Functions

In [None]:
#Extract Job URLs
def extract_job_urls_from_result(query_soup):
    """Collect the individual job URLs listed on a query result page.

    Input: soup object containing the result of a job query
           (e.g. Data Scientist jobs in New York)
    Output: list of the individual job urls of the jobs found in the query,
            in page order. Empty list when the page lists no jobs.
    """
    urls = []
    for div in query_soup.find_all(name = "div", attrs = {"class":"row"}):
        for a in div.find_all(name = "a", attrs = {"data-tn-element":"jobTitle"}):
            #An anchor without a link cannot be followed - skip it rather than
            #abort the whole scrape with a KeyError.
            this_url = a.get('href')
            if not this_url:
                continue
            #The first 7 characters of the relative link are dropped and the
            #remainder is appended to the "viewjob" endpoint.
            #NOTE(review): presumably the dropped prefix is "/rc/clk" - confirm
            #against the current markup of the result page.
            urls.append("https://www.indeed.com/viewjob" + this_url[7:])
    return urls

In [None]:
#Extract Job Title & Description
def extract_text_from_jobURL(url, timeout = 300):
    """Download one job page and return its cleaned title and description.

    Input: url - the full URL of a job that will be scraped
           timeout - seconds to wait for the server before giving up
                     (forwarded to requests.get)
    Output: A tuple containing the job title & the description, after applying
            some basic cleaning ('*' and '#' removed, whitespace collapsed).
            Returns None when the page matches neither of the two known
            layouts or when the description cannot be located.
    Raises: whatever requests.get raises on a network error or timeout; the
            calling loop handles these via `except OSError`.
    """
    #Request the job url and get a soup object reference to its contents.
    #Without a timeout a stalled connection would block the scrape forever.
    page = requests.get(url, timeout = timeout)
    soup = BeautifulSoup(page.text, "html.parser")

    #There are two different methods Indeed.com uses to list its jobs, so we
    #need to adapt. To check which is used, we will first get the job title.
    description = None

    #Method One - the job title is in a <b> tag
    title = soup.find('b', 'jobtitle')
    if title is not None:
        #Retrieve the job description; every lookup may fail, so guard each step
        table = soup.find('table', id = 'job-content')
        span = table.find('span', id = 'job_summary') if table is not None else None
        if span is not None:
            description = span.find('div') #Usually found inside this span's div
            if description is None: #But in some posts the div does not exist
                description = span

    #Method Two - the job title is in a <h3> tag
    else:
        title = soup.find('h3', 'icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title')
        if title is not None:
            #Retrieve the job description
            description = soup.find('div', 'jobsearch-JobComponent-description icl-u-xs-mt--md')

    #Unknown layout, or a known layout with no description: report failure
    #instead of crashing or storing the literal text "None".
    if title is None or description is None:
        return None

    #Checking complete, now return the result after applying basic cleaning.
    clean_title = html2text(str(title)).strip().replace('*', '').replace('#', '')
    clean_desc = ' '.join(html2text(str(description)).replace('*','').replace('#', '').split())
    return (clean_title, clean_desc)

## Execution Parameters

In [None]:
#Input / output locations. Raw strings keep the backslashes literal: sequences
#such as "\D" or "\s" are invalid escapes in a normal string literal and
#trigger warnings on recent Python versions.
job_titles_path = r"..\..\Datasets\job_titles_IT.csv"
results_path = r"..\..\Results\TempResults.csv"
urls_path = r"..\..\Results\setURLs.csv"

#job_titles_path = r"..\Datasets\Best Jobs in America.csv" #source: https://money.cnn.com/pf/best-jobs/2017/list/index.html

#Locations to query, already written the way they appear in the query URL
#(spaces replaced by '+').
city_list = ['New+York', 'Los+Angeles', 'Chicago', 'Houston',
             'Washington', 'Dallas', 'Seattle', 'Silicon+Valley',
             'Detroit', 'San+Francisco', 'Austin', 'Philadelphia',
             'Boston', 'Minneapolis', 'Phoenix', 'San+Jose']

max_jobs_to_get = None #optional, used to set total # of jobs to download, else leave None
jobs_perQuery_perCity = 25 #must be <= 50, take into account # of inaccessible ads

jobs_stored = 9800 #total number of jobs stored in the .csv
#Used to resume downloading: this many (job, city) queries are skipped.
#NOTE(review): assumes every completed query stored exactly
#jobs_perQuery_perCity jobs - confirm before resuming a partial run.
queries_completed = jobs_stored // jobs_perQuery_perCity
append_mode = True #control whether the .csv exists and should the results be appended

checkpoint_interval = 100 #how often to store the results into the .csv
allow_duplicates = False #checks for duplicate jobs (over each program run only)

## Program Execution

In [None]:
#Read the job titles dataset (comma-separated, must contain a "Title" column)
job_titles = pd.read_csv(job_titles_path)

#Keep only the titles, as a plain Python list to iterate over later
job_list = list(job_titles["Title"])

In [None]:
#Create queries URLs - a list of (query_job, query_url) tuples.
#One query is built per (job title, city) pair, asking for up to 50 results;
#result paging (&start=N) is not used.
queries_list = []

for query_job in job_list:
    #quote_plus turns spaces into '+' AND percent-encodes characters such as
    #'#', '+', '&' or '/' (e.g. "C#", "C++", "R&D") that would otherwise
    #truncate or corrupt the query string.
    encoded_job = quote_plus(query_job)
    for city in city_list:
        #NOTE(review): "%2420%2C000" decodes to "$20,000" and is glued directly
        #onto the job title - presumably a salary filter; confirm it is intended.
        query_url = "http://www.indeed.com/jobs?q=" + encoded_job + \
                    "%2420%2C000&l=" + str(city) + \
                    "&limit=50"
        queries_list.append((query_job, query_url))

#Initial Setup - in-memory buffer for jobs not yet written to results_path
df = pd.DataFrame(columns = ["ID", "Query", "Job Title", "Description"])


if max_jobs_to_get is None: #set number of jobs to be downloaded (in the end result)
    max_jobs_to_get = jobs_perQuery_perCity * len(queries_list)
print("max_jobs_to_get :", max_jobs_to_get)

#Always define the duplicate-tracking state so that later cells can reference
#it whatever the value of allow_duplicates.
#urls_df only buffers URLs that are NOT yet written to urls_path; visited holds
#every URL already stored.
urls_df = pd.DataFrame(columns = ["ID", "URL"])
visited = set()

if (not allow_duplicates):
    if jobs_stored == 0: #no URLs stored either - create the file with its header
        urls_df.to_csv(urls_path, index = False)
    else: #load the previously stored jobs' URLs (to avoid re-storing them)
        #Only the lookup set is filled from disk. Keeping the loaded rows in
        #urls_df would append them to urls_path a second time at the next
        #checkpoint.
        stored_urls = pd.read_csv(urls_path, sep = ",")
        visited = set(stored_urls["URL"])

### Main Loop

In [None]:
#Jupyter Progress Bar init
pbar = tqdm_notebook(initial = jobs_stored, total = max_jobs_to_get, desc = 'Loading Jobs')


def _save_checkpoint():
    """Write the buffered rows to the .csv files and empty the buffers.

    Works on the notebook-level state: `df` is written to `results_path`
    (created with a header the first time, appended afterwards) and, when
    duplicates are being tracked, `urls_df` is appended to `urls_path`.
    Both buffers are emptied afterwards and `append_mode` is set to True.
    """
    global df, urls_df, append_mode
    if (not append_mode): #first time saving
        df.to_csv(results_path, index = False)
        append_mode = True
    else:
        df.to_csv(results_path, mode = 'a', index = False, header = False)
    df = df.iloc[0:0] #empty contents in memory
    if (not allow_duplicates):
        urls_df.to_csv(urls_path, mode = 'a', index = False, header = False)
        urls_df = urls_df.iloc[0:0]


try:
    #Outer loop - go over the query results (many jobs in each)
    for query_job, query_url in queries_list[queries_completed:]:

        #">=" rather than "!=": a target that is overshot must still stop the run
        if jobs_stored >= max_jobs_to_get: #all downloaded, stop
            break

        #Send a request over the query URL and get a BeautifulSoup object out of it:
        try:
            page = requests.get(query_url, timeout = 300)
            soup = BeautifulSoup(page.text, "html.parser")
        except OSError:
            #Buffered rows are written by the final save in the finally block
            print("OSError was caught at query retrieval! Saving and exiting...")
            break

        #Retrieve the specific job urls from this listing (may be empty)
        job_urls = extract_job_urls_from_result(soup)

        query_jobs_stored = 0 #the jobs found for this specific query

        #Inner loop - retrieve the title and the description for each job found
        for item in job_urls:

            if query_jobs_stored == jobs_perQuery_perCity: #collected enough jobs
                break

            if jobs_stored >= max_jobs_to_get: #overall target reached mid-query
                break

            #visited only exists when duplicates are being tracked
            if (not allow_duplicates) and item in visited: #already captured, ignore
                continue

            #Extract job info (title and description).
            #A short time.sleep(...) here can help to avoid getting blocked.
            try:
                job_info = extract_text_from_jobURL(item)
            except OSError: #network error or timeout
                print("OSError was caught and ignored!")
                continue

            if job_info is None:
                print("Job retrieval failed for url:", item)
                continue

            #all good
            query_jobs_stored += 1
            jobs_stored += 1

            #store the job details in the dataframe
            df.loc[len(df)] = [jobs_stored, query_job, job_info[0], job_info[1]]

            if (not allow_duplicates):
                visited.add(item) #mark it as visited so to not store it again
                urls_df.loc[len(urls_df)] = [jobs_stored, item]

            pbar.update(1) #update progress bar

            if ((jobs_stored % checkpoint_interval) == 0): #if it's time to save
                print("Storing the results...Total saved up to now:", jobs_stored)
                _save_checkpoint()
finally:
    pbar.close()
    #Final save: rows collected since the last checkpoint would otherwise be
    #lost when the loop ends, the target is reached, or the run is interrupted.
    if len(df) > 0:
        print("Storing the results...Total saved up to now:", jobs_stored)
        _save_checkpoint()