In [1]:
#Import Libraries
from urllib.parse import quote_plus

from bs4 import BeautifulSoup
from tqdm import tqdm_notebook

import pandas as pd
import requests

## Functions

In [2]:
#Extract Job URLs
def extract_job_urls_from_result(query_soup):
    """
    Collect the individual job urls listed on one query result page.

    Input: soup object containing the result of a job query (e.g. Data Scientist jobs in New York)
    Output: list of the individual job urls of the jobs found in the query

    Only anchors tagged data-tn-element="jobTitle" inside <div class="row">
    elements are considered. Anchors that carry no href are skipped instead
    of raising a KeyError.
    """
    urls = []
    for div in query_soup.find_all(name = "div", attrs = {"class":"row"}):
        for a in div.find_all(name = "a", attrs = {"data-tn-element":"jobTitle"}):
            this_url = a.get('href')
            if not this_url:
                continue #nothing to build a job url from
            #Drop the first 7 characters of the relative link and attach the
            #remainder (the query string) to the viewjob endpoint.
            #NOTE(review): presumably those 7 characters are an "/rc/clk" prefix -
            #confirm against the live markup; other link shapes would be mangled.
            to_go_url = "https://www.indeed.com/viewjob" + this_url[7:]
            urls.append(to_go_url)
    return urls

## Execution Parameters

In [3]:
#Relative location of the job titles .csv
#Raw string: the backslashes are literal (Windows-style) path separators, not
#escape sequences, so the value is unchanged but no invalid-escape warning is raised.
job_titles_path = r"..\..\Datasets\job_titles_small.csv"

#Cities to query, written the way they are inserted into the query URL
#('+' in place of a space)
city_list = ['New+York', 'Los+Angeles', 'Chicago', 'Houston',
             'Washington', 'Dallas', 'Seattle', 'Silicon+Valley',
             'Detroit', 'San+Francisco', 'Austin', 'Philadelphia',
             'Boston', 'Minneapolis', 'Phoenix', 'San+Jose']

jobs_perQuery_perCity = 25 #must be <= 50, take into account # of inaccessible ads
min_jobs_per_query = 35 #give some extra over the above (5-10 at least)

#Set accordingly to determine how many queries will be checked
jobs_stored = 6400 #total number of jobs stored in the .csv
#Number of leading queries to skip: one query per jobs_perQuery_perCity stored jobs
queries_completed = jobs_stored // jobs_perQuery_perCity

#queries_completed = 0 #or set manually

## Program Execution

In [4]:
#Read the job titles table from the configured .csv file
job_titles = pd.read_csv(job_titles_path, sep = ",")

#Plain Python list holding the values of the Title column
job_list = job_titles.Title.tolist()

In [5]:
#Create queries URLs - a list of (query_job, query_url) tuples
#The job title is percent-encoded with quote_plus (spaces still become '+') so
#titles containing characters such as '&', '#', '+' or '/' no longer corrupt
#the query string. Entries of city_list are inserted as given, since they are
#already written in URL form (e.g. 'New+York').
#NOTE(review): "%2420%2C000" decodes to "$20,000" and is appended directly after
#the title with no separator - presumably a salary term; confirm whether a '+'
#is expected between the title and this term.
#Pagination (an extra "&start=" offset of 0/50/100) is currently disabled, so
#only the first page of up to 50 results is requested per query.
queries_list = []

for query_job in job_list:
    encoded_job = quote_plus(query_job)
    for city in city_list:
        query_url = ("http://www.indeed.com/jobs?q=" + encoded_job +
                     "%2420%2C000&l=" + str(city) +
                     "&limit=50")
        queries_list.append((query_job, query_url))


### Main Loop

In [6]:
#Queries still to be checked - everything after the ones already completed
pending_queries = queries_list[queries_completed:]

#Jupyter Progress Bar init
pbar = tqdm_notebook(total = len(pending_queries), desc = "Checking...")

try:
    #Outer loop - go over the query results (many jobs in each)
    for query_job, query_url in pending_queries:

        #Send a request over the query URL and get a BeautifulSoup object out of it:
        page = requests.get(query_url, timeout = 300)
        soup = BeautifulSoup(page.text, "html.parser")

        #Retrieve the specific job urls from this listing
        job_urls = extract_job_urls_from_result(soup)

        #Flag queries that returned fewer job urls than the configured minimum
        if len(job_urls) < min_jobs_per_query:
            print("Watch out! Result size for query:", query_url, "is:", len(job_urls))

        pbar.update(1) #update progress bar
finally:
    #Always close the progress bar, even when a request fails or the run is interrupted
    pbar.close()

HBox(children=(IntProgress(value=0, description='Checking...', max=144), HTML(value='')))

KeyboardInterrupt: 