# Web Scraping for Indeed.com & Predicting Salaries

In [2]:
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup

In [99]:
def get_jobs2(URL):
    '''
    Downloads a URL and collects its job-posting div tags.

    The page is parsed with the 'lxml' parser and searched three times, once
    for each class value: 'row result', ' row result' and 'row sjlast result'.
    Returns a list with the matches of the first search, followed by those of
    the second and then those of the third.
    '''
    page = requests.get(URL)
    parsed = BeautifulSoup(page.text, 'lxml')
    matches = []
    # One pass per class value keeps the results grouped by class
    for css_class in ('row result', ' row result', 'row sjlast result'):
        matches.extend(parsed.find_all('div', {'class': css_class}))
    return matches

In [74]:
def extract_field(s, tag, class_=None, field_=None, text=True):
    '''
    Returns the first occurrence of a tag inside a parsed element.

    Parameters:
        s: object exposing find_all (a bs4 element). Anything without that
           method, such as None, makes the function return None.
        tag: name of the tag to look for.
        class_: name of the attribute used to narrow down the search.
        field_: value the attribute must have. When it is None the search is
           done by tag name only and class_ is ignored.
        text: with a filter, True returns the stripped text of the match and
           False returns the matched object itself so that more filtering can
           be performed on it. Without a filter this flag is ignored and the
           text is returned as found (not stripped).

    Returns:
        The text (or the matched object) of the first match, or None when
        nothing matches.
    '''
    try:
        if field_ is None:
            for d in s.find_all(tag):
                return d.text
        else:
            for d in s.find_all(tag, {class_: field_}):
                return d.text.strip() if text else d
    except AttributeError:
        # s has no find_all (e.g. the parent lookup returned None). Only this
        # case is treated as "not found"; anything else, such as a
        # KeyboardInterrupt, is allowed to propagate.
        return None
    return None

def extract_location(s):
    '''
    Returns the text of the first span with class 'location' in s, or None.
    '''
    return extract_field(s, 'span', class_='class', field_='location')
    
def extract_company(s):
    '''
    Returns the text of the first span with class 'company' in s, or None.
    '''
    return extract_field(s, 'span', class_='class', field_='company')

def extract_job(s):
    '''
    Returns the text of the first 'a' tag in s, or None.
    '''
    return extract_field(s, tag='a')

def extract_salary(s):
    '''
    Returns the salary text of a job posting, or None when there is none.

    The span with class 'no-wrap' is tried first. If it is missing, the text
    of the first div inside the div with class 'sjcl' is used instead.
    The result is stripped; an empty text is reported as None.
    '''
    salary = extract_field(s, 'span', 'class', 'no-wrap')
    if salary is None:
        # Fallback: look inside the 'sjcl' container
        container = extract_field(s, 'div', 'class', 'sjcl', text=False)
        salary = extract_field(container, 'div', text=True)
    return salary.strip() if salary else None


In [101]:
def get_parse_jobs(url):
    '''
    Combining the previous functions, this one does the GET request and
    extracts all necessary fields from every job posting found.

    Parameters:
        url: address of the results page, passed unchanged to get_jobs2.

    Returns:
        A dataframe with the columns 'location', 'company', 'title' and
        'salary', holding one row per posting returned by get_jobs2. It has
        the columns but no rows when no posting is found.
    '''
    columns = ['location', 'company', 'title', 'salary']
    # Collect plain rows and build the dataframe once: DataFrame.append copied
    # the whole frame on every call and no longer exists in pandas 2.0+.
    rows = [
        [extract_location(job), extract_company(job),
         extract_job(job), extract_salary(job)]
        for job in get_jobs2(url)
    ]
    return pd.DataFrame(rows, columns=columns)

In [102]:
import time
import datetime

# Search URL: the first {} takes the city, the second the result offset
url_template = "http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l={}&start={}"
max_results_per_city = 5000 

# Init results: one dataframe per scraped page
results = []

# Loop for cities and result pages (offsets 0, 10, 20, ... per city)
for city in set(['New+York', 'Chicago', 'San+Francisco', 'Austin', 'Seattle', 
    'Los+Angeles', 'Philadelphia', 'Atlanta', 'Dallas', 'Pittsburgh', 
    'Portland', 'Phoenix', 'Denver', 'Houston', 'Miami']):
    for start in range(0, max_results_per_city, 10):
        url = url_template.format(city, start)
        query = get_parse_jobs(url)
        time.sleep(0.2)  # short pause between consecutive requests
        results.append(query)
    # Progress line. It is formatted as a single string so that it prints the
    # same text with the Python 2 print statement and the Python 3 function.
    print('%s %s' % (city, datetime.datetime.now()))

Houston 2017-06-21 22:41:56.100185
Phoenix 2017-06-21 22:49:03.561165
Chicago 2017-06-21 22:56:21.847519
San+Francisco 2017-06-21 23:03:42.040795
New+York 2017-06-21 23:11:51.978023
Dallas 2017-06-21 23:19:27.436621
Philadelphia 2017-06-21 23:27:04.974835
Denver 2017-06-21 23:34:37.023838
Los+Angeles 2017-06-21 23:42:14.142316
Pittsburgh 2017-06-21 23:49:22.286639
Miami 2017-06-22 00:01:33.940304
Atlanta 2017-06-22 00:09:50.531932
Seattle 2017-06-22 00:17:03.601376
Austin 2017-06-22 00:24:43.241152
Portland 2017-06-22 00:31:44.263943


In [103]:
# Stacks the list of per-page dataframes into one, renumbering the index
r = pd.concat(objs=results, ignore_index=True)

In [104]:
# Verify: show the first 10 rows
r.head(n=10)

Unnamed: 0,location,company,title,salary
0,"Houston, TX",MD Anderson Cancer Center,Institute Assoc Scientist III - Neurodegenerat...,"$62,500 - $94,000 a year"
1,"Houston, TX",Preferred Sands,Research Scientist,
2,United States,Predictive Science,Data Scientist,
3,"Houston, TX",McKinsey & Company,"Analyst - Healthcare Analytics & Delivery, McK...",
4,"Houston, TX 77046 (Montrose area)",Sunnova Energy Corporation,Quantitative Analyst,
5,"Houston, TX 77073",Baker Hughes,Research and Development Scientist - Houston TX,
6,"Baytown, TX 77520",ExxonMobil,Polymer Material Informatics Research Scientist,
7,"Houston, TX 77032",Halliburton,Statistician,
8,"Houston, TX 77042 (West Houston area)",Tessella,Senior Oil & Gas Analytics Consultant,
9,"Houston, TX",MD Anderson Cancer Center,Computational Scientist,"$76,400 - $114,600 a year"


In [105]:
# Write the combined dataframe to ds_jobs.csv
r.to_csv('ds_jobs.csv', encoding='utf8')

In [None]:
# This code is used only to collect the raw data.
# Cleaning the data is the next step.