# Web Scraping for Indeed.com and Predicting Data Science Salaries

In [10]:
#URL = "http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l=New+York&start=10"
URL = "https://www.indeed.com/jobs?q=data+scientist+$20,000&l=New+York&start=10"

In [11]:
import requests
import bs4
from bs4 import BeautifulSoup

In [12]:
response = requests.get(URL)
soup = BeautifulSoup(response.text, 'lxml')

In [13]:
jobs = soup.find_all('div', {'class':' row result'})
jobs[0]

<div class=" row result" data-jk="e3da67e5e297f668" data-tn-component="organicJob" id="p_e3da67e5e297f668" itemscope="" itemtype="http://schema.org/JobPosting">\n<h2 class="jobtitle" id="jl_e3da67e5e297f668">\n<a class="turnstileLink" data-tn-element="jobTitle" href="/rc/clk?jk=e3da67e5e297f668&amp;fccid=8077183a161ef0fd" itemprop="title" onclick="setRefineByCookie(['salest']); return rclk(this,jobmap[0],true,0);" onmousedown="return rclk(this,jobmap[0],0);" rel="nofollow" target="_blank" title="Research Scientist">Research <b>Scientist</b></a>\n</h2>\n<span class="company" itemprop="hiringOrganization" itemtype="http://schema.org/Organization">\n<span itemprop="name">\n<a href="/cmp/New-York-University" onmousedown="this.href = appendParamsOnce(this.href, 'from=SERP&amp;campaignid=serp-linkcompanyname&amp;fromjk=e3da67e5e297f668&amp;jcid=fb8afb99610b0769')" target="_blank">\n    NYU School of Medicine</a></span>\n</span>\n\n - <a class="turnstileLink slNoUnderline " data-tn-element="r

In [14]:
def get_jobs(URL):
    response = requests.get(URL)
    soup = BeautifulSoup(response.text, 'lxml')
    jobs = soup.find_all('div', {'class':'row sjlast result', 'class': ' row result',
                                 'class': 'row result'}) # ' row result'
    return jobs

In [15]:
def extract_field(s, tag, class_=None, field_=None):
    try:
        if field_ == None:
            for d in s.find_all(tag):
                return d.text
        else:
            for d in s.find_all(tag, {class_:field_}):
                return d.text.strip()
    except:
        return

def extract_location(s):
    return extract_field(s, 'span', 'class','location')
    
def extract_company(s):
    return extract_field(s, 'span', 'class','company')

def extract_job(s):
    return extract_field(s,'a')

def extract_salary(s):
    return extract_field(s,'span', 'class', 'no-wrap')


In [17]:
get_jobs(URL)
for job in jobs:
    print extract_location(job)
    print extract_company(job)
    print extract_job(job)
    print extract_salary(job)
    print '\n'

New York, NY
NYU School of Medicine
Research Scientist
None


New York, NY 10022 (Midtown area)
McKinsey & Company
Data Analyst - Wave
None


Albany, NY
Research Foundation for Mental Hygiene, Inc.
Research Scientist II
$82,222 a year


New York, NY 10022 (Midtown area)
McKinsey & Company
Analyst, Client Experience
None


New York, NY
BuzzFeed
HRIS Analyst
None


Brooklyn, NY
DEPARTMENT OF FINANCE
Data Analyst/Modeler
$70,286 - $80,829 a year


New York, NY
Balyasny
Quantitative Analyst, PM Development
None


New York, NY 10011 (Chelsea area)
Spotify
Data Science Manager - Premium Analytics
None


New York, NY
Columbia University
Statistical Programmer/Data Manager - General Medicine
None




In [18]:
import pandas as pd
def get_parse_jobs(url):
    df = pd.DataFrame(columns=['location', 'company', 'title', 'salary'])
    jobs = get_jobs(url)
    for job in jobs:
        loc = extract_location(job)
        comp = extract_company(job)
        title = extract_job(job)
        salary = extract_salary(job)
        job_series = pd.Series([loc, comp, title, salary], index=['location', 'company', 'title', 'salary'])
        df = df.append(job_series, ignore_index=True)
    return df

In [19]:
import time
import datetime
url_template = "http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l={}&start={}"
max_results_per_city = 5000 # Set this to a high-value (5000) to generate more results. 
# Crawling more results, will also take much longer. First test your code on a small number of results and then expand.

results = []

for city in set(['New+York', 'Chicago', 'San+Francisco', 'Austin', 'Seattle', 
    'Los+Angeles', 'Philadelphia', 'Atlanta', 'Dallas', 'Pittsburgh', 
    'Portland', 'Phoenix', 'Denver', 'Houston', 'Miami']):
    for start in range(0, max_results_per_city, 10):
        url = url_template
        url = url.replace('{}',city,1)
        url = url.replace('{}',str(start),1)
        #print url
        query = get_parse_jobs(url)
        time.sleep(0.2)
        results.append(query)
    print city, datetime.datetime.now()

Houston 2017-06-18 11:13:26.667428
Phoenix 2017-06-18 11:23:58.004905
Chicago 2017-06-18 11:31:20.338350
San+Francisco 2017-06-18 11:37:37.313445
New+York 2017-06-18 11:44:22.599938
Dallas 2017-06-18 11:51:06.939656
Philadelphia 2017-06-18 11:57:53.769004
Denver 2017-06-18 12:04:31.328390
Los+Angeles 2017-06-18 12:11:07.371096
Pittsburgh 2017-06-18 12:17:22.014272
Miami 2017-06-18 12:23:24.911243
Atlanta 2017-06-18 12:29:30.191094
Seattle 2017-06-18 12:35:42.680322
Austin 2017-06-18 12:42:25.025098
Portland 2017-06-18 12:48:57.328052


In [20]:
r = pd.concat(results, ignore_index=True)

In [21]:
r.head(10)

Unnamed: 0,location,company,title,salary
0,"Houston, TX",Preferred Sands,Research Scientist,
1,"Houston, TX",Invesco,Lead Data Scientist,
2,"Houston, TX 77057",Wood Mackenzie,"Research Analyst, US Upstream Research",
3,United States,Predictive Science,Data Scientist,
4,"Houston, TX 77079",Johnson Service Group Inc.,"Data Analysis, Mining and Modeling Specialist ...",
5,"Houston, TX 77057",Wood Mackenzie,"Research Analyst, US Upstream Research",
6,United States,Predictive Science,Data Scientist,
7,"Houston, TX 77057",Wood Mackenzie,"Research Analyst, US Upstream Research",
8,"Houston, TX",Invesco,Lead Data Scientist,
9,United States,Predictive Science,Data Scientist,


In [22]:
r.to_csv(path_or_buf='ds_jobs.csv', encoding='utf8')

In [None]:
# Using this code just to get the data
# cleaning_code_project_4.ipyb is the following step