- Find out what factors most directly impact salaries (title, location, department, etc.). In this case, we do not want to predict mean salary as would be done in a regression. Your boss believes that salary is better represented in categories than as a continuous value.
- Test, validate, and describe your models. What factors predict salary category? How do your models perform?
- Prepare a presentation for your Principal detailing your analysis.

In [9]:
# Import relevant libraries
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from spacy import English
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import linear_model
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from IPython.core.display import HTML
from __future__ import unicode_literals

%matplotlib inline

In [10]:
# Filesystem path of the PhantomJS executable passed to webdriver.PhantomJS.
# NOTE(review): the leading '//' looks like a typo for '/Applications/phantomjs' - confirm.
path_to_phantom = '//Applications/phantomjs'

# Identify: Problem Statement / Aim

Our aim is to determine the factors that result in higher salaries for a data scientist.

# Acquire: Import Data Using Requests + BeautifulSoup

- Collect data on data science salary trends from a job listings aggregator for your analysis.
- Select and parse data from at least ~1000 postings for jobs, potentially from multiple location searches.

In [11]:
# Create the Selenium PhantomJS driver, using the executable at path_to_phantom.
driver = webdriver.PhantomJS(executable_path=path_to_phantom)
# Use a fixed 1024x768 browser window for every page load.
driver.set_window_size(1024,768)

In [12]:
# Indeed.com search URL layout, e.g.
#   http://www.indeed.com/jobs?q=data+scientist&l=New+York&start=10&pp=
#   base     - http://www.indeed.com/jobs?q=data+scientist&l=
#   location - city name, words separated by '+'
#   paging   - '&start=10&pp=' appended from page 2 onwards
#   full url - base + location + paging

# Fixed prefix of every search URL; the location is appended right after 'l='.
url_top = 'http://www.indeed.com/jobs?q=data+scientist&l='
# Cities to search, already written with '+' in place of spaces for use in the URL.
location = ['New+York', 'Seattle', 'San+Francisco', 'Boston']

In [13]:
# in the search results, each result is wrapped in a div class=' row result'

# for all results we want to get back
# company, span class='company'
# jobtitle, link class='jobtitle'
# location, span class='location'
# summary, span class='summary'
# salary, td class="snip", nobr with a $
# we need to account for missing data

# write a function that can retrieve job title, company, location, summary, and salary for each result
# if any of the first 4 is missing, use '' for that field; if the salary is missing or cannot be parsed, return 'skip'
def get_details(each_item):
    """Extract the fields of one job posting from a search-result element.

    Parameters
    ----------
    each_item : element exposing ``find(tag, class_=...)`` whose matches have
        a ``.text`` attribute (here: a BeautifulSoup tag for one result div).

    Returns
    -------
    list or str
        ``[job_title, company, location, summary, salary]`` when a salary is
        present and parseable; the four text fields fall back to ``''`` when
        their tag is missing. Returns the string ``'skip'`` when the posting
        has no salary or the salary cannot be converted to a float.

    Notes
    -----
    Only the first whitespace-separated token of the salary text is parsed
    (after dropping ``$`` and ``,``), so for a range the lower figure is used.
    """
    def _field_text(tag, css_class, strip_chars=None):
        # A missing tag is expected for sparse postings: report it as ''.
        node = each_item.find(tag, class_=css_class)
        if node is None:
            return ''
        return node.text.strip(strip_chars)

    # Salary decides whether the posting is usable at all, so check it first.
    try:
        salary_text = each_item.find('td', class_='snip').find('nobr').text
        first_token = salary_text.split()[0]
        salary = float(first_token.strip('$').replace(',', ''))
    except (AttributeError, IndexError, ValueError):
        # AttributeError: td/nobr tag absent; IndexError: empty salary text;
        # ValueError: first token is not a number.
        return 'skip'

    # The title keeps inner spaces and only sheds surrounding newlines.
    job_title = _field_text('h2', 'jobtitle', '\n')
    company = _field_text('span', 'company')
    location = _field_text('span', 'location')
    summary = _field_text('span', 'summary')

    return [job_title, company, location, summary, salary]
    
    
    

In [14]:
# Total number of salaried postings to collect across all locations.
# NOTE(review): the brief above asks for ~1000 postings; 10 looks like a trial value - confirm.
entries_required = 10
# Even share per location; floor division keeps this an integer on Python 2 and 3 alike.
entries_per_loc = entries_required // len(location)

In [16]:
# Scrape salaried postings for every search location into one flat list of
# [job_title, company, location, summary, salary] rows.
data = []
# Safety cap: without it a location that never yields enough salaried
# postings would keep requesting pages forever. Raise it if more are needed.
max_pages_per_loc = 100
for x in location:
    print(x)
    page = 1
    one_loc = []
    # NOTE(review): `<=` keeps fetching until one_loc holds MORE than
    # entries_per_loc rows - confirm this overshoot is intended.
    while len(one_loc) <= entries_per_loc and page <= max_pages_per_loc:
        full_url = url_top + x
        if page != 1:
            # Page 2 onwards is addressed by result offset in steps of 10.
            full_url += '&start=' + str((page - 1) * 10) + '&pp='
        page += 1
        driver.get(full_url)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        for item in soup.findAll('div', class_='row result'):
            # Parse each result once and reuse the outcome.
            details = get_details(item)
            if details != 'skip':
                one_loc.append(details)
    print(one_loc)
    data.extend(one_loc)
print('done')

New+York


URLError: <urlopen error [Errno 61] Connection refused>

In [None]:
# Build a DataFrame from the scraped rows; the column order matches the list returned by get_details.
df = pd.DataFrame(data, columns=['job_title', 'company', 'location','summary','salary'])

In [None]:
# Preview the first rows of the scraped data.
df.head()

In [None]:
# Summary statistics for every column; include='all' covers the text columns as well as salary.
df.describe(include='all')

# Parse: Clean & Organize Data

# Model: Perform Logistic Regression

# Evaluate: Logistic Regression
# Bonus: Countvectorizer, Regularization Parameters

# Present: Write a report for your audience addressing findings & recommendations