In [1]:
# Import neccesary programs
import requests
import bs4
from bs4 import BeautifulSoup
from lxml import html
import numpy as np
import pandas as pd

### Webscraping Functions

In [2]:
# function for getting location from indeed

def extract_location_from_result(loc):
    try:
        for n in loc.findAll(class_='location'):
            return n.text.strip()
    except:
        return 'None'


In [3]:
# function for getting company from indeed

def extract_company_from_result(comp):
    try:
        for n in comp.findAll(class_='company'):
            return n.text.strip()
    except:
        return 'None' 

In [4]:
# function for getting job title from indeed

def extract_jobtitle_from_result(jobtitle):
    try:
        for n in jobtitle.findAll(class_='jobtitle'):
            return n.text.strip()
    except:
        return 'None'   

In [5]:
# function to get salary from indeed

def extract_salary_from_result(sal):
    try:
        for n in sal.findAll('td', class_='snip'):
            return n.nobr
    except:
        return 'None' 

In [6]:
# function to get summary/description from indeed

def extract_summary_from_result(desc):
    try:
        for n in desc.findAll('span', class_='summary'):
            return n.text.strip()
    except:
        return 'None' 

### Select Website 

In [7]:
Location = []
Job_Title = []
Company = []
Salary = []
Description = []

url_template = "http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l={}&start={}"
max_results_per_city = 100
cities = ['New+York', 'Chicago', 'Boston', 'San+Francisco', 'Los+Angeles', 'Austin', 'Atlanta']
results = []
for city in set(cities):
    for start in range(0, max_results_per_city, 10):
        url = url_template.format(city,start)
        ru = requests.get(url)
        indeed = BeautifulSoup(ru.content, "lxml")
        for item in indeed.findAll('div', class_='result'):
            Location.append(extract_location_from_result(item))
            Job_Title.append(extract_jobtitle_from_result(item))
            Company.append(extract_company_from_result(item))
            Salary.append(extract_salary_from_result(item))
            Description.append(extract_summary_from_result(item))

In [8]:
# Create datafram and name columns. Include Cities
df = pd.DataFrame([Location,Job_Title,Company,Salary,Description]).T 
  
df.columns= ['Location','Job_Title','Company','Salary','Description']

In [9]:
df.head()

Unnamed: 0,Location,Job_Title,Company,Salary,Description
0,"Austin, TX 78730",Data Scientist,Cognitive Scale,,This position is specifically for people with ...
1,"Austin, TX 78730",Software Engineer - Machine Learning,Cognitive Scale,,The convergence of data and technology is tran...
2,"Austin, TX",Software Engineer I,CDK Global,,From data scientists to sales and operations e...
3,"Austin, TX",Research Scientist,University of Texas at Austin,"<nobr>$4,584 a month</nobr>",Provide statistical support on research and su...
4,"Austin, TX",Staff Data Scientist,HomeAway,,Experience in processing and analyzing Big dat...


In [10]:
df.dtypes

Location       object
Job_Title      object
Company        object
Salary         object
Description    object
dtype: object

### Cleaning Salary Data

In [11]:
df['Salary'].unique()

array([None, <nobr>$4,584 a month</nobr>,
       <nobr>$6,250 - $10,833 a month</nobr>,
       <nobr>$4,599 - $6,066 a month</nobr>, <nobr>$6,667 a month</nobr>,
       <nobr>$5,541 a month</nobr>, <nobr>$5,400 - $6,500 a month</nobr>,
       <nobr>$4,000 a month</nobr>, <nobr>$3,520 - $4,600 a month</nobr>,
       <nobr>$4,916 a month</nobr>, <nobr>$5,259 - $8,624 a month</nobr>,
       <nobr>$140,000 - $160,000 a year</nobr>,
       <nobr>$23.44 - $29.80 an hour</nobr>,
       <nobr>$5,259 - $6,941 a month</nobr>,
       <nobr>$4,500 - $4,900 a month</nobr>,
       <nobr>$120,000 - $150,000 a year</nobr>,
       <nobr>$120,000 a year</nobr>,
       <nobr>$80,000 - $110,000 a year</nobr>,
       <nobr>$180,000 a year</nobr>, <nobr>$110,000 a year</nobr>,
       <nobr>$104,674 - $151,358 a year</nobr>,
       <nobr>$105,000 a year</nobr>,
       <nobr>$135,000 - $165,000 a year</nobr>,
       <nobr>$110,000 - $155,000 a year</nobr>,
       <nobr>$100,000 - $150,000 a year</nobr>,
     

In [12]:
# Remove rows with no salary
df = df[df.Salary.notnull()]

In [13]:
# Remove excess salary info
df['Salary'] = df['Salary'].apply(lambda x: str(x))
df['Salary'] = df['Salary'].apply(lambda x: x.replace('$',''))
df['Salary'] = df['Salary'].apply(lambda x: x.replace(',',''))
df['Salary'] = df['Salary'].apply(lambda x: x.replace('<nobr>',''))
df['Salary'] = df['Salary'].apply(lambda x: x.replace('</nobr>',''))



In [14]:
# Parse yearly, monthly, hourly salaries
df['hourly_sal'] = df['Salary'].map(lambda x: 1 if 'hour' in x else 0)

df['monthly_sal'] = df['Salary'].map(lambda x: 1 if 'month' in x else 0)

df['yearly_sal'] = df['Salary'].map(lambda x: 1 if 'year' in x else 0)



In [19]:
# Drop monthly and hourly salaries
df_year1 = df[df.monthly_sal != 1]



In [21]:
df_year = df_year1[df.hourly_sal !=1]

  if __name__ == '__main__':


In [23]:
df_year.head()

Unnamed: 0,Location,Job_Title,Company,Salary,Description,hourly_sal,monthly_sal,yearly_sal
72,"Austin, TX",Senior Machine Learning Data Scientist,All-In Analytics,140000 - 160000 a year,"Machine Learning Data Scientist. Forecasting, ...",0,0,1
142,"Austin, TX",Machine Learning Engineer,Volt Workforce Solutions,120000 - 150000 a year,Data mining competition experience preferred (...,0,0,1
232,"Chicago, IL",Data Scientist -Fitness/Wellness Firm,Hirewell,120000 a year,Bring structure to large quantities of data - ...,0,0,1
245,"Chicago, IL",Data Scientist,Workbridge Associates,80000 - 110000 a year,Big data with Hadoop is a plus. A well-respect...,0,0,1
259,"Chicago, IL",Quantitative Research Analyst,GinasTechJobs.com,180000 a year,Define and implement data collection and data ...,0,0,1


In [25]:
df_year = df_year.drop('hourly_sal', axis=1)

In [26]:
df_year = df_year.drop('monthly_sal', axis=1)

In [28]:
df_year = df_year.drop('yearly_sal', axis=1)

In [32]:
df_year['Salary'].unique

<bound method Series.unique of 72      140000 - 160000 a year
142     120000 - 150000 a year
232              120000 a year
245      80000 - 110000 a year
259              180000 a year
273              110000 a year
274     104674 - 151358 a year
276              105000 a year
279     135000 - 165000 a year
297     110000 - 155000 a year
395     100000 - 150000 a year
441               34142 a year
489               60000 a year
492     120000 - 150000 a year
507       47860 - 67712 a year
559      90000 - 135000 a year
668               56000 a year
715              120000 a year
723       50000 - 60000 a year
755      88305 - 114802 a year
758      88305 - 114802 a year
760     104349 - 135656 a year
777               80000 a year
792              150000 a year
832     104349 - 135656 a year
858     150000 - 205000 a year
878              130000 a year
891       30000 - 32000 a year
895      88305 - 114802 a year
1032             150000 a year
1040             220000 a year
Name: Sa

In [35]:
# Change Salary values to floats
df_year['Salary'] = df_year['Salary'].str.replace('a year', '')
df_year['Salary'] = df_year['Salary'].str.replace('-', ',')
df_year['Salary'] = pd.to_numeric(df_year['Salary'], errors = 'coerce')

In [38]:
df_year.dtypes

Location        object
Job_Title       object
Company         object
Salary         float64
Description     object
dtype: object

In [None]:
# Replace salary ranges with means
df_year['Salary'] = df_year['Salary'].apply(lambda s: s.replace(value))

In [39]:
df_year['Salary'].map(np.mean)

72           NaN
142          NaN
232     120000.0
245          NaN
259     180000.0
273     110000.0
274          NaN
276     105000.0
279          NaN
297          NaN
395          NaN
441      34142.0
489      60000.0
492          NaN
507          NaN
559          NaN
668      56000.0
715     120000.0
723          NaN
755          NaN
758          NaN
760          NaN
777      80000.0
792     150000.0
832          NaN
858          NaN
878     130000.0
891          NaN
895          NaN
1032    150000.0
1040    220000.0
Name: Salary, dtype: float64

### Running Predictions

In [None]:

df = pd.get_dummies(df,columns = ['Location','Job_Title','Company'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 77) ## create train-test out of the data given
""" Fit a binary classification predictor."""
logreg = LogisticRegression(solver='liblinear')
C_vals = [0.0001, 0.001, 0.01, 0.1, .15, .25, .275, .33, 0.5, .66, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]
penalties = ['l1','l2']

gs = GridSearchCV(logreg, {'penalty': penalties, 'C': C_vals}, verbose=False, cv=3)
gs.fit(X_train, y_train)

lr = LogisticRegressionCV(Cs=10, cv=5)
cv_model = logreg.fit(X_train, y_train)
cv_pred = cv_model.predict(X_test)

y_pred = lr.predict(X_test)
y_score = lr.decision_function(X_test) # Submit these response, the output of mode

In [None]:
conmat = np.array(confusion_matrix(Y_test, y_pred, labels=[1,0]))
confusion = pd.DataFrame(conmat, index=['over_50k', 'under_50k'],
                            columns=['predicted_over50k','predicted_under50k'])

print(confusion)
print classification_report(Y_test,y_pred)
roc_auc_score(Y_test, y_score)

In [None]:
coef = pd.DataFrame([X.columns.values.tolist(), lr.coef_[0].tolist()], index=['features', 'coef']).T
coef.sort_values(by='coef', ascending=False)