# Hack the job market like a data scientist
## Market
How big is the market for a job-seeker's profile? How does it differ by region?

Only gross numbers are available from BLS
http://www.bls.gov/emp/ep_table_201.htm
- How many jobs in a sector?
Prof Services has 19m out of 150m total, or 15%.
http://www.bls.gov/oes/current/oes_nat.htm
- Computer and math
4m total employment, avg $86k a year

## Problem
How big is the job market for a job-seeker's skills, location and salary expectations?

## Solution
Text analysis and parse phrases through Indeed search. Summarize value of various phrases. Calculate market size from Indeed "Jobs above this amount" grid.


In [12]:
import pandas as pd
import requests
import lxml.html
from lxml.html.clean import Cleaner
import re
#import json
from time import sleep
from random import randint
import csv
import matplotlib.pyplot as plt


import numpy as np
import scipy as sp
import scipy.stats as stats

from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline

In [4]:
file = 'data/resume.txt'

f = open(file, 'r')


text = f.read()


#vect = CountVectorizer(ngram_range=(1, 2))

blob = TextBlob(text)
phrases = blob.noun_phrases

# for phrase in phrases:
#     print(phrase)
    
print(phrases)

['doll', 'kilbourne pl', 'washington dc', 'steven @ clearskies.co 202-386-0190 stevendoll github.com/stevendoll www.linkedin.com/ /in/stevendoll', 'mba', 'it', 'tangible business value', 'agile app development', 'enterprise software deployment', 'cloud implementations', 'data science initiatives', 'custom software', 'security threats', 'real time', 'faa', 'peacekeeping', 'three-person team', 'it', 'emergency management center', 'early-stage', 'washington', 'dc lightsense', 'cleantech', 'semifinalist', 'ir', 'internet', 'iot', 'clear skies', 'dc founder', 'business barriers', 'energy efficiency technologies', 'rigil', 'corporation /', 'senior product', 'washington', 'dc working', 'cio', 'aviation', '$ 9m portfolio', 'it', 'business case', 'contract structure', 'peacekeeping', '/ field', 'it specialist', 'addis ababa', 'ethiopia', 'it specialist', 'ip', 'open-source software', 'new technologies', 'assigned', 'network operations', 'addis ababa', '$ 1.1m', 'freerange studios', 'others /', 

In [5]:
def parse_job_words(keywords):
    
    base_url = 'http://www.indeed.com/jobs?q='
    url_suffix = '&l=Washington%2C+DC'
    headers = {'User-agent':'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'}

    main_results = []

    for term in keywords:
    
        dat = {}

        url = ''.join([base_url, str(term.replace(' ','+')), url_suffix])

        print('Processing term %s %s' % (term.replace('+',' '), url))

        try:

            #random time delay for scraping
            sleep(randint(0,5))

            r = requests.get(url, headers=headers)
            
            doc = lxml.html.fromstring(r.content)

            # request status
            dat['term'] = term.replace('+',' ')
            #dat['status_code'] = r.status_code
            dat['url'] = url
            
            if len(doc.cssselect('#SALARY_rbo ul li')) > 1:

                for i in range(0,len(doc.cssselect('#SALARY_rbo ul li'))):

                    row = doc.cssselect('#SALARY_rbo ul li')[i].text_content().strip()

                    dat['salary'+str(i)] = re.compile('\$(.*)\+').findall(row)[0].replace(',','')
                    dat['positions'+str(i)] = re.compile('\((.*)\)').findall(row)[0].replace(',','')

                main_results.append(dat)

        except:
            print('Error getting detail for term %s %s' % (term.replace('+',' '), url))



    print('Done processing job listings')
    return main_results


if __name__ == '__main__':
    
    #get results
    #keywords = ['machine learning','"ruby+on+rails"','"data+science"','wordpress','"energy+efficiency"','startup','mba','aws','cloud','cloud+mba','bdd','python','python+pandas','scikit-learn','mba+"data+science"','startup+data+science']
    
    keywords = phrases #[0:5]
    
    keywords = parse_job_words(keywords)
    
    df = pd.DataFrame.from_dict(keywords, dtype=None)
    
    print(df.head())

    #create filename
    #filename = location.replace('--', '_').replace('-', '_').lower() + '.csv'
    filename = '2016-06-16-job-words.csv'
    
    #get all keys
    headings = sorted(list(set().union(*(d.keys() for d in keywords))))
    
    #write to csv file
    with open(filename, 'w') as output_file:
        dict_writer = csv.DictWriter(output_file, headings)
        dict_writer.writeheader()
        dict_writer.writerows(keywords)



Processing term doll http://www.indeed.com/jobs?q=doll&l=Washington%2C+DC
Processing term kilbourne pl http://www.indeed.com/jobs?q=kilbourne+pl&l=Washington%2C+DC
Processing term washington dc http://www.indeed.com/jobs?q=washington+dc&l=Washington%2C+DC
Processing term steven @ clearskies.co 202-386-0190 stevendoll github.com/stevendoll www.linkedin.com/ /in/stevendoll http://www.indeed.com/jobs?q=steven+@+clearskies.co+202-386-0190+stevendoll+github.com/stevendoll+www.linkedin.com/+/in/stevendoll&l=Washington%2C+DC
Processing term mba http://www.indeed.com/jobs?q=mba&l=Washington%2C+DC
Processing term it http://www.indeed.com/jobs?q=it&l=Washington%2C+DC
Processing term tangible business value http://www.indeed.com/jobs?q=tangible+business+value&l=Washington%2C+DC
Processing term agile app development http://www.indeed.com/jobs?q=agile+app+development&l=Washington%2C+DC
Processing term enterprise software deployment http://www.indeed.com/jobs?q=enterprise+software+deployment&l=Washi

In [6]:
# clean up df

df.fillna(value=0, inplace=True)

df['positions0'] = df.positions0.astype(int)
df['positions1'] = df.positions1.astype(int)
df['positions2'] = df.positions2.astype(int)
df['positions3'] = df.positions3.astype(int)
df['positions4'] = df.positions4.astype(int)
df['salary0'] = df.salary0.astype(int)
df['salary1'] = df.salary1.astype(int)
df['salary2'] = df.salary2.astype(int)
df['salary3'] = df.salary3.astype(int)
df['salary4'] = df.salary4.astype(int)

In [8]:
for index, row in df.iterrows():
    salaries = []
    
    for i in range(0,5):
        #if i = 4 or positions of next element is 0, then this is the top bin
        if i == 4 or row['positions'+str(i+1)] == 0:
            salaries += row['positions'+str(i)] * [(row['salary'+str(i)] + (row['salary'+str(i)] - row['salary'+str(i-1)])*1.25)]        
        else:    
            salaries += row['positions'+str(i)] * [(row['salary'+str(i)] + row['salary'+str(i+1)])/2]

    salary_series = pd.Series(salaries)
    
    df.loc[index, 'mean'] = salary_series.mean()
    df.loc[index, 'positions'] = salary_series.count()
    df.loc[index, 'std'] = salary_series.std()
    
    df.loc[index, 'above_100k'] = (1-stats.norm.cdf(100000, loc=salary_series.mean(), scale=salary_series.std()))*salary_series.count()
    df.loc[index, 'above_105k'] = (1-stats.norm.cdf(105000, loc=salary_series.mean(), scale=salary_series.std()))*salary_series.count()
    df.loc[index, 'above_110k'] = (1-stats.norm.cdf(110000, loc=salary_series.mean(), scale=salary_series.std()))*salary_series.count()
    df.loc[index, 'above_115k'] = (1-stats.norm.cdf(115000, loc=salary_series.mean(), scale=salary_series.std()))*salary_series.count()
    df.loc[index, 'above_120k'] = (1-stats.norm.cdf(120000, loc=salary_series.mean(), scale=salary_series.std()))*salary_series.count()
    df.loc[index, 'above_125k'] = (1-stats.norm.cdf(125000, loc=salary_series.mean(), scale=salary_series.std()))*salary_series.count()
    df.loc[index, 'above_130k'] = (1-stats.norm.cdf(130000, loc=salary_series.mean(), scale=salary_series.std()))*salary_series.count()
    df.loc[index, 'above_135k'] = (1-stats.norm.cdf(135000, loc=salary_series.mean(), scale=salary_series.std()))*salary_series.count()
    df.loc[index, 'above_140k'] = (1-stats.norm.cdf(140000, loc=salary_series.mean(), scale=salary_series.std()))*salary_series.count()
    df.loc[index, 'above_150k'] = (1-stats.norm.cdf(150000, loc=salary_series.mean(), scale=salary_series.std()))*salary_series.count()
    


In [9]:
# calibrate the top tier skew from outliers
df['top_tier_dev'] = 0
df.loc[df.salary4 == 115000, 'top_tier_dev'] = df['above_115k'] - df['positions4']
df.loc[df.salary4 == 120000, 'top_tier_dev'] = df['above_120k'] - df['positions4']
df.loc[df.salary4 == 125000, 'top_tier_dev'] = df['above_125k'] - df['positions4']
df.loc[df.salary4 == 130000, 'top_tier_dev'] = df['above_130k'] - df['positions4']
df.loc[df.salary4 == 135000, 'top_tier_dev'] = df['above_135k'] - df['positions4']
df.top_tier_dev.sum()

-91.6391663059525

In [19]:
df_summary = df[['term','mean','positions','std','above_110k','above_120k','above_130k']].sort_values('mean', ascending=False)[0:20]
df_summary

Unnamed: 0,term,mean,positions,std,above_110k,above_120k,above_130k
7,cloud implementations,103387.743864,6356.0,11793.385211,1827.410762,505.146683,76.388876
42,open-source software,101594.243792,4430.0,11832.034607,1057.532954,265.371159,36.240753
5,agile app development,101300.0,825.0,9518.090848,148.785111,20.398669,1.058954
59,lead engineer,98239.179954,13170.0,12274.711649,2225.697945,502.165521,63.660561
63,four-person team,96666.666667,3.0,20207.259422,0.764045,0.37232,0.148546
6,enterprise software deployment,96552.479411,5707.0,11779.870136,723.74379,132.798433,12.898213
73,mba,95399.363898,4166.0,17626.001514,848.755614,339.121593,103.402062
2,mba,95399.363898,4166.0,17626.001514,848.755614,339.121593,103.402062
64,$ 5m project,93698.979592,98.0,17996.382769,17.887193,7.050553,2.140422
21,iot,92664.893617,235.0,15516.16167,31.007828,9.178736,1.893994


In [26]:
from math import sqrt
# fig = plt.figure()
# ax = fig.add_subplot(1,1,1)

x = []
y = []
color = []
area = []

for data in df_summary:
    x.append(int(data[3])) # all positions
    y.append(data[2]) # mean salary
    #color.append(data[7]) # highest salaries 
    #area.append(sqrt(data[5])) # positions over $110k
    
    # plotting the first eigth letters of the state's name
    #text(data[3], data[2], data[3],size=11,horizontalalignment='center')

# making the scatter plot
sct = scatter(x, y, c=color, s=area, linewidths=2, edgecolor='w')
sct.set_alpha(0.75)

axis([0,11,200,1280])
xlabel('Murders per 100,000 population')
ylabel('Burglaries per 100,000 population')
show()



ValueError: invalid literal for int() with base 10: 'm'