# Insight Data Science Consulting Project: 80,000 hours - Chapter 5

Note: this is a part of a consulting project with [80,000 hours](https://80000hours.org/).

## Stage 1: Ask a question

My objective is to rank skills (and possibly knowledge, tools & tech) based on how valuable they are. The skills are listed by the US Department of Labor [here](https://www.onetonline.org/find/descriptor/browse/Skills/2.B.1/).

There is no performance measure for this ranking yet, since it is subjective. In the future, however, one could create a poll in which respondents rate the items pairwise.

## Stage 2: Set the environment up and get data

First, set up a directory for the data and link it to this workspace. Download the data into a directory of your choice.

In [1]:
#Set up the environment
import pandas as pd                        #Pandas
import numpy as np                         #Numpy
import pycurl                              #For saving file from url
import os                                  #For checking if a file exists
from pandas.parser import CParserError     #For checking if a file contains a set of values
import matplotlib.pyplot as plt            #For plotting
import matplotlib
%matplotlib inline

#Some machine learning tools
from sklearn.linear_model import LassoCV, LassoLarsCV, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

#For radar graph plot
import numpy as np
import matplotlib
import matplotlib.path as path
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Set up data directory
# Later cells build paths by plain concatenation (DataDir + filename), so the
# trailing slash is required.
DataDir = "C:/Users/Admin/Desktop/Insight/80000hrs/"

In [2]:
from bokeh.models import ColumnDataSource, HoverTool, OpenURL, TapTool, Div
from bokeh.plotting import figure, show, output_file
from bokeh.sampledata.periodic_table import elements
from bokeh.io import vplot

from bokeh.layouts import row, widgetbox, layout
from bokeh.models import Select, TextInput
from bokeh.palettes import Spectral5
from bokeh.plotting import curdoc, figure
from bokeh.sampledata.autompg import autompg

from bokeh.embed import components

#make the output appear in the notebook
from bokeh.plotting import output_notebook
output_notebook()

## Stage 3+4+5+6: Feature exploration, scores, scores on 4 dimensions, scores for occupation.

See previous chapter.

## Stage 7: Validation

There is no direct validation. However, one can internally do a sanity check based on the raw data of outcomes. Another way is to compare the ranking of occupations based on scores of skills with other rankings including Glassdoor, CNNMoney, and USNews.

In [3]:
# Load the per-occupation scores.
filename = "04/score_Occupation.csv"
dOccupationScore = pd.read_csv(DataDir + filename)
# Drop the unnamed first column that read_csv labels 'Unnamed: 0' (presumably a
# saved index). `axis` is passed by keyword because the positional form
# drop(label, 1) is no longer accepted by pandas 2.0+.
dOccupationScore = dOccupationScore.drop('Unnamed: 0', axis=1)

In [4]:
# Load the job-zone table.
filename = "01/dJobZone.csv"
dJobZone = pd.read_csv(DataDir + filename)
# Drop the unnamed first column that read_csv labels 'Unnamed: 0' (presumably a
# saved index). `axis` is passed by keyword because the positional form
# drop(label, 1) is no longer accepted by pandas 2.0+.
dJobZone = dJobZone.drop('Unnamed: 0', axis=1)

In [5]:
# Load the PayScale table and keep only its job-satisfaction measure.
filename = "01/dPayScale.csv"
dJobSatisfaction_PayScale = pd.read_csv(DataDir + filename)
# Drop the unnamed first column (read_csv labels it 'Unnamed: 0') together with
# the two PayScale columns that are not used here. `axis` is passed by keyword
# because the positional form drop(label, 1) is no longer accepted by pandas 2.0+.
dJobSatisfaction_PayScale = dJobSatisfaction_PayScale.drop(
    ['Unnamed: 0', 'Median Pay', 'High Meaning'], axis=1)
# Give the remaining measure a source-specific name
dJobSatisfaction_PayScale = dJobSatisfaction_PayScale.rename(
    columns={'High Satisfaction': 'JobSatisfaction_PayScale'})

In [6]:
# Load the risk-of-automation table.
filename = "01/dRiskOfAutomation.csv"
dRiskOfAutomation = pd.read_csv(DataDir + filename)
# Drop the unnamed first column that read_csv labels 'Unnamed: 0' (presumably a
# saved index). `axis` is passed by keyword because the positional form
# drop(label, 1) is no longer accepted by pandas 2.0+.
dRiskOfAutomation = dRiskOfAutomation.drop('Unnamed: 0', axis=1)
# Rename the key so it matches the 'SOC code no decimal' column used for merging
dRiskOfAutomation = dRiskOfAutomation.rename(
    columns={'SOC code': 'SOC code no decimal'})

In [7]:
# Load the BLS wage table.
filename = "01/dWage_BLS.csv"
dWage_BLS = pd.read_csv(DataDir + filename)
# Drop the unnamed first column that read_csv labels 'Unnamed: 0' (presumably a
# saved index). `axis` is passed by keyword because the positional form
# drop(label, 1) is no longer accepted by pandas 2.0+.
dWage_BLS = dWage_BLS.drop('Unnamed: 0', axis=1)
# Rename the key so it matches the 'SOC code no decimal' column used for merging
dWage_BLS = dWage_BLS.rename(columns={'SOC code': 'SOC code no decimal'})

In [8]:
# Load the GSS job-satisfaction table.
filename = "01/dJobSatisfaction_GSS.csv"
dJobSatisfaction_GSS = pd.read_csv(DataDir + filename)
# Drop the unnamed first column that read_csv labels 'Unnamed: 0' (presumably a
# saved index). `axis` is passed by keyword because the positional form
# drop(label, 1) is no longer accepted by pandas 2.0+.
dJobSatisfaction_GSS = dJobSatisfaction_GSS.drop('Unnamed: 0', axis=1)

#in order to run regression, we need to group by occupations first
#summarize by occupations: average all rows sharing a SOC code so that the
#table has exactly one row per code
dJobSatisfaction_GSS = dJobSatisfaction_GSS.groupby(['SOC code']).mean().reset_index()
# Rename the key so it matches the 'SOC code no decimal' column used for merging
dJobSatisfaction_GSS = dJobSatisfaction_GSS.rename(
    columns={'SOC code': 'SOC code no decimal'})

In [9]:
#Merge with decimal system
# Start from the occupation scores and left-join every table that is keyed on
# the full (decimal) SOC code, so that no scored occupation is lost.
dbig = dOccupationScore
for decimal_table in (dJobZone, dJobSatisfaction_PayScale):
    dbig = pd.merge(dbig, decimal_table, how='left', on='SOC code')

In [10]:
# Derive the merge key for the non-decimal tables: the first 7 characters of
# the SOC code. The vectorized .str accessor gives the same result as slicing
# each string by hand, but leaves a missing code as NaN instead of raising.
dbig['SOC code no decimal'] = dbig['SOC code'].str[:7]

In [11]:
#Merge with non-decimal system
# Left-join every table that is keyed on the truncated SOC code; occupations
# without a match keep their row and get NaN in the added columns.
for non_decimal_table in (dRiskOfAutomation, dWage_BLS, dJobSatisfaction_GSS):
    dbig = pd.merge(dbig, non_decimal_table, how='left', on='SOC code no decimal')

In [12]:
# Keep only the rows that have no missing value in any column
dsmall = dbig.dropna()

In [13]:
# Number of rows left in dsmall
len(dsmall)

269

In [14]:
# List the columns available in dsmall
dsmall.columns

Index([u'index', u'SOC code', u'Income', u'Satisfaction', u'Learnability',
       u'Security', u'score', u'color', u'Career Cluster', u'Career Pathway',
       u'Occupation', u'JobZone', u'JobSatisfaction_PayScale',
       u'SOC code no decimal', u'RiskOfAutomation', u'Wage_BLS',
       u'JobSatisfaction_GSS'],
      dtype='object')

In [15]:
# Internal sanity check: whenever one row is at least as good as a later row on
# every raw outcome, its score should be at least as high as well.
count = 0              # pairs (i, j), i before j, where row i is at least as good on all five outcomes
count_score_match = 0  # ...of those pairs, how many also have score[i] >= score[j]
d = dsmall

# Pull the columns into plain arrays once, so the pairwise comparison does not
# repeat label-based .loc lookups for every pair.
# The check treats a smaller value as better for these two outcomes...
lower_better = d[['JobZone', 'RiskOfAutomation']].values
# ...and a larger value as better for these three.
higher_better = d[['Wage_BLS', 'JobSatisfaction_PayScale', 'JobSatisfaction_GSS']].values
scores = d['score'].values

# NOTE(review): only pairs with i before j are tested for "i is at least as good
# as j"; the reverse direction (j at least as good as i) is never examined.
# Confirm that this one-sided check is intended.
for i in range(len(d) - 1):
    # Compare row i against all later rows at once.
    dominates = (np.all(lower_better[i] <= lower_better[i + 1:], axis=1)
                 & np.all(higher_better[i] >= higher_better[i + 1:], axis=1))
    count += int(dominates.sum())
    count_score_match += int((dominates & (scores[i] >= scores[i + 1:])).sum())

In [16]:
count, count_score_match

(1514, 1514)

This means that we pass our internal sanity check: the strict ordering of all outcomes is consistent with the ordering of the scores produced by my method.

For external validation, we compare it with other rankings. See documentation for more information.

## Stage 8: Visualization on relationships among dimensions of score

In [17]:
# Which set of scores to visualize. The value is used below to build the input
# file name ("03/score_<interest>.csv") and to name the index column.
# Uncomment the alternative to switch.
interest = 'Skill'
#interest = 'Knowledge'

In [18]:
# Load the scores for the chosen `interest` and index the table by item name.
filename = "03/score_" + interest + ".csv"
d4dimensions = pd.read_csv(DataDir + filename)
# Drop the unnamed first column that read_csv labels 'Unnamed: 0' (presumably a
# saved index). `axis` is passed by keyword because the positional form
# drop(label, 1) is no longer accepted by pandas 2.0+.
d4dimensions = d4dimensions.drop('Unnamed: 0', axis=1)
# The item names are stored in a column called 'index'; rename it after the
# chosen interest and use it as the row index.
d4dimensions = d4dimensions.rename(columns={'index': interest})
d4dimensions = d4dimensions.set_index(interest)

In [19]:
# Display the table for inspection
d4dimensions

Unnamed: 0_level_0,Income,Satisfaction,Security,Learnability,score,color,group,group_color
Skill,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Active Learning,2.634182,0.504576,2.069776,-2.341797,0.716684,0.811107,Basic,#FF0000
Active Listening,2.341124,0.42888,2.202526,-2.322009,0.66263,0.762498,Basic,#FF0000
Complex Problem Solving,3.147225,0.470851,2.023038,-2.354152,0.821741,0.905581,Complex Problem Solving,#FFFF00
Coordination,2.074343,0.447771,2.38327,-1.62998,0.818851,0.902983,Social,#FFA500
Critical Thinking,3.247886,0.528635,2.325277,-2.594317,0.87687,0.955158,Basic,#FF0000
Equipment Maintenance,-0.503275,-0.063316,-0.843287,0.680028,-0.182463,0.002531,Technical,#008000
Equipment Selection,-0.473784,-0.064737,-0.767879,0.654828,-0.162893,0.020129,Technical,#008000
Installation,-0.348337,0.0,-0.848958,0.456187,-0.185277,0.0,Technical,#008000
Instructing,1.562355,0.37889,1.707986,-1.577251,0.517995,0.632432,Social,#FFA500
Judgment and Decision Making,3.373525,0.547922,2.346067,-2.560573,0.926736,1.0,System,#800080


In [20]:
# Build one ColumnDataSource per group so that each group can be drawn as its
# own glyph (and legend entry) in the plots below.
# Series.unique() keeps first-appearance order, so the group order is the same
# on every run; iterating over a set gives an arbitrary order.
l = list(d4dimensions['group'].unique())

# Columns copied into every source, in addition to the item name under 'Skill'
# (the key the hover tooltips refer to as "@Skill").
source_columns = ['Income', 'Security', 'Satisfaction', 'Learnability',
                  'group', 'group_color']

source = []
for group_name in l:
    # Select the rows of this group once instead of re-evaluating the mask
    # for every column.
    members = d4dimensions[d4dimensions['group'] == group_name]
    data = {'Skill': members.index}
    for column in source_columns:
        data[column] = members[column]
    source.append(ColumnDataSource(data=data))

In [22]:
# Interactive scatter plot of two score dimensions, one glyph per group, with a
# least-squares line through the origin. The plot is saved as an HTML file.
x_axis = 'Learnability'
y_axis = 'Income'

output_file(DataDir + '05/' + x_axis + "_" + y_axis + ".html", title= x_axis + " vs. " + y_axis)

y = d4dimensions[y_axis]
X = d4dimensions[x_axis]

# scikit-learn expects a 2-D (n_samples, n_features) array. A reshaped ndarray
# is used instead of np.matrix, which NumPy no longer recommends and recent
# scikit-learn releases reject.
# fit_intercept=False forces the fitted line through the origin.
model_ols = LinearRegression(fit_intercept=False).fit(X.values.reshape(-1, 1), y)

# The line is drawn between the smallest and largest observed x values
x_predict = [np.min(X), np.max(X)]
y_predict = model_ols.predict(np.asarray(x_predict).reshape(-1, 1))

# Show the item name when hovering over a point
hover = HoverTool(
        tooltips=[
            ("Skill", "@Skill"),
        ]
    )

p = figure(plot_width=600, plot_height=600, tools=[hover], title= x_axis + " vs. " + y_axis)
p.xaxis.axis_label = x_axis
p.yaxis.axis_label = y_axis

# One circle glyph per group; the color comes from each source's 'group_color' column
for group_index, group_name in enumerate(l):
    p.circle(x_axis, y_axis, size=20, color='group_color', source=source[group_index], legend=group_name)
p.line(x=x_predict, y=y_predict)
show(p)


In [23]:
# Interactive scatter plot of two score dimensions, one glyph per group, with a
# least-squares line through the origin. The plot is saved as an HTML file.
x_axis = 'Income'
y_axis = 'Security'

output_file(DataDir + '05/'+ x_axis + "_" + y_axis + ".html", title= x_axis + " vs. " + y_axis)

y = d4dimensions[y_axis]
X = d4dimensions[x_axis]

# scikit-learn expects a 2-D (n_samples, n_features) array. A reshaped ndarray
# is used instead of np.matrix, which NumPy no longer recommends and recent
# scikit-learn releases reject.
# fit_intercept=False forces the fitted line through the origin.
model_ols = LinearRegression(fit_intercept=False).fit(X.values.reshape(-1, 1), y)

# The line is drawn between the smallest and largest observed x values
x_predict = [np.min(X), np.max(X)]
y_predict = model_ols.predict(np.asarray(x_predict).reshape(-1, 1))

# Show the item name when hovering over a point
hover = HoverTool(
        tooltips=[
            ("Skill", "@Skill"),
        ]
    )

p2 = figure(plot_width=600, plot_height=600, tools=[hover], title= x_axis + " vs. " + y_axis)
# One circle glyph per group; the color comes from each source's 'group_color' column
for group_index, group_name in enumerate(l):
    p2.circle(x_axis, y_axis, size=20, color='group_color', source=source[group_index], legend=group_name)
p2.xaxis.axis_label = x_axis
p2.yaxis.axis_label = y_axis
p2.line(x=x_predict, y=y_predict)
p2.legend.location = "bottom_right"
show(p2)

In [24]:
# Interactive scatter plot of two score dimensions, one glyph per group, with a
# least-squares line through the origin. The plot is saved as an HTML file.
x_axis = 'Income'
y_axis = 'Satisfaction'

output_file(DataDir + '05/' + x_axis + "_" + y_axis + ".html", title= x_axis + " vs. " + y_axis)

y = d4dimensions[y_axis]
X = d4dimensions[x_axis]

# scikit-learn expects a 2-D (n_samples, n_features) array. A reshaped ndarray
# is used instead of np.matrix, which NumPy no longer recommends and recent
# scikit-learn releases reject.
# fit_intercept=False forces the fitted line through the origin.
model_ols = LinearRegression(fit_intercept=False).fit(X.values.reshape(-1, 1), y)

# The line is drawn between the smallest and largest observed x values
x_predict = [np.min(X), np.max(X)]
y_predict = model_ols.predict(np.asarray(x_predict).reshape(-1, 1))

# Show the item name when hovering over a point
hover = HoverTool(
        tooltips=[
            ("Skill", "@Skill"),
        ]
    )

p = figure(plot_width=600, plot_height=600, tools=[hover], title= x_axis + " vs. " + y_axis)
# One circle glyph per group; the color comes from each source's 'group_color' column
for group_index, group_name in enumerate(l):
    p.circle(x_axis, y_axis, size=20, color='group_color', source=source[group_index], legend=group_name)
p.line(x=x_predict, y=y_predict)
p.xaxis.axis_label = x_axis
p.yaxis.axis_label = y_axis
p.legend.location = "bottom_right"
show(p)


We use the HTML files of these interactive plots on the website.