# Levels.fyi Software Engineer Compensation

In [1004]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [1005]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import itertools

import sklearn.linear_model as skl_lm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import Ridge, RidgeCV

import statsmodels.api as sm
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 100)

In [1006]:
comp = pd.read_csv('compensation.csv')
comp.dtypes
comp.head(10)

Date                          object
Company                       object
Title                         object
Level                         object
Standard Level                object
Skill Index                  float64
Location                      object
Total Yearly Compensation      int64
Base Salary (/year)          float64
Stock Grant (/year)          float64
Bonus (/year)                float64
Years of Experience            int64
Years at Company             float64
Tag                           object
Gender                        object
Other Details                 object
dtype: object

Unnamed: 0,Date,Company,Title,Level,Standard Level,Skill Index,Location,Total Yearly Compensation,Base Salary (/year),Stock Grant (/year),Bonus (/year),Years of Experience,Years at Company,Tag,Gender,Other Details
0,2/5/2020,JPMorgan Chase,Software Engineer,Senior Associate,Software Engineer,28.57,"London, EN, United Kingdom",87000,78000.0,,9000.0,3,1.0,Full Stack,Male,
1,2/5/2020,TripAdvisor,Software Engineer,Principal Software Engineer,,,"Oxford, EN, United Kingdom",212000,121000.0,70000.0,21000.0,17,4.0,Distributed Systems (Back-End),Male,
2,2/5/2020,Salesforce,Software Engineer,Lead MTS,Senior Engineer,61.0,"San Francisco, CA",275000,200000.0,45000.0,30000.0,14,1.0,Testing (SDET),Male,
3,2/5/2020,Adobe,Software Engineer,Senior Software Engineer (5),Senior Engineer,44.55,"San Jose, CA",270000,190000.0,60000.0,20000.0,12,1.0,Distributed Systems (Back-End),,
4,2/5/2020,Rokt,Software Engineer,L3,,,"Sydney, NS, Australia",133000,115000.0,10000.0,8000.0,2,1.0,Web Development (Front-End),Male,
5,2/5/2020,Turbonomic,Software Engineer,Senior,,,"New York, NY",180000,155000.0,,25000.0,5,2.0,Web Development (Front-End),Male,Masters
6,2/5/2020,Salesforce,Software Engineer,Senior MTS,Senior Engineer,44.5,"San Francisco, CA",192000,177000.0,,15000.0,12,4.0,Full Stack,Male,Masters
7,2/5/2020,Uber,Software Engineer,Senior Software Engineer,Senior Engineer,50.0,"San Francisco, CA",250000,162000.0,72000.0,16000.0,4,4.0,Distributed Systems (Back-End),Male,
8,2/5/2020,Amazon,Software Engineer,SDE I,Entry Level Engineer,10.87,"Seattle, WA",158000,112000.0,20000.0,26000.0,0,0.0,Distributed Systems (Back-End),Female,Masters
9,2/5/2020,Amazon,Software Engineer,Principal SDE,Staff Engineer,75.0,"Toronto, ON, Canada",343000,165000.0,178000.0,0.0,36,2.0,Distributed Systems (Back-End),Male,


## Data Cleaning

In [1007]:
def comp_tidy_cols(comp):
    # Make column names nicer
    comp.columns = map(str.lower, comp.columns.str.replace(' ', '_'))
    comp.rename(columns={
        'total_yearly_compensation': 'total_comp',
        'base_salary_(/year)': 'salary',
        'stock_grant_(/year)': 'stock',
        'bonus_(/year)': 'bonus',
        'years_of_experience': 'years_experience',
        'years_at_company': 'years_company',
    }, inplace=True)

    # Title: All entries is "Software Engineer" so this isn't particularly useful info 
    # Standard Level & Skill Index: Columns are determined by Levels.fyi; preferable not to confound results based on blackbox algorithm
    # Date: Unlikely to do time-series analysis, so will simply drop date at this stage
    # Level: Unstructured/inconsistent values between companies; organizing/cleaning this will be too difficult and out-of-scope
    comp.drop(columns=['standard_level', 'skill_index', 'date', 'title', 'level'], inplace=True)

    return comp
    
comp = comp_tidy_cols(comp)
comp.head()

Unnamed: 0,company,location,total_comp,salary,stock,bonus,years_experience,years_company,tag,gender,other_details
0,JPMorgan Chase,"London, EN, United Kingdom",87000,78000.0,,9000.0,3,1.0,Full Stack,Male,
1,TripAdvisor,"Oxford, EN, United Kingdom",212000,121000.0,70000.0,21000.0,17,4.0,Distributed Systems (Back-End),Male,
2,Salesforce,"San Francisco, CA",275000,200000.0,45000.0,30000.0,14,1.0,Testing (SDET),Male,
3,Adobe,"San Jose, CA",270000,190000.0,60000.0,20000.0,12,1.0,Distributed Systems (Back-End),,
4,Rokt,"Sydney, NS, Australia",133000,115000.0,10000.0,8000.0,2,1.0,Web Development (Front-End),Male,


In [1008]:
comp.isna().sum(axis=0)

company               0
location              0
total_comp            0
salary               20
stock               163
bonus                29
years_experience      0
years_company         0
tag                   0
gender               96
other_details       335
dtype: int64

In [1009]:
def comp_fill_vals(comp):
    comp.salary.replace(to_replace = np.NaN, value = comp.total_comp, inplace = True)
    comp.stock.replace(to_replace = np.NaN, value = 0.0, inplace = True)
    comp.bonus.replace(to_replace = np.NaN, value = 0.0, inplace = True)
    comp.gender.replace(to_replace = np.NaN, value = 'Unknown', inplace = True)
    comp.other_details.replace(to_replace = np.NaN, value = '', inplace = True)
    return comp

comp = comp_fill_vals(comp)
comp.isna().sum(axis=0)

company             0
location            0
total_comp          0
salary              0
stock               0
bonus               0
years_experience    0
years_company       0
tag                 0
gender              0
other_details       0
dtype: int64

In [1010]:
comp_dummies_base = comp.copy(deep=True)

def comp_create_dummies(comp_base, comp_create):
    # Heuristically select the companies with 5 or more data points
    dummies = pd.get_dummies(comp_base.company)
    print(dummies)
    d_companies = comp_base.company.value_counts()
    d_companies = d_companies[d_companies >= 5]
    comp_create = comp_create.join(dummies[d_companies.index], rsuffix='_company')

    # Heuristically select the locations with 5 or more data points
    dummies = pd.get_dummies(comp_base.location)
    d_locations = comp_base.location.value_counts()
    d_locations = d_locations[d_locations >= 5]
    comp_create = comp_create.join(dummies[d_locations.index], rsuffix='_location')

    # Heuristically select those tags with more than 1 data points
    dummies = pd.get_dummies(comp_base.tag)
    d_tags = comp_base.tag.value_counts()
    d_tags = d_tags[d_tags >= 2]
    comp_create = comp_create.join(dummies[d_tags.index], rsuffix='_tag')

    # Use all gender values
    dummies = pd.get_dummies(comp_base[['gender']])
    d_genders = comp_base.gender.value_counts()
    comp_create = pd.concat([comp_create, dummies], axis=1)

    # Manually create dummies based on other details
    comp_create['masters'] = comp_create.other_details.str.lower().str.contains('master').factorize()[0]
    comp_create['phd'] = comp_create.other_details.str.lower().str.contains('phd').factorize()[0]

    # Remove unnecessary columns after done
    comp_create.drop(columns=['company', 'location', 'total_comp', 'tag', 'gender', 'other_details'], inplace=True)
    comp_create = comp_create.astype('float')
        
    return comp_create

comp = comp_create_dummies(comp_dummies_base, comp)
comp.head(15)

     3M  ABC  ADF  AT&T  AWL  Able  Accenture  AcuityAds  Adobe  Airbnb  ...  \
0     0    0    0     0    0     0          0          0      0       0  ...   
1     0    0    0     0    0     0          0          0      0       0  ...   
2     0    0    0     0    0     0          0          0      0       0  ...   
3     0    0    0     0    0     0          0          0      1       0  ...   
4     0    0    0     0    0     0          0          0      0       0  ...   
..   ..  ...  ...   ...  ...   ...        ...        ...    ...     ...  ...   
494   0    0    0     0    0     0          0          0      0       0  ...   
495   0    0    0     0    0     0          0          0      0       0  ...   
496   0    0    0     0    0     0          1          0      0       0  ...   
497   0    0    0     0    0     0          0          0      0       0  ...   
498   0    0    0     0    0     0          0          0      0       0  ...   

     Whole Foods Market  Xinova  Yahoo 

Unnamed: 0,salary,stock,bonus,years_experience,years_company,Amazon,Microsoft,Google,Facebook,Capital One,...,Android,Site Reliability (SRE),Security,Mobile (iOS + Android),gender_Female,gender_Male,gender_Other,gender_Unknown,masters,phd
0,78000.0,0.0,9000.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,121000.0,70000.0,21000.0,17.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,200000.0,45000.0,30000.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,190000.0,60000.0,20000.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,115000.0,10000.0,8000.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,155000.0,0.0,25000.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6,177000.0,0.0,15000.0,12.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7,162000.0,72000.0,16000.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,112000.0,20000.0,26000.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
9,165000.0,178000.0,0.0,36.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [1002]:
comp_test = pd.read_csv('test.csv')
comp_test = comp_tidy_cols(comp_test)
comp_test = comp_fill_vals(comp_test)
comp_test
comp_test = comp_create_dummies(comp_dummies_base, comp_test)
comp_test

Unnamed: 0,company,location,total_comp,salary,stock,bonus,years_experience,years_company,tag,gender,other_details
0,Bloomberg,"New York, NY",185000,160000,0.0,25000,4,4,Full Stack,Male,
1,Amazon,"New York, NY",185000,160000,0.0,25000,5,0,Full Stack,Male,Masters


Unnamed: 0,salary,stock,bonus,years_experience,years_company,Amazon,Microsoft,Google,Facebook,Capital One,...,Android,Site Reliability (SRE),Security,Mobile (iOS + Android),gender_Female,gender_Male,gender_Other,gender_Unknown,masters,phd
0,160000.0,0.0,25000.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,160000.0,0.0,25000.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,,,,,,,,,,,...,,,,,0.0,1.0,0.0,0.0,-1.0,-1.0
3,,,,,,,,,,,...,,,,,0.0,0.0,0.0,1.0,-1.0,-1.0
4,,,,,,,,,,,...,,,,,0.0,1.0,0.0,0.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,,,,,,,,,,,...,,,,,0.0,0.0,0.0,1.0,-1.0,-1.0
495,,,,,,,,,,,...,,,,,0.0,1.0,0.0,0.0,-1.0,-1.0
496,,,,,,,,,,,...,,,,,0.0,1.0,0.0,0.0,-1.0,-1.0
497,,,,,,,,,,,...,,,,,0.0,1.0,0.0,0.0,-1.0,-1.0


In [929]:
def comp_scale_dollars(comp):
    # Scale down dollar values for clarity, so they appear in the thousands
    comp.salary = comp.salary / 1000
    comp.stock = comp.stock / 1000
    comp.bonus = comp.bonus / 1000
    return comp

comp = comp_scale_dollars(comp)
comp.head(15)

Unnamed: 0,salary,stock,bonus,years_experience,years_company,Amazon,Microsoft,Google,Facebook,Capital One,...,Android,Site Reliability (SRE),Security,Mobile (iOS + Android),gender_Female,gender_Male,gender_Other,gender_Unknown,masters,phd
0,78.0,0.0,9.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,121.0,70.0,21.0,17.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,200.0,45.0,30.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,190.0,60.0,20.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,115.0,10.0,8.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,155.0,0.0,25.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6,177.0,0.0,15.0,12.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7,162.0,72.0,16.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,112.0,20.0,26.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
9,165.0,178.0,0.0,36.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [930]:
# Make sure to scale salary, stock, bonus, years of experience, and years at company
def comp_scale_regr(comp_base, comp_scale, prediction_col):
    original_prediction_col = comp_scale[prediction_col]
    
    base_comp_cols = comp_base.iloc[:,:5]
    base_comp_remaining = comp_base.iloc[:,5:]
    base_comp_scaler = StandardScaler().fit(base_comp_cols)

    scale_comp_cols = comp_scale.iloc[:,:5]
    scale_comp_remaining = comp_scale.iloc[:,5:]
    scaled_comp = pd.DataFrame(
        base_comp_scaler.transform(scale_comp_cols),
        index = scale_comp_cols.index,
        columns = scale_comp_cols.columns)
    scaled_comp = scaled_comp.join(scale_comp_remaining).astype('float')
    
    y = original_prediction_col
    X = scaled_comp.drop(columns=['salary', 'stock', 'bonus'])
    
    return X, y

X, y = comp_scale_regr(comp, comp, 'salary')
X.head(15)

Unnamed: 0,years_experience,years_company,Amazon,Microsoft,Google,Facebook,Capital One,Wayfair,SAP,Uber,...,Android,Site Reliability (SRE),Security,Mobile (iOS + Android),gender_Female,gender_Male,gender_Other,gender_Unknown,masters,phd
0,-0.630068,-0.534276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.969845,0.476314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.412721,-0.534276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.041305,-0.534276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.815776,-0.534276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,-0.258652,-0.197413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6,1.041305,0.476314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7,-0.44436,0.476314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,-1.187192,-0.87114,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
9,5.498299,-0.197413,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## Ridge Regression

In [931]:
alphas = 10**np.linspace(10,-2,100)
alphas

array([1.00000000e+10, 7.56463328e+09, 5.72236766e+09, 4.32876128e+09,
       3.27454916e+09, 2.47707636e+09, 1.87381742e+09, 1.41747416e+09,
       1.07226722e+09, 8.11130831e+08, 6.13590727e+08, 4.64158883e+08,
       3.51119173e+08, 2.65608778e+08, 2.00923300e+08, 1.51991108e+08,
       1.14975700e+08, 8.69749003e+07, 6.57933225e+07, 4.97702356e+07,
       3.76493581e+07, 2.84803587e+07, 2.15443469e+07, 1.62975083e+07,
       1.23284674e+07, 9.32603347e+06, 7.05480231e+06, 5.33669923e+06,
       4.03701726e+06, 3.05385551e+06, 2.31012970e+06, 1.74752840e+06,
       1.32194115e+06, 1.00000000e+06, 7.56463328e+05, 5.72236766e+05,
       4.32876128e+05, 3.27454916e+05, 2.47707636e+05, 1.87381742e+05,
       1.41747416e+05, 1.07226722e+05, 8.11130831e+04, 6.13590727e+04,
       4.64158883e+04, 3.51119173e+04, 2.65608778e+04, 2.00923300e+04,
       1.51991108e+04, 1.14975700e+04, 8.69749003e+03, 6.57933225e+03,
       4.97702356e+03, 3.76493581e+03, 2.84803587e+03, 2.15443469e+03,
      

In [932]:
kf_10 = KFold(n_splits=10, shuffle=True, random_state=0)

ridgecv = skl_lm.RidgeCV(alphas=alphas, cv=kf_10, scoring='neg_mean_squared_error')
ridgecv.fit(X, y)
ridgecv.alpha_

optimal_ridge = skl_lm.Ridge()
optimal_ridge.set_params(alpha=ridgecv.alpha_)
optimal_ridge.fit(X, y)
mean_squared_error(y, optimal_ridge.predict(X))

RidgeCV(alphas=array([1.00000000e+10, 7.56463328e+09, 5.72236766e+09, 4.32876128e+09,
       3.27454916e+09, 2.47707636e+09, 1.87381742e+09, 1.41747416e+09,
       1.07226722e+09, 8.11130831e+08, 6.13590727e+08, 4.64158883e+08,
       3.51119173e+08, 2.65608778e+08, 2.00923300e+08, 1.51991108e+08,
       1.14975700e+08, 8.69749003e+07, 6.57933225e+07, 4.97702356e+07,
       3.76493581e+07, 2.84803587e+0...
       6.57933225e-01, 4.97702356e-01, 3.76493581e-01, 2.84803587e-01,
       2.15443469e-01, 1.62975083e-01, 1.23284674e-01, 9.32603347e-02,
       7.05480231e-02, 5.33669923e-02, 4.03701726e-02, 3.05385551e-02,
       2.31012970e-02, 1.74752840e-02, 1.32194115e-02, 1.00000000e-02]),
        cv=KFold(n_splits=10, random_state=0, shuffle=True), fit_intercept=True,
        gcv_mode=None, normalize=False, scoring='neg_mean_squared_error',
        store_cv_values=False)

2.656087782946684

Ridge(alpha=2.656087782946684, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

Ridge(alpha=2.656087782946684, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

1597.4823388831362

In [933]:
optimal_ridge.fit(X, y)
pd.Series(optimal_ridge.coef_.flatten(), index=X.columns)

Ridge(alpha=2.656087782946684, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

years_experience                  19.350392
years_company                     -2.520118
Amazon                             5.246988
Microsoft                          4.064716
Google                            12.802047
Facebook                          34.571873
Capital One                        8.152820
Wayfair                           -3.254125
SAP                              -13.478983
Uber                              18.276240
Salesforce                        20.873254
Bloomberg                         20.214277
eBay                               8.074744
IBM                               -3.088971
Cisco                             -2.777379
Oracle                             4.867940
Epic Systems                      12.468754
Yahoo                             13.210225
Yelp                             -17.296366
Apple                             16.913280
JPMorgan Chase                    -5.147603
Adobe                             -0.461608
Goldman Sachs                   

## Lasso

In [934]:
kf_10 = KFold(n_splits=10, shuffle=True, random_state=0)

lassocv = skl_lm.LassoCV(alphas = alphas, cv=kf_10, max_iter=10000)
lassocv.fit(X, y)
lassocv.alpha_

optimal_lasso = skl_lm.Lasso()
optimal_lasso.set_params(alpha = lassocv.alpha_)
optimal_lasso.fit(X, y)
mean_squared_error(y, optimal_lasso.predict(X))

LassoCV(alphas=array([1.00000000e+10, 7.56463328e+09, 5.72236766e+09, 4.32876128e+09,
       3.27454916e+09, 2.47707636e+09, 1.87381742e+09, 1.41747416e+09,
       1.07226722e+09, 8.11130831e+08, 6.13590727e+08, 4.64158883e+08,
       3.51119173e+08, 2.65608778e+08, 2.00923300e+08, 1.51991108e+08,
       1.14975700e+08, 8.69749003e+07, 6.57933225e+07, 4.97702356e+07,
       3.76493581e+07, 2.84803587e+0...
       7.05480231e-02, 5.33669923e-02, 4.03701726e-02, 3.05385551e-02,
       2.31012970e-02, 1.74752840e-02, 1.32194115e-02, 1.00000000e-02]),
        copy_X=True, cv=KFold(n_splits=10, random_state=0, shuffle=True),
        eps=0.001, fit_intercept=True, max_iter=10000, n_alphas=100,
        n_jobs=None, normalize=False, positive=False, precompute='auto',
        random_state=None, selection='cyclic', tol=0.0001, verbose=False)

0.0932603346883218

Lasso(alpha=0.0932603346883218, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

Lasso(alpha=0.0932603346883218, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

1592.8220339844108

In [935]:
optimal_lasso.fit(X, y)
pd.Series(optimal_lasso.coef_.flatten(), index=X.columns)

Lasso(alpha=0.0932603346883218, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

years_experience                  19.194101
years_company                     -2.283048
Amazon                             3.224186
Microsoft                          1.382627
Google                             9.895421
Facebook                          35.633213
Capital One                        9.558577
Wayfair                           -0.000000
SAP                              -13.397239
Uber                              17.341016
Salesforce                        20.460964
Bloomberg                         20.686612
eBay                               2.166192
IBM                               -0.000000
Cisco                             -0.000000
Oracle                             1.533539
Epic Systems                      12.444529
Yahoo                              9.239756
Yelp                             -20.025461
Apple                             13.883558
JPMorgan Chase                    -0.000000
Adobe                             -0.000000
Goldman Sachs                   

## Principal Components Regression

In [945]:
pca = PCA()
X_reduced = pca.fit_transform(X)

# We will use OLS to fit our M-dimensional data, derived from PCA
regr = skl_lm.LinearRegression()

# 10-fold CV, with shuffle
kf_10 = KFold(n_splits=10, shuffle=True, random_state=0)
mse = []

for i in np.arange(1, 66):
    score = -1*cross_val_score(regr, X_reduced[:,:i], y, cv=kf_10, scoring='neg_mean_squared_error').mean()
    mse.append(score)
    
mse_per_component = pd.Series(np.array(mse).flatten(), index = np.arange(1,66))
print(mse_per_component)

1     2.597939e+03
2     2.407501e+03
3     2.383608e+03
4     2.338259e+03
5     2.343758e+03
6     2.354691e+03
7     2.360462e+03
8     2.355938e+03
9     2.327808e+03
10    2.269041e+03
11    2.265997e+03
12    2.252414e+03
13    2.201857e+03
14    2.216762e+03
15    2.219001e+03
16    2.226711e+03
17    2.244052e+03
18    2.228964e+03
19    2.036689e+03
20    2.039100e+03
21    2.048328e+03
22    2.041340e+03
23    2.046587e+03
24    2.054440e+03
25    2.055631e+03
26    2.049421e+03
27    2.053574e+03
28    2.028854e+03
29    2.028414e+03
30    2.029876e+03
31    2.025804e+03
32    2.026142e+03
33    2.026725e+03
34    2.024425e+03
35    2.030452e+03
36    2.016499e+03
37    2.016892e+03
38    1.996968e+03
39    1.963054e+03
40    1.954481e+03
41    1.952665e+03
42    1.956690e+03
43    1.958662e+03
44    1.946023e+03
45    1.931974e+03
46    1.918459e+03
47    1.916606e+03
48    1.913391e+03
49    1.916474e+03
50    1.922657e+03
51    1.924920e+03
52    1.935904e+03
53    1.9401

In [946]:
np.amin(mse_per_component)

1913.3914309074426

In [947]:
# Min is for 48
X_reduced = pca.fit_transform(X)[:, :48]

# train OLS model on PCA-reduced training data
pca_regr = skl_lm.LinearRegression()
pca_regr.fit(X_reduced[:,:48], y)

# Make predictions with our model and calculate MSE
mean_squared_error(y, pca_regr.predict(X_reduced))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

1638.2210014939155

## PLS

In [953]:
# We will use OLS to fit our M-dimensional data, derived from PLS
regr = skl_lm.LinearRegression()

# 10-fold CV, with shuffle
kf_10 = KFold(n_splits=10, shuffle=True, random_state=0)
mse = []

for i in np.arange(1, 66):
    pls=PLSRegression(n_components=i)
    score = -1*cross_val_score(pls, X.iloc[:,:i], y, cv=kf_10, scoring='neg_mean_squared_error').mean()
    mse.append(score)
    
mse_per_component = pd.Series(np.array(mse).flatten(), index = np.arange(1, 66))
print(mse_per_component)



1     2.442051e+03
2     2.424024e+03
3     2.435005e+03
4     2.439172e+03
5     2.425847e+03
6     2.354259e+03
7     2.353475e+03
8     2.351663e+03
9     2.334610e+03
10    2.282625e+03
11    2.266851e+03
12    2.242682e+03
13    2.223340e+03
14    2.226917e+03
15    2.227991e+03
16    2.236735e+03
17    2.242932e+03
18    2.221784e+03
19    2.217018e+03
20    2.221189e+03
21    2.232205e+03
22    2.265653e+03
23    2.320312e+03
24    2.513343e+03
25    2.364147e+03
26    2.171038e+03
27    2.156297e+03
28    2.122215e+03
29    1.999359e+03
30    1.965434e+03
31    1.964894e+03
32    1.922037e+03
33    1.928343e+03
34    1.927353e+03
35    1.902043e+03
36    1.897704e+03
37    1.896359e+03
38    1.897653e+03
39    1.879714e+03
40    2.173882e+03
41    1.880216e+03
42    1.880111e+03
43    6.791896e+04
44    1.873866e+03
45    1.869363e+03
46    1.869394e+03
47    1.860576e+03
48    1.868063e+03
49    1.861136e+03
50    1.872877e+03
51    2.240835e+05
52    2.075066e+05
53    2.4283



In [954]:
np.amin(mse_per_component)

1860.5764278680963

In [955]:
# Min is 47 components
pls = PLSRegression(n_components=47, scale=False)
pls.fit(X, y)
pls.coef_

mean_squared_error(y, pls.predict(X))

PLSRegression(copy=True, max_iter=500, n_components=47, scale=False, tol=1e-06)

array([[ 19.45868489],
       [ -2.50343663],
       [  5.89579581],
       [  2.71916792],
       [ 13.04707064],
       [ 37.3032465 ],
       [ 14.25942464],
       [-11.27271821],
       [-16.62893484],
       [ 21.83704613],
       [ 28.67094713],
       [ 26.1434208 ],
       [  5.75962651],
       [ -3.52147734],
       [ -4.19494973],
       [  9.37452728],
       [ 18.69151703],
       [ 16.68619151],
       [-27.98758028],
       [ 24.01860422],
       [ -0.72727903],
       [ -1.6766882 ],
       [ -6.42041962],
       [ -1.94303592],
       [ 29.88808565],
       [ 64.42697045],
       [ 38.38238319],
       [ 54.72591372],
       [ 52.56326802],
       [ 33.56310796],
       [ 24.25879222],
       [ 42.11499451],
       [  1.15357185],
       [ -4.86103042],
       [ 39.97487016],
       [ 36.62252933],
       [-22.23554875],
       [ 12.13918165],
       [ 35.75911475],
       [ 25.40631977],
       [  0.85746956],
       [ 22.23798138],
       [-24.49893496],
       [ 37

1563.0115469520958

## Suggested Negotiation Values

In [962]:
comp_test = pd.read_csv('test.csv')
comp_test = comp_tidy_cols(comp_test)
comp_test = comp_fill_vals(comp_test)
comp_test, _ = comp_create_dummies(comp, comp_test)
comp_test = comp_scale_dollars(comp_test)
X_test, y_test = comp_scale_regr(comp, comp_test, 'salary')

print(dummies['d_tags'])
X_test = X_test.join(dummies['d_tags'].index)
X_test

#y_test_ridge = optimal_ridge.predict(X_test)
#y_test_lasso = optimal_lasso.predict(X_test)
#y_test_pca = pca_regr.predict(X_test)
#y_test_pls = pls.predict(X_test)

#print([y_test_ridge, y_test_lasso, y_test_pca, y_test_pls])

Full Stack                        131
Distributed Systems (Back-End)    125
API Development (Back-End)         76
Web Development (Front-End)        36
ML / AI                            32
iOS                                16
DevOps                             16
Networking                         11
Testing (SDET)                     11
Android                             9
Site Reliability (SRE)              8
Security                            8
Mobile (iOS + Android)              6
Name: tag, dtype: int64


AttributeError: 'builtin_function_or_method' object has no attribute 'is_unique'