# Levels.fyi Software Engineer Compensation

In [3201]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3202]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import itertools

import sklearn.linear_model as skl_lm
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import Ridge, RidgeCV

import statsmodels.api as sm
import matplotlib.pyplot as plt

import seaborn as sns

from patsy import dmatrix

pd.set_option('display.max_rows', 100)
plt.rcParams["figure.figsize"] = (20,10)

In [3203]:
comp = pd.read_csv('compensation.csv')
comp.dtypes
comp.head(10)

Date                          object
Company                       object
Title                         object
Level                         object
Standard Level                object
Skill Index                  float64
Location                      object
Total Yearly Compensation      int64
Base Salary (/year)          float64
Stock Grant (/year)          float64
Bonus (/year)                float64
Years of Experience            int64
Years at Company             float64
Tag                           object
Gender                        object
Other Details                 object
dtype: object

Unnamed: 0,Date,Company,Title,Level,Standard Level,Skill Index,Location,Total Yearly Compensation,Base Salary (/year),Stock Grant (/year),Bonus (/year),Years of Experience,Years at Company,Tag,Gender,Other Details
0,2/5/2020,JPMorgan Chase,Software Engineer,Senior Associate,Software Engineer,28.57,"London, EN, United Kingdom",87000,78000.0,,9000.0,3,1.0,Full Stack,Male,
1,2/5/2020,TripAdvisor,Software Engineer,Principal Software Engineer,,,"Oxford, EN, United Kingdom",212000,121000.0,70000.0,21000.0,17,4.0,Distributed Systems (Back-End),Male,
2,2/5/2020,Salesforce,Software Engineer,Lead MTS,Senior Engineer,61.0,"San Francisco, CA",275000,200000.0,45000.0,30000.0,14,1.0,Testing (SDET),Male,
3,2/5/2020,Adobe,Software Engineer,Senior Software Engineer (5),Senior Engineer,44.55,"San Jose, CA",270000,190000.0,60000.0,20000.0,12,1.0,Distributed Systems (Back-End),,
4,2/5/2020,Rokt,Software Engineer,L3,,,"Sydney, NS, Australia",133000,115000.0,10000.0,8000.0,2,1.0,Web Development (Front-End),Male,
5,2/5/2020,Turbonomic,Software Engineer,Senior,,,"New York, NY",180000,155000.0,,25000.0,5,2.0,Web Development (Front-End),Male,Masters
6,2/5/2020,Salesforce,Software Engineer,Senior MTS,Senior Engineer,44.5,"San Francisco, CA",192000,177000.0,,15000.0,12,4.0,Full Stack,Male,Masters
7,2/5/2020,Uber,Software Engineer,Senior Software Engineer,Senior Engineer,50.0,"San Francisco, CA",250000,162000.0,72000.0,16000.0,4,4.0,Distributed Systems (Back-End),Male,
8,2/5/2020,Amazon,Software Engineer,SDE I,Entry Level Engineer,10.87,"Seattle, WA",158000,112000.0,20000.0,26000.0,0,0.0,Distributed Systems (Back-End),Female,Masters
9,2/5/2020,Amazon,Software Engineer,Principal SDE,Staff Engineer,75.0,"Toronto, ON, Canada",343000,165000.0,178000.0,0.0,36,2.0,Distributed Systems (Back-End),Male,


In [3204]:
X_train, X_test, y_train, y_test = train_test_split(comp, comp['Date'], test_size=0.2, random_state=0)
comp = X_train

## Data Cleaning

In [3205]:
def comp_tidy_cols(comp):
    # Make column names nicer
    comp.columns = map(str.lower, comp.columns.str.replace(' ', '_'))
    comp.rename(columns={
        'total_yearly_compensation': 'total_comp',
        'base_salary_(/year)': 'salary',
        'stock_grant_(/year)': 'stock',
        'bonus_(/year)': 'bonus',
        'years_of_experience': 'years_experience',
        'years_at_company': 'years_company',
    }, inplace=True)

    # Title: All entries is "Software Engineer" so this isn't particularly useful info 
    # Standard Level & Skill Index: Columns are determined by Levels.fyi; preferable not to confound results based on blackbox algorithm
    # Date: Unlikely to do time-series analysis, so will simply drop date at this stage
    comp.drop(columns=['standard_level', 'skill_index', 'date', 'title'], inplace=True, errors='ignore')

    return comp
    
comp = comp_tidy_cols(comp)
comp.head()

Unnamed: 0,company,level,location,total_comp,salary,stock,bonus,years_experience,years_company,tag,gender,other_details
107,Capital One,Associate Software Eng,"Washington, DC",108000,99000.0,,9000.0,1,0.0,Distributed Systems (Back-End),Male,
428,Google,L3,"Mountain View, CA",244000,122000.0,88000.0,34000.0,3,3.0,API Development (Back-End),Female,
71,Oracle,IC-3,"Redwood City, CA",185000,145000.0,40000.0,0.0,12,2.0,API Development (Back-End),,
473,IBM,Associate Engineer,"Washington, DC",100000,,,,1,1.0,Full Stack,,
6,Salesforce,Senior MTS,"San Francisco, CA",192000,177000.0,,15000.0,12,4.0,Full Stack,Male,Masters


In [3206]:
comp.isna().sum(axis=0)

company               0
level                 0
location              0
total_comp            0
salary               19
stock               127
bonus                26
years_experience      0
years_company         0
tag                   0
gender               81
other_details       272
dtype: int64

In [3207]:
def comp_fill_vals(comp):
    if 'salary' in comp:
        comp.salary.replace(to_replace = np.NaN, value = comp.total_comp, inplace = True)
    else:
        comp.salary = 0        
        
    if 'stock' in comp:
        comp.stock.replace(to_replace = np.NaN, value = 0.0, inplace = True)
    else:
        comp.stock = 0
        
    if 'bonus' in comp:
        comp.bonus.replace(to_replace = np.NaN, value = 0.0, inplace = True)
    else:
        comp.bonus = 0
        
    if 'gender' in comp:
        comp.gender.replace(to_replace = np.NaN, value = 'Unknown', inplace = True)
    else:
        comp.gender = 'Unknown'
        
    if 'other_details' in comp:
        comp.other_details.replace(to_replace = np.NaN, value = '', inplace = True)
    else:
        comp.other_details = ''

    return comp

comp = comp_fill_vals(comp)
comp.isna().sum(axis=0)

company             0
level               0
location            0
total_comp          0
salary              0
stock               0
bonus               0
years_experience    0
years_company       0
tag                 0
gender              0
other_details       0
dtype: int64

In [3208]:
def comp_scale_dollars(comp):
    # Scale down dollar values for clarity, so they appear in the thousands
    comp_total_comp = comp.total_comp / 1000
    comp_salary = comp.salary / 1000
    comp_stock = comp.stock / 1000
    comp_bonus = comp.bonus / 1000
    
    comp.total_comp = comp.total_comp / 1000
    comp.salary = comp.salary / 1000
    comp.stock = comp.stock / 1000
    comp.bonus = comp.bonus / 1000
    
    return comp, comp_total_comp, comp_salary, comp_stock, comp_bonus

comp, y_total_comp, y_salary, y_stock, y_bonus = comp_scale_dollars(comp)
comp_clean = comp.copy(deep=True)
comp

Unnamed: 0,company,level,location,total_comp,salary,stock,bonus,years_experience,years_company,tag,gender,other_details
107,Capital One,Associate Software Eng,"Washington, DC",108.0,99.0,0.0,9.0,1,0.0,Distributed Systems (Back-End),Male,
428,Google,L3,"Mountain View, CA",244.0,122.0,88.0,34.0,3,3.0,API Development (Back-End),Female,
71,Oracle,IC-3,"Redwood City, CA",185.0,145.0,40.0,0.0,12,2.0,API Development (Back-End),Unknown,
473,IBM,Associate Engineer,"Washington, DC",100.0,100.0,0.0,0.0,1,1.0,Full Stack,Unknown,
6,Salesforce,Senior MTS,"San Francisco, CA",192.0,177.0,0.0,15.0,12,4.0,Full Stack,Male,Masters
...,...,...,...,...,...,...,...,...,...,...,...,...
323,Google,L3,"Chicago, IL",164.0,110.0,37.0,17.0,5,1.0,Distributed Systems (Back-End),Male,
192,Uber,Senior Software Engineer II,"San Francisco, CA",595.0,250.0,310.0,35.0,8,2.0,Distributed Systems (Back-End),Male,Competing offers
117,IHS Markit,L1.5,"Boulder, CO",90.0,85.0,0.0,5.0,3,0.0,Full Stack,Female,
47,CGI,L3,"Washington, DC",110.0,100.0,10.0,0.0,3,3.0,Full Stack,Unknown,Masters


In [3209]:
#sns.pairplot(comp[['total_comp', 'salary', 'stock', 'bonus', 'years_experience', 'years_company']])

In [3210]:
def comp_create_dummies(comp, stringent=True, interaction=True):
    if interaction:
        # Note! This create dummies for only those combinations that exist in the data, not all combinations!
        comp["company_level"] = comp["company"] + "_" + comp["level"]
        dummies_company_levels = dmatrix("0 + company_level", comp, return_type='dataframe')
        comp = pd.concat([comp, dummies_company_levels], axis=1)

        comp["company_location"] = comp["company"] + "_" + comp["location"]
        dummies_company_location = dmatrix("company_location", comp, return_type='dataframe')
        comp = pd.concat([comp, dummies_company_location], axis=1)
    else:
        # Heuristically select the companies with 5 or more data points
        dummies_companies = pd.get_dummies(comp.company)
        filtered_companies = comp.company.value_counts()
        if stringent:
            filtered_companies = filtered_companies[filtered_companies >= 5]
        comp = comp.join(dummies_companies[filtered_companies.index], rsuffix='_company')
        
        # Heuristically select the level with 5 or more data points
        dummies_levels = pd.get_dummies(comp.level)
        filtered_levels = comp.level.value_counts()
        if stringent:
            filtered_levels = filtered_levels[filtered_levels >= 5]
        comp = comp.join(dummies_levels[filtered_levels.index], rsuffix='_level')

    # Heuristically select the locations with 5 or more data points
    dummies_locations = pd.get_dummies(comp.location)
    filtered_locations = comp.location.value_counts()
    if stringent:
        filtered_locations = filtered_locations[filtered_locations >= 5]
    comp = comp.join(dummies_locations[filtered_locations.index], rsuffix='_location')
        
    # Heuristically select those tags with more than 1 data points
    dummies_tags = pd.get_dummies(comp.tag)
    filtered_tags = comp.tag.value_counts()
    if stringent:
        filtered_tags = filtered_tags[filtered_tags >= 2]
    comp = comp.join(dummies_tags[filtered_tags.index], rsuffix='_tag')

    # Use all gender values
    dummies_genders = pd.get_dummies(comp[['gender']])
    comp = pd.concat([comp, dummies_genders], axis=1)

    # Manually create dummies based on other details
    comp['masters'] = comp.other_details.str.lower().str.contains('master').factorize()[0]
    comp['phd'] = comp.other_details.str.lower().str.contains('phd').factorize()[0]
        
    return comp

comp = comp_create_dummies(comp)
comp.head(15)

Unnamed: 0,company,level,location,total_comp,salary,stock,bonus,years_experience,years_company,tag,...,Android,Security,Site Reliability (SRE),Mobile (iOS + Android),gender_Female,gender_Male,gender_Other,gender_Unknown,masters,phd
107,Capital One,Associate Software Eng,"Washington, DC",108.0,99.0,0.0,9.0,1,0.0,Distributed Systems (Back-End),...,0,0,0,0,0,1,0,0,0,0
428,Google,L3,"Mountain View, CA",244.0,122.0,88.0,34.0,3,3.0,API Development (Back-End),...,0,0,0,0,1,0,0,0,0,0
71,Oracle,IC-3,"Redwood City, CA",185.0,145.0,40.0,0.0,12,2.0,API Development (Back-End),...,0,0,0,0,0,0,0,1,0,0
473,IBM,Associate Engineer,"Washington, DC",100.0,100.0,0.0,0.0,1,1.0,Full Stack,...,0,0,0,0,0,0,0,1,0,0
6,Salesforce,Senior MTS,"San Francisco, CA",192.0,177.0,0.0,15.0,12,4.0,Full Stack,...,0,0,0,0,0,1,0,0,1,0
411,Microsoft,64,"Bellevue, WA",250.0,160.0,60.0,30.0,10,8.0,Distributed Systems (Back-End),...,0,0,0,0,0,1,0,0,0,0
113,IBM,Associate Engineer,"Rochester, MN",86.0,86.0,0.0,0.0,2,2.0,DevOps,...,0,0,0,0,0,1,0,0,0,0
236,Oracle,IC-3,"Brno, JM, Czech Republic",37.0,30.0,7.0,0.0,4,2.0,Web Development (Front-End),...,0,0,0,0,0,1,0,0,1,0
299,Apple,ICT3,"Cupertino, CA",169.0,130.0,26.0,13.0,3,0.0,Testing (SDET),...,0,0,0,0,1,0,0,0,1,0
155,Google,L5,"Kirkland, WA",397.0,182.0,180.0,34.0,17,10.0,Distributed Systems (Back-End),...,0,0,0,0,0,1,0,0,1,0


In [3211]:
def comp_drop_cols(comp):
    comp.drop(
        columns=[
            'salary', 'stock', 'bonus', 'total_comp',
            'company', 'level', 'location', 'company_level', 'company_location', 'tag', 'gender', 'other_details'],
        inplace=True,
        errors='ignore')
    comp = comp.astype('float')
    return comp

comp = comp_drop_cols(comp)

In [3212]:
base_comp_cols = comp.iloc[:,:2]
base_comp_remaining = comp.iloc[:,2:]
base_comp_scaler = StandardScaler().fit(base_comp_cols)
# base_comp_scaler = PowerTransformer(method='yeo-johnson').fit(base_comp_cols)
# base_comp_scaler = RobustScaler().fit(base_comp_cols)

# Make sure to scale years of experience and years at company
def comp_scale_regr(comp):    
    scale_comp_cols = comp.iloc[:,:2]
    scale_comp_remaining = comp.iloc[:,2:]
    scaled_comp = pd.DataFrame(
        base_comp_scaler.transform(scale_comp_cols),
        index = scale_comp_cols.index,
        columns = scale_comp_cols.columns)
    scaled_comp = scaled_comp.join(scale_comp_remaining).astype('float')
    return scaled_comp

comp = comp_scale_regr(comp)
comp

Unnamed: 0,years_experience,years_company,company_level[3M_T3],company_level[ABC_3],company_level[ADF_62],company_level[AT&T_Professional Software-Engineer],company_level[AWL_Principal],company_level[Accenture_Software Engineer Analyst],company_level[AcuityAds_Senior Data Engineer],company_level[Adobe_SDEP40],...,Android,Security,Site Reliability (SRE),Mobile (iOS + Android),gender_Female,gender_Male,gender_Other,gender_Unknown,masters,phd
107,-1.019051,-0.864185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
428,-0.649581,0.143020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
71,1.013032,-0.192715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
473,-1.019051,-0.528450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,1.013032,0.478754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,-0.280112,-0.528450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
192,0.274093,-0.192715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
117,-0.649581,-0.864185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
47,-0.649581,0.143020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


## Model Selection

In [3213]:
X = comp.copy(deep=True)
num_features = X.columns.size

alphas = 10**np.linspace(10,-2,100)
kf_10 = KFold(n_splits=10, shuffle=True, random_state=0)

def ridge(X, y):
    ridgecv = skl_lm.RidgeCV(alphas=alphas, cv=kf_10, scoring='neg_mean_squared_error')
    ridgecv.fit(X, y)

    optimal_ridge = skl_lm.Ridge()
    optimal_ridge.set_params(alpha=ridgecv.alpha_)
    optimal_ridge.fit(X, y)

    coefs = pd.Series(optimal_ridge.coef_.flatten(), index=X.columns)
    mse = mean_squared_error(y, optimal_ridge.predict(X))
    
    return {
        'model_type': 'ridge',
        'model': optimal_ridge,
        'intercept': optimal_ridge.intercept_,
        'coefs': coefs,
        'mse': mse }

def lasso(X, y):
    lassocv = skl_lm.LassoCV(alphas = alphas, cv=kf_10, max_iter=10000)
    lassocv.fit(X, y)

    optimal_lasso = skl_lm.Lasso()
    optimal_lasso.set_params(alpha = lassocv.alpha_)
    optimal_lasso.fit(X, y)

    coefs = pd.Series(optimal_lasso.coef_.flatten(), index=X.columns)
    mse = mean_squared_error(y, optimal_lasso.predict(X))
    
    return {
        'model_type': 'lasso',
        'model': optimal_lasso,
        'intercept': optimal_lasso.intercept_,
        'coefs': coefs,
        'mse': mse }

def pca(X, y):
    pca = PCA()
    X_reduced = pca.fit_transform(X)
    regr = skl_lm.LinearRegression()
    mse = []
    for i in np.arange(1, num_features):
        score = -1*cross_val_score(regr, X_reduced[:,:i], y, cv=kf_10, scoring='neg_mean_squared_error').mean()
        mse.append(score)
    mse_per_component = pd.Series(np.array(mse).flatten(), index = np.arange(1, num_features))
    min_component = np.argmin(mse_per_component) + 1

    X_reduced = pca.fit_transform(X)[:, :min_component]
    pca_regr = skl_lm.LinearRegression()
    pca_regr.fit(X_reduced[:,:min_component], y)

    coefs = pca_regr.coef_.flatten()
    mse = mean_squared_error(y, pca_regr.predict(X_reduced))
    
    return {
        'model_type': 'pca',
        'model': pca_regr,
        'intercept': pca_regr.intercept_,
        'coefs': coefs,
        'mse': mse,
        'pca': pca,
        'min_component': min_component }

def pls(X, y):
    regr = skl_lm.LinearRegression()
    num_components = 7 # num_features
    mse = []
    for i in np.arange(1, num_components):
        pls=PLSRegression(n_components=i)
        score = -1*cross_val_score(pls, X.iloc[:,:i], y, cv=kf_10, scoring='neg_mean_squared_error').mean()
        mse.append(score)
    mse_per_component = pd.Series(np.array(mse).flatten(), index = np.arange(1, num_components))
    min_component = np.argmin(mse_per_component) + 1

    pls = PLSRegression(n_components=min_component, scale=False)
    pls.fit(X, y)

    coefs = pd.Series(pls.coef_.flatten(), index=X.columns)
    mse = mean_squared_error(y, pls.predict(X))
    
    return {
        'model_type': 'pls',
        'model': pls,
        'intercept': None,
        'coefs': coefs,
        'mse': mse,
        'min_component': min_component }

def get_model_type_min_mse(models):
    min_model_type = None
    for index, (model_type, model) in enumerate(models.items()):            
        if index == 0 or model['mse'] < models[min_model_type]['mse']:
            min_model_type = model_type
    return min_model_type

def perform_model_selection(X, y, identifier):
    models = {
        'ridge': ridge(X, y),
        'lasso': lasso(X, y),
        #'pca': pca(X, y),
        #'pls': pls(X, y),
    }
    
    print(f'Performing model selection on {identifier}.')
    for model_type in models:
        print(f'{model_type} MSE:\t', models[model_type]['mse'])

    best_model_type = get_model_type_min_mse(models)
    best_model = models[best_model_type]
    print(f'Best model is {best_model_type}.')
    
    print('\nIntercept:\t\t', best_model['intercept'], '\n')
    print(best_model['coefs'])
    
    return best_model, models

## Total Compensation Model Selection

In [3214]:
best_model_total_comp, all_models_total_comp = perform_model_selection(X, y_total_comp, 'total compensation')

Performing model selection on total compensation.
ridge MSE:	 652.7968205067943
lasso MSE:	 1613.95218745297
Best model is ridge.

Intercept:		 173.85855412157733 

years_experience         30.764866
years_company            -2.827456
company_level[3M_T3]    -57.654256
company_level[ABC_3]    -28.513928
company_level[ADF_62]   -17.474146
                           ...    
gender_Male               5.689852
gender_Other             -6.746884
gender_Unknown           18.320921
masters                   5.827426
phd                      52.627395
Length: 527, dtype: float64


## Salary Model Selection

In [3215]:
best_model_salary, all_models_salary = perform_model_selection(X, y_salary, 'salary')

Performing model selection on salary.
ridge MSE:	 499.3258404286523
lasso MSE:	 1239.127650205991
Best model is ridge.

Intercept:		 116.36906941264328 

years_experience         16.025639
years_company            -1.381835
company_level[3M_T3]    -12.303265
company_level[ABC_3]    -10.435672
company_level[ADF_62]   -13.739046
                           ...    
gender_Male              -0.565188
gender_Other             -0.572582
gender_Unknown            7.538337
masters                  -0.958298
phd                      16.739108
Length: 527, dtype: float64


## Stock Model Selection

In [3216]:
best_model_stock, all_models_stock = perform_model_selection(X, y_stock, 'stock')

Performing model selection on stock.
ridge MSE:	 2755.9268937280654
lasso MSE:	 1320.689322612665
Best model is lasso.

Intercept:		 24.439550808770775 

years_experience         12.329272
years_company            -4.165111
company_level[3M_T3]     -0.000000
company_level[ABC_3]      0.000000
company_level[ADF_62]     0.000000
                           ...    
gender_Male               0.000000
gender_Other             -0.000000
gender_Unknown            3.227384
masters                   1.060850
phd                      11.877584
Length: 527, dtype: float64


## Bonus Model Selection

In [3217]:
best_model_bonus, all_models_bonus = perform_model_selection(X, y_bonus, 'bonus')

Performing model selection on bonus.
ridge MSE:	 123.01490036775395
lasso MSE:	 152.75909514658306
Best model is ridge.

Intercept:		 13.6903709365881 

years_experience         3.290821
years_company           -0.866581
company_level[3M_T3]    -0.747009
company_level[ABC_3]     2.071348
company_level[ADF_62]   -0.055450
                           ...   
gender_Male             -0.223481
gender_Other            -0.357281
gender_Unknown           1.711786
masters                  1.570382
phd                      0.819400
Length: 527, dtype: float64


## Suggested Negotiation Values

In [3220]:
comp_test = pd.read_csv('test.csv')
comp_test = comp_tidy_cols(comp_test)
comp_test = comp_fill_vals(comp_test)
comp_test_orig = comp_test.copy(deep=True)

# No need to scale salary/stock/bonus, as that's what we're predicting
comp_test = comp_create_dummies(comp_test, stringent=False)
comp_test = comp_drop_cols(comp_test)
comp_test = comp_scale_regr(comp_test)

X_test = comp.align(comp_test, join='left', axis=1)[1] # Ensure alignment with training data
X_test = X_test.fillna(0.0)
X_test

Unnamed: 0,years_experience,years_company,company_level[3M_T3],company_level[ABC_3],company_level[ADF_62],company_level[AT&T_Professional Software-Engineer],company_level[AWL_Principal],company_level[Accenture_Software Engineer Analyst],company_level[AcuityAds_Senior Data Engineer],company_level[Adobe_SDEP40],...,Android,Security,Site Reliability (SRE),Mobile (iOS + Android),gender_Female,gender_Male,gender_Other,gender_Unknown,masters,phd
90,-1.203785,-0.52845,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
254,-1.203785,-0.864185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
283,-1.019051,-0.52845,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
444,-1.019051,-0.192715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
474,-0.464846,-0.864185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
15,-0.834316,-0.192715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
316,-0.834316,-0.192715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
488,-0.280112,-0.830611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
159,0.089358,1.485959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
153,2.49091,-0.192715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [3222]:
# Total compensation predictions
y_pred_total_comp = None
if best_model_total_comp['model_type'] == 'pca':
    pca_test = best_model_total_comp['pca']
    min_component = best_model_total_comp['min_component']
    X_test_pca = pca_test.transform(X_test)[:,:min_component]
    y_pred_total_comp = best_model_total_comp['model'].predict(X_test_pca).flatten()
else:
    y_pred_total_comp = best_model_total_comp['model'].predict(X_test).flatten()
y_pred_total_comp[y_pred_total_comp < 0] = 0

# Salary predictions
y_pred_salary = None
if best_model_salary['model_type'] == 'pca':
    pca_test = best_model_salary['pca']
    min_component = best_model_salary['min_component']
    X_test_pca = pca_test.transform(X_test)[:,:min_component]
    y_pred_salary = best_model_salary['model'].predict(X_test_pca).flatten()
else:
    y_pred_salary = best_model_salary['model'].predict(X_test).flatten()
y_pred_salary[y_pred_salary < 0] = 0

# Stock predictions
y_pred_stock = None
if best_model_stock['model_type'] == 'pca':
    pca_test = best_model_stock['pca']
    min_component = best_model_stock['min_component']
    X_test_pca = pca_test.transform(X_test)[:,:min_component]
    y_pred_stock = best_model_stock['model'].predict(X_test_pca).flatten()
else:
    y_pred_stock = best_model_stock['model'].predict(X_test).flatten()
y_pred_stock[y_pred_stock < 0] = 0

# Bonus predictions
y_pred_bonus = None
if best_model_bonus['model_type'] == 'pca':
    pca_test = best_model_bonus['pca']
    min_component = best_model_bonus['min_component']
    X_test_pca = pca_test.transform(X_test)[:,:min_component]
    y_pred_bonus = best_model_bonus['model'].predict(X_test_pca).flatten()
else:
    y_pred_bonus = best_model_bonus['model'].predict(X_test).flatten()
y_pred_bonus[y_pred_bonus < 0] = 0

# All prediction results
y_pred = pd.DataFrame(data={
    'predicted_salary': y_pred_salary,
    'predicted_stock': y_pred_stock,
    'predicted_bonus': y_pred_bonus,
    'sum_total_comp': y_pred_salary + y_pred_stock + y_pred_bonus,
    'predicted_total_comp': y_pred_total_comp,
})

predictions = pd.concat([y_test, comp_test_orig], axis=1)
predictions

ridge
19490.964679718894
lasso
18616.520574387127


In [None]:
coefs = best_model_salary['coefs']

coefs_increase = best_model_salary['coefs'][best_model_salary['coefs'] > 20]
coefs_increase = coefs_increase.sort_values()
x = plt.bar(coefs_increase.index, coefs_increase)
x = plt.xticks(rotation=90)

In [None]:
coefs_decrease = best_model_salary['coefs'][best_model_salary['coefs'] < -20]
coefs_decrease = coefs_decrease.sort_values()
x = plt.bar(coefs_decrease.index, coefs_decrease)
x = plt.xticks(rotation=90)
x = plt.gca().invert_yaxis()

In [None]:
coefs.filter(like='Google').sort_values().sort_index()
coefs.filter(like='Facebook').sort_values().sort_index()

In [None]:
coefs.filter(like='San Fran').sort_values()
coefs.filter(like='NY').sort_values()