## Compensation Data Analysis - Untransformed Predictions

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
# Later cells rely on this to display several results from a single cell.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn.linear_model as skl_lm
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import Ridge, RidgeCV
from sklearn.cross_decomposition import PLSRegression

import statsmodels.api as sm

import seaborn as sns

# Notebook-wide display defaults: up to 100 DataFrame rows, large bold-font
# figures, and the ggplot style sheet (applied after the rcParams overrides).
pd.options.display.max_rows = 100
plt.rcParams.update({
    "figure.figsize": (20, 10),
    "font.weight": "bold",
})
plt.style.use('ggplot')

In [3]:
# Load the prepared compensation table, using the entry_id column as the row index.
comp = pd.read_csv(
    'data/prepped_comp_data_interactions.csv',
    index_col='entry_id',
)
comp

Unnamed: 0_level_0,log_total_comp,log_salary,log_stock,log_bonus,scaled_years_experience,scaled_years_company,company[AT&T],company[Accenture],company[Adobe],company[Airbnb],...,company_location[WeWork_NYC Area],company_location[WeWork_SF Bay Area],company_location[Workday_SF Bay Area],company_location[Yahoo_SF Bay Area],"company_location[Yandex_Moscow, MC, Russia]",company_location[Yelp_SF Bay Area],company_location[Zillow_SF Bay Area],company_location[Zillow_Seattle Area],company_location[eBay_SF Bay Area],company_location[eBay_Seattle Area]
entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,6.359576,5.192962,5.959718,2.197336,0.712125,-0.123292,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,5.153297,4.787500,-6.907755,3.970311,0.896446,-0.467863,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,5.247029,4.700489,4.382039,-6.907755,-0.578121,0.221278,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,5.049862,4.905282,2.079567,2.565026,-0.393800,0.565848,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,5.303310,5.056252,3.258135,3.332240,1.080767,1.254988,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30133,4.744941,4.744941,-6.907755,-6.907755,-0.025158,-0.467863,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30139,5.913506,5.231114,5.010642,3.496538,0.712125,1.254988,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30140,5.356591,5.176155,-6.907755,3.555377,-0.762442,-0.812433,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30142,5.220361,4.584978,4.043069,3.401231,-0.393800,-0.123292,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
RANDOM_STATE = 721
# Log-scale target columns; everything else in `comp` is treated as a feature.
LOG_TARGET_COLUMNS = ['log_total_comp', 'log_salary', 'log_stock', 'log_bonus']

# Hold out 10% of the rows. The frame is split once (targets still attached)
# instead of being passed twice with the duplicates discarded into `_`, which
# would rebind the name IPython uses for the last cell output.
X_train, X_test = train_test_split(comp, test_size=0.1, random_state=RANDOM_STATE)

# Targets are stored as logs; exponentiate to get the untransformed values
# that the models in this notebook are fitted on.
y_total_comp = np.exp(X_train.log_total_comp)
y_salary = np.exp(X_train.log_salary)
y_stock = np.exp(X_train.log_stock)
y_bonus = np.exp(X_train.log_bonus)
X = X_train.drop(columns=LOG_TARGET_COLUMNS)

y_test_total_comp = np.exp(X_test.log_total_comp)
y_test_salary = np.exp(X_test.log_salary)
y_test_stock = np.exp(X_test.log_stock)
y_test_bonus = np.exp(X_test.log_bonus)
# From here on X_test holds features only (target columns removed).
X_test = X_test.drop(columns=LOG_TARGET_COLUMNS)

X
X_test

Unnamed: 0_level_0,scaled_years_experience,scaled_years_company,company[AT&T],company[Accenture],company[Adobe],company[Airbnb],company[Amazon],company[American Express],company[Andela],company[Apple],...,company_location[WeWork_NYC Area],company_location[WeWork_SF Bay Area],company_location[Workday_SF Bay Area],company_location[Yahoo_SF Bay Area],"company_location[Yandex_Moscow, MC, Russia]",company_location[Yelp_SF Bay Area],company_location[Zillow_SF Bay Area],company_location[Zillow_Seattle Area],company_location[eBay_SF Bay Area],company_location[eBay_Seattle Area]
entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1042,0.343483,0.221278,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6488,-0.393800,0.565848,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21860,-0.762442,-0.123292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7900,-0.209479,0.910418,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4392,-0.393800,0.565848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28116,-0.762442,-0.123292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13550,-0.025158,-0.467863,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8130,0.527804,-0.123292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13210,-0.762442,-0.812433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,scaled_years_experience,scaled_years_company,company[AT&T],company[Accenture],company[Adobe],company[Airbnb],company[Amazon],company[American Express],company[Andela],company[Apple],...,company_location[WeWork_NYC Area],company_location[WeWork_SF Bay Area],company_location[Workday_SF Bay Area],company_location[Yahoo_SF Bay Area],"company_location[Yandex_Moscow, MC, Russia]",company_location[Yelp_SF Bay Area],company_location[Zillow_SF Bay Area],company_location[Zillow_Seattle Area],company_location[eBay_SF Bay Area],company_location[eBay_Seattle Area]
entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23789,-0.578121,-0.812433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11706,0.712125,1.254988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
310,-0.578121,-0.467863,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
815,-1.131083,-0.812433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5982,0.343483,-0.812433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25211,-0.578121,-0.812433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25710,-0.578121,-0.467863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13958,0.896446,-0.812433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4446,0.343483,1.254988,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model Selection

In [5]:
# Number of feature columns in the training design matrix.
num_features = X.shape[1]

# Penalty grid shared by the ridge and lasso searches: 100 values from 1e10
# down to 1e-2, evenly spaced in the exponent.
alphas = np.power(10.0, np.linspace(start=10, stop=-2, num=100))
# Shared 5-fold splitter (shuffled, fixed seed) so every model sees the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

def ridge(X, y):
    """Fit a ridge regression whose penalty is chosen by cross-validation.

    The penalty is searched over the module-level ``alphas`` grid with the
    module-level ``kf`` folds (negative MSE scoring); a plain ``Ridge`` is
    then refitted on all of ``X``/``y`` at the selected value.

    Parameters:
        X: feature DataFrame (its columns label the returned coefficients).
        y: target values aligned with the rows of ``X``.

    Returns:
        dict with keys 'alpha', 'model_type' ('ridge'), 'model', 'intercept',
        'coefs' (Series indexed by ``X.columns``) and 'rmse'. The RMSE is
        computed on the same data the model was fitted on.
    """
    alpha_search = skl_lm.RidgeCV(alphas=alphas, cv=kf, scoring='neg_mean_squared_error')
    alpha_search.fit(X, y)
    best_alpha = alpha_search.alpha_

    # Refit at the chosen penalty using every row of X.
    final_model = skl_lm.Ridge()
    final_model.set_params(alpha=best_alpha)
    final_model.fit(X, y)

    fitted_values = final_model.predict(X)
    training_rmse = np.sqrt(mean_squared_error(y, fitted_values))
    coef_by_feature = pd.Series(final_model.coef_.flatten(), index=X.columns)

    return dict(
        alpha=best_alpha,
        model_type='ridge',
        model=final_model,
        intercept=final_model.intercept_,
        coefs=coef_by_feature,
        rmse=training_rmse,
    )

def lasso(X, y):
    """Fit a lasso regression whose penalty is chosen by cross-validation.

    The penalty is searched over the module-level ``alphas`` grid with the
    module-level ``kf`` folds; a plain ``Lasso`` is then refitted on all of
    ``X``/``y`` at the selected value.

    Parameters:
        X: feature DataFrame (its columns label the returned coefficients).
        y: target values aligned with the rows of ``X``.

    Returns:
        dict with keys 'alpha', 'model_type' ('lasso'), 'model', 'intercept',
        'coefs' (Series indexed by ``X.columns``) and 'rmse'. The RMSE is
        computed on the same data the model was fitted on.
    """
    # One iteration budget for both the search and the refit. Previously the
    # refit fell back to Lasso's default max_iter, so it could stop before
    # reaching the solution the CV search had evaluated.
    max_iter = 10000

    lassocv = skl_lm.LassoCV(alphas=alphas, cv=kf, max_iter=max_iter)
    lassocv.fit(X, y)

    optimal_lasso = skl_lm.Lasso(alpha=lassocv.alpha_, max_iter=max_iter)
    optimal_lasso.fit(X, y)

    coefs = pd.Series(optimal_lasso.coef_.flatten(), index=X.columns)
    rmse = np.sqrt(mean_squared_error(y, optimal_lasso.predict(X)))

    return {
        'alpha': lassocv.alpha_,
        'model_type': 'lasso',
        'model': optimal_lasso,
        'intercept': optimal_lasso.intercept_,
        'coefs': coefs,
        'rmse': rmse }

def pls(X, y):
    """Fit a PLS regression whose component count is chosen by cross-validation.

    Candidate component counts 1..6 are scored by mean cross-validated MSE
    over the module-level ``kf`` folds. Every candidate is evaluated on the
    full feature matrix with ``scale=False``, i.e. the same configuration as
    the final model. Previously each candidate saw only the first ``i``
    columns of ``X`` and used default scaling, so the selected count did not
    describe the model that was actually fitted.

    Parameters:
        X: feature DataFrame (its columns label the returned coefficients).
        y: target values aligned with the rows of ``X``.

    Returns:
        dict with keys 'alpha' (always None; present so callers can read
        ``result['alpha']`` for any model type), 'model_type' ('pls'),
        'model', 'intercept' (None; not extracted from the fitted model),
        'coefs' (Series indexed by ``X.columns``), 'rmse' (computed on the
        data the model was fitted on) and 'min_component'.
    """
    num_components = 7  # exclusive upper bound: candidates are 1..6
    candidate_counts = np.arange(1, num_components)

    mse = []
    for n_comp in candidate_counts:
        candidate = PLSRegression(n_components=int(n_comp), scale=False)
        fold_scores = cross_val_score(candidate, X, y, cv=kf, scoring='neg_mean_squared_error')
        # Scores are negated MSEs; flip the sign back before comparing.
        mse.append(-1 * fold_scores.mean())
    mse_per_component = pd.Series(mse, index=candidate_counts)
    # Index labels are the component counts, so idxmin is the best count itself.
    min_component = int(mse_per_component.idxmin())

    best_pls = PLSRegression(n_components=min_component, scale=False)
    best_pls.fit(X, y)

    coefs = pd.Series(best_pls.coef_.flatten(), index=X.columns)
    rmse = np.sqrt(mean_squared_error(y, best_pls.predict(X)))

    return {
        'alpha': None,
        'model_type': 'pls',
        'model': best_pls,
        'intercept': None,
        'coefs': coefs,
        'rmse': rmse,
        'min_component': min_component }

def get_model_type_min_mse(models):
    """Return the key of the entry in ``models`` with the smallest 'rmse'.

    Parameters:
        models: mapping of model-type name -> result dict containing 'rmse'.

    Returns:
        The name of the lowest-RMSE entry. Ties go to the entry that comes
        first in the mapping's iteration order. Returns None when ``models``
        is empty.
    """
    return min(
        models,
        key=lambda model_type: models[model_type]['rmse'],
        default=None,
    )

def perform_model_selection(X, y, identifier):
    """Fit ridge, lasso and PLS on (X, y), report each RMSE, and pick the lowest.

    Parameters:
        X: feature DataFrame passed unchanged to each fitting function.
        y: target values aligned with the rows of ``X``.
        identifier: label for the target, used only in the printed report.

    Returns:
        (best_model, models): the result dict with the lowest 'rmse', and the
        dict of all result dicts keyed by model type.

    Side effects:
        Prints the per-model RMSEs, the winning model type, its intercept and
        its coefficient Series.
    """
    # All candidates are fitted before anything is printed; insertion order
    # (ridge, lasso, pls) also fixes the order of the report below.
    fitters = (('ridge', ridge), ('lasso', lasso), ('pls', pls))
    models = {name: fit(X, y) for name, fit in fitters}

    print(f'Performing model selection on {identifier}.')
    for model_type, result in models.items():
        print(f'{model_type} RMSE:\t', result['rmse'])

    best_model_type = get_model_type_min_mse(models)
    best_model = models[best_model_type]
    print(f'Best model is {best_model_type}.')

    print('\nIntercept:\t\t', best_model['intercept'], '\n')
    print(best_model['coefs'])

    return best_model, models

## Salary Model Selection

In [6]:
# Select the best salary model; targets are on the untransformed scale.
best_model_salary, all_models_salary = perform_model_selection(
    X=X, y=y_salary, identifier='salary')

Performing model selection on salary.
ridge RMSE:	 22.499139992557215
lasso RMSE:	 23.378233143858502
pls RMSE:	 27.767391665618558
Best model is ridge.

Intercept:		 94.06305270179655 

scaled_years_experience                  14.257096
scaled_years_company                      0.199930
company[AT&T]                           -18.046669
company[Accenture]                      -31.057716
company[Adobe]                           -3.155002
                                           ...    
company_location[Yelp_SF Bay Area]       -0.240539
company_location[Zillow_SF Bay Area]      5.136890
company_location[Zillow_Seattle Area]     8.009172
company_location[eBay_SF Bay Area]      -10.492502
company_location[eBay_Seattle Area]      -2.987132
Length: 615, dtype: float64


## Stock Model Selection

In [None]:
# Select the best stock model; targets are on the untransformed scale.
best_model_stock, all_models_stock = perform_model_selection(
    X=X, y=y_stock, identifier='stock')

## Bonus Model Selection

In [None]:
# Select the best bonus model; targets are on the untransformed scale.
best_model_bonus, all_models_bonus = perform_model_selection(
    X=X, y=y_bonus, identifier='bonus')

## Suggested Negotiation Values

In [None]:
def get_prediction_range(best_model, X_test, y_test):
    """Predict on X_test and bracket each prediction by +/- the model's stored RMSE.

    Parameters:
        best_model: result dict providing 'model' (fitted estimator with
            ``predict``) and 'rmse' (used as the half-width of the range).
        X_test: feature DataFrame; its index labels the returned rows.
        y_test: true target values, used only to print the test RMSE.

    Returns:
        DataFrame indexed like ``X_test`` with columns 'pred_lower_bound',
        'pred' and 'pred_upper_bound', each floored at zero.

    Side effects:
        Prints the RMSE of the zero-floored predictions against ``y_test``.
    """
    half_width = best_model['rmse']
    raw_pred = best_model['model'].predict(X_test).flatten()

    def floor_at_zero(values):
        # Negative amounts are replaced by 0; all other entries pass through.
        return np.where(values < 0, 0, values)

    # The bounds are built from the unfloored prediction, then floored separately.
    lower = floor_at_zero(raw_pred - half_width)
    upper = floor_at_zero(raw_pred + half_width)
    point = floor_at_zero(raw_pred)

    test_error = np.sqrt(mean_squared_error(y_test, point))
    print(test_error)

    return pd.DataFrame(
        data={
            'pred_lower_bound': lower,
            'pred': point,
            'pred_upper_bound': upper,
        },
        index=X_test.index,
    )

In [None]:
# Held-out salary predictions with their +/- RMSE range (prints the test RMSE).
y_pred_salary_range = get_prediction_range(
    best_model=best_model_salary, X_test=X_test, y_test=y_test_salary)
y_pred_salary_range

In [None]:
# Held-out stock predictions with their +/- RMSE range (prints the test RMSE).
y_pred_stock_range = get_prediction_range(
    best_model=best_model_stock, X_test=X_test, y_test=y_test_stock)
y_pred_stock_range

In [None]:
# Held-out bonus predictions with their +/- RMSE range (prints the test RMSE).
y_pred_bonus_range = get_prediction_range(
    best_model=best_model_bonus, X_test=X_test, y_test=y_test_bonus)
y_pred_bonus_range

In [None]:
# Total compensation estimate = salary + stock + bonus, added column by column
# (lower bounds with lower bounds, and so on).
y_pred_total_comp_sum_range = (
    y_pred_salary_range
    .add(y_pred_stock_range)
    .add(y_pred_bonus_range)
)
# Test RMSE of the summed point prediction against the true total compensation.
np.sqrt(mean_squared_error(y_test_total_comp, y_pred_total_comp_sum_range['pred']))
y_pred_total_comp_sum_range

In [None]:
# Regularisation strength of each target's winning model. `.get` is used
# because a result dict is not guaranteed to carry an 'alpha' entry; a missing
# entry then displays nothing instead of raising KeyError.
best_model_salary.get('alpha')
best_model_stock.get('alpha')
best_model_bonus.get('alpha')

## Analysis

In [None]:
def plot_coefs(coefs, condition = None):
    """Draw a bar chart of regression coefficients in ascending order.

    Parameters:
        coefs: Series of coefficients indexed by feature name.
        condition: optional selector applied as ``coefs[condition]`` before
            plotting (e.g. a boolean mask); when None, all entries are shown.

    Draws on the current pyplot figure and returns nothing.
    """
    selected = coefs if condition is None else coefs[condition]
    ordered = selected.sort_values()

    plt.bar(ordered.index, ordered)
    plt.xticks(rotation=90)  # feature names are long; turn them vertical
    plt.ylabel('Regression Coefficient ($ in thousands)')
    
# Short aliases for the winning model's coefficient Series, one per target.
coefs_salary, coefs_stock, coefs_bonus = (
    best['coefs']
    for best in (best_model_salary, best_model_stock, best_model_bonus)
)

In [None]:
# Salary coefficients above +30.
plot_coefs(coefs_salary, condition=coefs_salary > 30)

In [None]:
# Salary coefficients below -30.
plot_coefs(coefs_salary, condition=coefs_salary < -30)

In [None]:
# Salary coefficients whose label contains 'company[' and whose value exceeds +25.
coefs_company = coefs_salary.filter(like='company[')
plot_coefs(coefs_company, condition=coefs_company > 25)

In [None]:
# Company coefficients with a negative value.
plot_coefs(coefs_company, condition=coefs_company < 0)

In [None]:
# Salary coefficients whose label contains 'SF'.
plot_coefs(coefs=coefs_salary.filter(like='SF'))

In [None]:
# Salary coefficients whose label contains 'NY'.
plot_coefs(coefs=coefs_salary.filter(like='NY'))

In [None]:
# Salary coefficients whose label contains 'tag['.
plot_coefs(coefs=coefs_salary.filter(like='tag['))

In [None]:
# Salary coefficients whose label contains 'Google'.
plot_coefs(coefs=coefs_salary.filter(like='Google'))

In [None]:
# Salary coefficients whose label contains 'company_location['.
plot_coefs(coefs=coefs_salary.filter(like='company_location['))

In [None]:
# Stock coefficients above +25.
plot_coefs(coefs_stock, condition=coefs_stock > 25)

In [None]:
# Stock coefficients below -25. The mask must be built from `coefs_stock`;
# the earlier `coefs_stocks` was an undefined name and raised NameError.
plot_coefs(coefs_stock, coefs_stock < -25)

In [None]:
# Stock coefficients whose label contains 'Google'.
plot_coefs(coefs=coefs_stock.filter(like='Google'))

In [None]:
# Stock coefficients whose label contains 'Facebook'.
plot_coefs(coefs=coefs_stock.filter(like='Facebook'))