## Compensation Data Random Exploration

In [24]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn.linear_model as skl_lm
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import Ridge, RidgeCV
from sklearn.cross_decomposition import PLSRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

import statsmodels.api as sm

import seaborn as sns

pd.set_option('display.max_rows', 100)
plt.rcParams["figure.figsize"] = (20,10)

In [22]:
# comp = pd.read_csv('data/prepped_comp_data.csv', index_col='entry_id')
comp = pd.read_csv('data/clean_comp_data.csv', index_col="entry_id")
comp

Unnamed: 0_level_0,total_comp,salary,stock,bonus,company,level,location,years_experience,years_company,tag
entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4,578.0,180.0,387.5,9.0,Uber,Senior,"San Francisco, CA",10.0,2.0,
6,173.0,120.0,0.0,53.0,Amazon,L5,"Vancouver, BC, Canada",11.0,1.0,
10,190.0,110.0,80.0,0.0,Amazon,L5,"Seattle, WA",3.0,3.0,
13,156.0,135.0,8.0,13.0,Microsoft,62,"Seattle, WA",4.0,4.0,
16,201.0,157.0,26.0,28.0,Microsoft,63,"Seattle, WA",12.0,6.0,
...,...,...,...,...,...,...,...,...,...,...
30133,115.0,115.0,0.0,0.0,IBM,Advisory Engineer,"Madison, WI",6.0,1.0,Distributed Systems (Back-End)
30139,370.0,187.0,150.0,33.0,Twitter,Senior SWE,"New York, NY",10.0,6.0,iOS
30140,212.0,177.0,0.0,35.0,Bloomberg,Senior Software Engineer,"New York, NY",2.0,0.0,Distributed Systems (Back-End)
30142,185.0,98.0,57.0,30.0,Google,L4,"London, EN, United Kingdom",4.0,2.0,API Development (Back-End)


In [5]:
RANDOM_STATE = 721
X_train, X_test, _, _ = train_test_split(comp, comp, test_size=0.1, random_state=RANDOM_STATE)

y_total_comp = np.exp(X_train.log_total_comp)
y_salary = np.exp(X_train.log_salary)
y_stock = np.exp(X_train.log_stock)
y_bonus = np.exp(X_train.log_bonus)
X = X_train.drop(columns=['log_total_comp', 'log_salary', 'log_stock', 'log_bonus'])

y_test_total_comp = np.exp(X_test.log_total_comp)
y_test_salary = np.exp(X_test.log_salary)
y_test_stock = np.exp(X_test.log_stock)
y_test_bonus = np.exp(X_test.log_bonus)
X_test = X_test.drop(columns=['log_total_comp', 'log_salary', 'log_stock', 'log_bonus'])

X
X_test

Unnamed: 0_level_0,scaled_years_experience,scaled_years_company,company[AT&T],company[Accenture],company[Adobe],company[Airbnb],company[Amazon],company[American Express],company[Andela],company[Apple],...,tag[Full Stack],tag[ML / AI],tag[Mobile (iOS + Android)],tag[Networking],tag[Operating Systems],tag[Security],tag[Site Reliability (SRE)],tag[Testing (SDET)],tag[Web Development (Front-End)],tag[iOS]
entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1042,0.343483,0.221278,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6488,-0.393800,0.565848,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21860,-0.762442,-0.123292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7900,-0.209479,0.910418,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4392,-0.393800,0.565848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28116,-0.762442,-0.123292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13550,-0.025158,-0.467863,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8130,0.527804,-0.123292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13210,-0.762442,-0.812433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,scaled_years_experience,scaled_years_company,company[AT&T],company[Accenture],company[Adobe],company[Airbnb],company[Amazon],company[American Express],company[Andela],company[Apple],...,tag[Full Stack],tag[ML / AI],tag[Mobile (iOS + Android)],tag[Networking],tag[Operating Systems],tag[Security],tag[Site Reliability (SRE)],tag[Testing (SDET)],tag[Web Development (Front-End)],tag[iOS]
entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23789,-0.578121,-0.812433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11706,0.712125,1.254988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
310,-0.578121,-0.467863,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
815,-1.131083,-0.812433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5982,0.343483,-0.812433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25211,-0.578121,-0.812433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25710,-0.578121,-0.467863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13958,0.896446,-0.812433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4446,0.343483,1.254988,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
mlp = MLPRegressor(hidden_layer_sizes=(1000,), activation='logistic',
                    solver='sgd', tol=1e-4, random_state=721,
                    learning_rate_init=.001, max_iter=2000, verbose=True)
mlp.fit(X, y_salary)

Iteration 1, loss = 1250.08486816
Iteration 2, loss = 808.35344969
Iteration 3, loss = 748.70151739
Iteration 4, loss = 699.81585055
Iteration 5, loss = 651.47939347
Iteration 6, loss = 599.83707474
Iteration 7, loss = 544.58006350
Iteration 8, loss = 489.73669338
Iteration 9, loss = 440.02084968
Iteration 10, loss = 402.21579508
Iteration 11, loss = 375.19617044
Iteration 12, loss = 358.34440871
Iteration 13, loss = 346.92191419
Iteration 14, loss = 340.64862619
Iteration 15, loss = 332.89580036
Iteration 16, loss = 328.77218566
Iteration 17, loss = 324.26292841
Iteration 18, loss = 321.02384763
Iteration 19, loss = 318.17892250
Iteration 20, loss = 315.68408355
Iteration 21, loss = 313.90154953
Iteration 22, loss = 313.02639376
Iteration 23, loss = 310.81566149
Iteration 24, loss = 309.54808634
Iteration 25, loss = 308.44387568
Iteration 26, loss = 306.42885631
Iteration 27, loss = 306.10608928
Iteration 28, loss = 304.52797426
Iteration 29, loss = 303.79179949
Iteration 30, loss = 3

Iteration 239, loss = 227.84855943
Iteration 240, loss = 227.76726564
Iteration 241, loss = 227.34662945
Iteration 242, loss = 227.56502954
Iteration 243, loss = 226.73417565
Iteration 244, loss = 226.25917817
Iteration 245, loss = 226.08361335
Iteration 246, loss = 226.72080511
Iteration 247, loss = 226.46848995
Iteration 248, loss = 225.16746722
Iteration 249, loss = 224.67384444
Iteration 250, loss = 224.21064103
Iteration 251, loss = 224.27738748
Iteration 252, loss = 223.98626267
Iteration 253, loss = 223.62020842
Iteration 254, loss = 223.65758054
Iteration 255, loss = 223.52227940
Iteration 256, loss = 222.84337815
Iteration 257, loss = 222.19444487
Iteration 258, loss = 222.02730220
Iteration 259, loss = 221.45058923
Iteration 260, loss = 221.36979839
Iteration 261, loss = 220.22863384
Iteration 262, loss = 220.55767644
Iteration 263, loss = 220.01335521
Iteration 264, loss = 219.89266705
Iteration 265, loss = 219.72457483
Iteration 266, loss = 218.70993702
Iteration 267, loss 

Iteration 474, loss = 157.16998725
Iteration 475, loss = 156.72240755
Iteration 476, loss = 156.48791041
Iteration 477, loss = 155.93924724
Iteration 478, loss = 155.44254365
Iteration 479, loss = 156.37048193
Iteration 480, loss = 155.60909303
Iteration 481, loss = 155.38845738
Iteration 482, loss = 155.07320842
Iteration 483, loss = 154.59689610
Iteration 484, loss = 155.00244599
Iteration 485, loss = 154.25201808
Iteration 486, loss = 154.13460673
Iteration 487, loss = 154.21206074
Iteration 488, loss = 153.41465484
Iteration 489, loss = 153.52852772
Iteration 490, loss = 153.62312509
Iteration 491, loss = 153.02261876
Iteration 492, loss = 152.80701441
Iteration 493, loss = 152.82305371
Iteration 494, loss = 152.38719852
Iteration 495, loss = 151.94132028
Iteration 496, loss = 152.19003237
Iteration 497, loss = 151.51379520
Iteration 498, loss = 151.36407342
Iteration 499, loss = 151.22566397
Iteration 500, loss = 151.32800962
Iteration 501, loss = 150.88525239
Iteration 502, loss 

Iteration 709, loss = 117.67316307
Iteration 710, loss = 117.81598117
Iteration 711, loss = 117.39846622
Iteration 712, loss = 117.70059590
Iteration 713, loss = 117.41836501
Iteration 714, loss = 116.99460864
Iteration 715, loss = 116.82776306
Iteration 716, loss = 117.16568438
Iteration 717, loss = 116.95045634
Iteration 718, loss = 117.17554029
Iteration 719, loss = 116.72287285
Iteration 720, loss = 116.68183944
Iteration 721, loss = 116.01328214
Iteration 722, loss = 116.21897531
Iteration 723, loss = 115.93900332
Iteration 724, loss = 115.96296716
Iteration 725, loss = 115.81930884
Iteration 726, loss = 115.98348449
Iteration 727, loss = 115.95977120
Iteration 728, loss = 115.37329909
Iteration 729, loss = 116.02125010
Iteration 730, loss = 115.24627147
Iteration 731, loss = 115.51464826
Iteration 732, loss = 115.00741516
Iteration 733, loss = 115.09521712
Iteration 734, loss = 114.67755761
Iteration 735, loss = 114.84374308
Iteration 736, loss = 114.70567696
Iteration 737, loss 

Iteration 945, loss = 97.29543641
Iteration 946, loss = 97.23765200
Iteration 947, loss = 97.44859409
Iteration 948, loss = 97.23081123
Iteration 949, loss = 97.08238565
Iteration 950, loss = 96.81593497
Iteration 951, loss = 96.87283110
Iteration 952, loss = 96.58792827
Iteration 953, loss = 96.44305049
Iteration 954, loss = 96.70974434
Iteration 955, loss = 96.40230523
Iteration 956, loss = 96.37071417
Iteration 957, loss = 96.32611462
Iteration 958, loss = 96.57005417
Iteration 959, loss = 96.36358389
Iteration 960, loss = 96.26233282
Iteration 961, loss = 96.43904853
Iteration 962, loss = 96.09639597
Iteration 963, loss = 96.18190351
Iteration 964, loss = 96.03173846
Iteration 965, loss = 96.00410809
Iteration 966, loss = 95.54109623
Iteration 967, loss = 95.86034823
Iteration 968, loss = 95.59831348
Iteration 969, loss = 95.74060494
Iteration 970, loss = 95.48907198
Iteration 971, loss = 95.47670298
Iteration 972, loss = 95.68359046
Iteration 973, loss = 95.30115480
Iteration 974,

MLPRegressor(activation='logistic', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(1000,), learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=2000,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=721, shuffle=True, solver='sgd',
             tol=0.0001, validation_fraction=0.1, verbose=True,
             warm_start=False)

In [18]:
mlp2 = MLPRegressor(hidden_layer_sizes=(700,), activation='logistic',
                    solver='sgd', tol=1e-4, random_state=721,
                    learning_rate_init=.005, max_iter=2000, verbose=True)
mlp2.fit(X, y_salary)

Iteration 1, loss = 990.16060146
Iteration 2, loss = 478.11101452
Iteration 3, loss = 399.41030180
Iteration 4, loss = 366.41076284
Iteration 5, loss = 345.63608810
Iteration 6, loss = 333.14138800
Iteration 7, loss = 325.64025704
Iteration 8, loss = 318.41680508
Iteration 9, loss = 312.33435672
Iteration 10, loss = 308.99825048
Iteration 11, loss = 304.38175143
Iteration 12, loss = 299.94085352
Iteration 13, loss = 296.12747721
Iteration 14, loss = 293.67434751
Iteration 15, loss = 292.26068019
Iteration 16, loss = 289.90092721
Iteration 17, loss = 287.95597304
Iteration 18, loss = 285.16725271
Iteration 19, loss = 284.03598822
Iteration 20, loss = 281.30169036
Iteration 21, loss = 280.57414679
Iteration 22, loss = 278.55350647
Iteration 23, loss = 275.56852561
Iteration 24, loss = 276.28024969
Iteration 25, loss = 274.93894782
Iteration 26, loss = 272.08565497
Iteration 27, loss = 270.52318499
Iteration 28, loss = 271.16623882
Iteration 29, loss = 270.23451436
Iteration 30, loss = 26

Iteration 239, loss = 122.70165381
Iteration 240, loss = 123.39664037
Iteration 241, loss = 122.64559102
Iteration 242, loss = 122.84361651
Iteration 243, loss = 122.00148145
Iteration 244, loss = 121.23176908
Iteration 245, loss = 121.10177825
Iteration 246, loss = 121.01487867
Iteration 247, loss = 121.69792823
Iteration 248, loss = 121.65781963
Iteration 249, loss = 121.18057106
Iteration 250, loss = 120.29084372
Iteration 251, loss = 120.25359947
Iteration 252, loss = 120.17095015
Iteration 253, loss = 120.05686341
Iteration 254, loss = 119.86969453
Iteration 255, loss = 119.81138421
Iteration 256, loss = 118.98979496
Iteration 257, loss = 118.81423162
Iteration 258, loss = 119.65394163
Iteration 259, loss = 118.26399565
Iteration 260, loss = 118.48771910
Iteration 261, loss = 118.41824387
Iteration 262, loss = 117.60773844
Iteration 263, loss = 117.94913453
Iteration 264, loss = 118.02047768
Iteration 265, loss = 116.62084003
Iteration 266, loss = 116.80401783
Iteration 267, loss 

Iteration 476, loss = 89.74121083
Iteration 477, loss = 89.83211943
Iteration 478, loss = 89.69536725
Iteration 479, loss = 89.62784841
Iteration 480, loss = 89.69133373
Iteration 481, loss = 89.10234104
Iteration 482, loss = 89.24170598
Iteration 483, loss = 89.11437264
Iteration 484, loss = 89.28357799
Iteration 485, loss = 89.06092888
Iteration 486, loss = 89.11092929
Iteration 487, loss = 88.65946363
Iteration 488, loss = 88.88580047
Iteration 489, loss = 88.86780103
Iteration 490, loss = 88.43272725
Iteration 491, loss = 88.83697414
Iteration 492, loss = 88.84764420
Iteration 493, loss = 88.41225846
Iteration 494, loss = 87.86065575
Iteration 495, loss = 88.71089075
Iteration 496, loss = 88.36619379
Iteration 497, loss = 88.51527481
Iteration 498, loss = 88.91453718
Iteration 499, loss = 88.03726277
Iteration 500, loss = 88.48258559
Iteration 501, loss = 88.15681208
Iteration 502, loss = 87.67343386
Iteration 503, loss = 88.21928630
Iteration 504, loss = 87.29288565
Iteration 505,

MLPRegressor(activation='logistic', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(700,), learning_rate='constant',
             learning_rate_init=0.005, max_fun=15000, max_iter=2000,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=721, shuffle=True, solver='sgd',
             tol=0.0001, validation_fraction=0.1, verbose=True,
             warm_start=False)

In [21]:
y_train_pred = mlp2.predict(X_train).flatten()
y_train_pred
test_error = np.sqrt(mean_squared_error(y_salary, y_train_pred))
test_error
# y_pred = mlp.predict(X_test).flatten()
# test_error = np.sqrt(mean_squared_error(y_test_salary, y_pred))
# test_error

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 191 is different from 195)

In [12]:
mlpr = MLPRegressor(max_iter=7000, tol=1e-4, verbose=True)

param_list = {
    "hidden_layer_sizes": [(50,), (200,), (500,)],
    #"activation": ["identity", "logistic", "tanh", "relu"],
    #"solver": ["lbfgs", "sgd", "adam"],
    "alpha": [0.00005, 0.0005],
}

gridCV = GridSearchCV(estimator=mlpr, param_grid=param_list, n_jobs=-1, verbose=10)

gridCV.fit(X, y_salary)

# splits = TimeSeriesSplit(n_splits=3)

# pyplot.figure(1)
# index = 1

# for train_index, test_index in splits.split(scaled_dataset):

#     training_set = scaled_dataset[train_index]
#     testing_set = scaled_dataset[test_index]

#     train_index_array = train_index.reshape(-1,1)
#     test_index_array = test_index.reshape(-1,1)

#     gridCV.fit(train_index_array, training_set)
#     predicted = gridCV.predict(test_index_array)
#     parameters = mlpr.get_params()

#     test_mse = mean_squared_error(testing_set, predicted)

#     pyplot.subplot(310 + index)
#     pyplot.plot(predicted)
#     pyplot.plot([None for i in training_set] + [x for x in testing_set])
#     index += 1

#     train_index.flatten() 
#     test_index.flatten() 

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  30 | elapsed:  2.9min remaining: 26.1min
[Parallel(n_jobs=-1)]: Done   7 out of  30 | elapsed:  2.9min remaining:  9.5min
[Parallel(n_jobs=-1)]: Done  11 out of  30 | elapsed:  2.9min remaining:  5.0min
[Parallel(n_jobs=-1)]: Done  15 out of  30 | elapsed:  2.9min remaining:  2.9min
[Parallel(n_jobs=-1)]: Done  19 out of  30 | elapsed:  2.9min remaining:  1.7min
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:  2.9min remaining:   53.0s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:  2.9min remaining:   19.4s


KeyboardInterrupt: 