In [1]:
import pandas as pd
import numpy as np

from src.model_utils import EvaluatePreprocessors, EvaluateEstimators

from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.metrics import mean_squared_error

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import cross_validate, GridSearchCV

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler



# Import data

In [2]:
# Load the processed training set; the first CSV column becomes the index.
train_df = pd.read_csv('../data/processed/training_data.csv', index_col=0)

# Split the label off: `pop` returns the salary column and removes it from
# the frame, leaving only the feature columns in `train_df`.
target = train_df.pop('salary')

In [3]:
# Display the feature frame; `salary` was split off into `target` in the cell above.
train_df

Unnamed: 0,jobType,degree,major,industry,yearsExperience,milesFromMetropolis
58414,JUNIOR,HIGH_SCHOOL,NONE,WEB,14,84
902618,SENIOR,MASTERS,COMPSCI,SERVICE,24,86
778824,JANITOR,NONE,NONE,WEB,4,86
187593,SENIOR,HIGH_SCHOOL,NONE,WEB,0,42
574438,VICE_PRESIDENT,MASTERS,BUSINESS,AUTO,16,97
...,...,...,...,...,...,...
259179,VICE_PRESIDENT,DOCTORAL,BUSINESS,OIL,9,93
365839,CFO,DOCTORAL,BUSINESS,HEALTH,15,88
131933,SENIOR,MASTERS,COMPSCI,HEALTH,18,42
671158,SENIOR,DOCTORAL,PHYSICS,SERVICE,5,23


## preprocessing / feature engineering

- drop jobId for training

**ordinal encoding**
- jobType
- degree
- industry

**one hot encoding**
-  major


In [4]:
# Show the feature frame again (same columns as the earlier preview).
train_df

Unnamed: 0,jobType,degree,major,industry,yearsExperience,milesFromMetropolis
58414,JUNIOR,HIGH_SCHOOL,NONE,WEB,14,84
902618,SENIOR,MASTERS,COMPSCI,SERVICE,24,86
778824,JANITOR,NONE,NONE,WEB,4,86
187593,SENIOR,HIGH_SCHOOL,NONE,WEB,0,42
574438,VICE_PRESIDENT,MASTERS,BUSINESS,AUTO,16,97
...,...,...,...,...,...,...
259179,VICE_PRESIDENT,DOCTORAL,BUSINESS,OIL,9,93
365839,CFO,DOCTORAL,BUSINESS,HEALTH,15,88
131933,SENIOR,MASTERS,COMPSCI,HEALTH,18,42
671158,SENIOR,DOCTORAL,PHYSICS,SERVICE,5,23


# Linear Regression

In [5]:
# Explicit category orderings for the categorical features (used for ordinal encoding).
jobtype_levels = [
    'JANITOR', 'JUNIOR', 'SENIOR', 'MANAGER',
    'VICE_PRESIDENT', 'CFO', 'CTO', 'CEO',
]
degree_levels = ['NONE', 'HIGH_SCHOOL', 'BACHELORS', 'MASTERS', 'DOCTORAL']
industry_levels = [
    'EDUCATION', 'SERVICE', 'AUTO', 'HEALTH',
    'WEB', 'FINANCE', 'OIL',
]

# Transformer instances shared by the column transformers in this notebook.
scaler = StandardScaler()
one_hot_encoder = OneHotEncoder()
ordinal_encoder = OrdinalEncoder(categories=[jobtype_levels])

# One-hot encode every categorical column and standardise the numeric ones.
# `ordinal_encoder` is not wired in here: jobType goes through the one-hot step.
preprocessing = ColumnTransformer(
    transformers=[
        ('one_hot_encoding', one_hot_encoder, ['major', 'degree', 'industry', 'jobType']),
        ('std_scaler', scaler, ['yearsExperience', 'milesFromMetropolis']),
    ],
    remainder='passthrough',  # any column not listed above is passed through untouched
)

In [6]:
# Baseline model: column preprocessing followed by a linear regression.
model_pipe = Pipeline(
    steps=[
        ('preprocess', preprocessing),
        ('regression', LinearRegression()),
    ]
)

In [7]:
# 5-fold cross-validation of the baseline pipeline, scored with negative MSE;
# train scores are returned as well so they can be compared with the test scores.
cross_validate(
    model_pipe,
    train_df,
    target,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)

{'fit_time': array([1.40036464, 1.37239838, 1.34942508, 1.39137578, 1.36540651]),
 'score_time': array([0.21475005, 0.19777012, 0.19976711, 0.20476222, 0.19876814]),
 'test_score': array([-383.28264578, -386.02038876, -384.77694906, -385.18959446,
        -382.74165573]),
 'train_score': array([-384.65754287, -383.97341664, -384.28455599, -384.18041655,
        -384.79318228])}

The best-performing preprocessing one-hot encodes all of the categorical features.

#### TODO: make a test for outputting scores for each type of ordinal encoding vs onehot encoding?
- test `jobType` and `industry` as ordinal encoding, the rest one hot
- test each of those as single ordinal, the rest one hot
- test all one hot

include an output table and graphic for portfolio purposes

#### after linear regression
- test out ridge/lasso with the best preprocessing
- try with feature selection with the best of ridge/lasso
- test against SGDRegressor
- the best model from the above test against gradient boosting and tree methods

- hyperparameter tuning


### Test EvaluateModel

- `EvaluatePreprocessors` - two different preprocessing pipelines and a linear regression
- `EvaluateEstimators` - one preprocessing pipeline with linear and ridge regression

In [8]:
# Variant 1: every categorical column is one-hot encoded; numeric columns are standardised.
all_one_hot_pp = ColumnTransformer(
    transformers=[
        ('one_hot_encoding', one_hot_encoder, ['major', 'degree', 'industry', 'jobType']),
        ('std_scaler', scaler, ['yearsExperience', 'milesFromMetropolis']),
    ],
    remainder='passthrough',
)

# Variant 2: jobType is ordinal-encoded using the order given in `jobtype_levels`;
# the remaining categorical columns stay one-hot encoded.
jobType_ordinal_pp = ColumnTransformer(
    transformers=[
        ('ordinal_encoding', OrdinalEncoder(categories=[jobtype_levels]), ['jobType']),
        ('one_hot_encoding', one_hot_encoder, ['major', 'degree', 'industry']),
        ('std_scaler', scaler, ['yearsExperience', 'milesFromMetropolis']),
    ],
    remainder='passthrough',
)

In [9]:
# (name, transformer) pairs handed to EvaluatePreprocessors.
test_preprocessors = [
    ('all_OH', all_one_hot_pp),
    ('jobType_ordinal', jobType_ordinal_pp),
]

# (name, estimator) pairs handed to EvaluateEstimators.
test_estimators = [
    ('linreg', LinearRegression()),
    ('ridge', Ridge()),
]



In [10]:
# Test EvaluatePreprocessors: score each candidate preprocessor with
# LinearRegression as the estimator, using negative MSE.
eval_pp = EvaluatePreprocessors(
    preprocessors=test_preprocessors,
    estimator=LinearRegression(),
    scoring='neg_mean_squared_error',
)
eval_pp.run(train_df, target, verbose=True)

------------------------------
Finished training: all_OH
Test score  : -384.40224675594015
Train score : -384.3778228669311

------------------------------
Finished training: jobType_ordinal
Test score  : -394.09005815381425
Train score : -394.07396037246053

::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
Best model found:
Pipeline(steps=[('all_OH',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('one_hot_encoding',
                                                  OneHotEncoder(),
                                                  ['major', 'degree',
                                                   'industry', 'jobType']),
                                                 ('std_scaler',
                                                  StandardScaler(),
                                                  ['yearsExperience',
                                                   'milesFromMetropolis'])])),
          

Unnamed: 0,test_score,train_score,fit_time
all_OH,-384.402247,-384.377823,1.360612
jobType_ordinal,-394.090058,-394.07396,1.376993


In [11]:
# Test EvaluateEstimators: score each candidate estimator behind the
# all-one-hot preprocessing, using negative MSE.
eval_est = EvaluateEstimators(
    estimators=test_estimators,
    preprocessing=all_one_hot_pp,
    scoring='neg_mean_squared_error',
)
eval_est.run(train_df, target, verbose=True)

------------------------------
Finished training: linreg
Test score  : -384.40224675594015
Train score : -384.3778228669311

------------------------------
Finished training: ridge
Test score  : -384.4117328851539
Train score : -384.3874225396973

::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
Best model found:
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('one_hot_encoding',
                                                  OneHotEncoder(),
                                                  ['major', 'degree',
                                                   'industry', 'jobType']),
                                                 ('std_scaler',
                                                  StandardScaler(),
                                                  ['yearsExperience',
                                                   'milesFromMetropolis'])])),
               

Unnamed: 0,test_score,train_score,fit_time
linreg,-384.402247,-384.377823,1.359613
ridge,-384.411733,-384.387423,1.190312
