In [1]:
import pandas as pd
import numpy as np

# Custom utility methods
from src.model_utils import EvaluatePreprocessors, EvaluateEstimators

# Models
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor

# Pipelines and preprocessing
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

# Model selection and metrics
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn import set_config
set_config(display = 'diagram')

# Import data

In [2]:
train_df = pd.read_csv('../data/processed/training_data.csv', index_col = 0)
target = train_df.salary
train_df.drop(columns = 'salary', inplace=True)

In [3]:
train_df

Unnamed: 0,jobType,degree,major,industry,yearsExperience,milesFromMetropolis
58414,JUNIOR,HIGH_SCHOOL,NONE,WEB,14,84
902618,SENIOR,MASTERS,COMPSCI,SERVICE,24,86
778824,JANITOR,NONE,NONE,WEB,4,86
187593,SENIOR,HIGH_SCHOOL,NONE,WEB,0,42
574438,VICE_PRESIDENT,MASTERS,BUSINESS,AUTO,16,97
...,...,...,...,...,...,...
259179,VICE_PRESIDENT,DOCTORAL,BUSINESS,OIL,9,93
365839,CFO,DOCTORAL,BUSINESS,HEALTH,15,88
131933,SENIOR,MASTERS,COMPSCI,HEALTH,18,42
671158,SENIOR,DOCTORAL,PHYSICS,SERVICE,5,23


# Basic Model - Linear Regression

I want to start modeling with something a little more basic and see how a linear regression measures up to the best baseline score that was found without ML.

**Best baseline score (MSE):** `371.22`

Plan:
1. Encode categorical variables
    - Ordinal encoding for `jobType` and `degree` variables
    - One hot encoding for `industry` and `major` variables
2. Scale numeric variables
    - use StandardScaler for `yearsExperience` and `milesFromMetropolis`
    
Starting out treating `jobType` and `degree` as ordinal. Depending on how the scores measure up, I can try other encodings and preprocessing. 

In [22]:
# Specify ordinal levels for categorical variables
jobtype_ord_levels = ['JANITOR', 'JUNIOR',  'SENIOR', 'MANAGER', 'VICE_PRESIDENT', 'CFO', 'CTO', 'CEO']
degree_ord_levels = ['NONE', 'HIGH_SCHOOL', 'BACHELORS', 'MASTERS', 'DOCTORAL']
industry_ord_levels = ['EDUCATION', 'SERVICE', 'AUTO', 'HEALTH', 'WEB', 'FINANCE', 'OIL']

# Setup ordinal encoder
basic_lr_ordinal_encoder = OrdinalEncoder(categories=[jobtype_ord_levels, degree_ord_levels])

# Preprocessing with ColumnTransformer
basic_lr_preprocessing = ColumnTransformer([
    ('ordinal_encoding', basic_lr_ordinal_encoder, ['jobType', 'degree']),
    ('one_hot_encoding', OneHotEncoder(), ['major', 'industry']),
    ('std_scaler', StandardScaler(), ['yearsExperience', 'milesFromMetropolis'])
], remainder='passthrough')

# Model pipeline
basic_lr_model = Pipeline([
    ('preprocess', basic_lr_preprocessing),
    ('linear regression', LinearRegression())
])
basic_lr_model

In [21]:
# Run cross validation
basic_lr_scores = cross_validate(model_pipe, train_df, target, cv = 5, scoring = 'neg_mean_squared_error', return_train_score = True)

print(f"Mean test score: {np.mean(basic_lr_scores['test_score'])}", end = '\n\n')
basic_lr_scores

Mean test score: -384.40224675594015



{'fit_time': array([1.42585945, 1.39089131, 1.47380996, 1.40587306, 1.40389514]),
 'score_time': array([0.20726514, 0.20675874, 0.22524261, 0.21025825, 0.21125746]),
 'test_score': array([-383.28264578, -386.02038876, -384.77694906, -385.18959446,
        -382.74165573]),
 'train_score': array([-384.65754287, -383.97341664, -384.28455599, -384.18041655,
        -384.79318228])}

This linear regression is fairly close to our best baseline score, but it is still higher. I wonder if changing the encoding of the categorical variables will impact the results in a positive way.

## Test different categorical encodings
Candidates for ordinal encoding:
- jobType
- degree
- industry


`jobType` and `degree` I think are the ones that can be argued to have the most natural precedent for an ordering. 

The different levels of `jobType` can be interpreted as needing different levels of: experience, qualifications, or responsibility on the job. (i.e. especially between the levels 'junior', 'senior', 'manager').

Similarly with `degree` the natural ordering can be interpreted as the number of years of schooling. In this case it makes sense to order the levels as: `'none' < 'high school' < 'bachelors' < 'masters' < 'doctoral'`

As far as `industry` is concerned, it may be valid to argue that there is no real natural ordering between industries (as there is for placement positions of a race, for example). But the average salary of each of the levels of `industry` seem more spread out and cover a wider range of values than `degree`. So I can use the data to give a guess as to the order - might as well try it out.

I don't think it makes sense for `major` to be treated as ordinal here. Because there isn't a clear natural ording to the different majors. Additionally, there is no real data to give a direction at guessing/imposing an order on the levels of `major`. The average salary for the subgroups of `major` are too close together, plus the distributions of the salary for each of the levels of `major` are very much overlapped. And even during EDA we saw that top paying salaries across different industries have different majors.
- I will only treat degree as a nominal variable and use one hot encoding

The preprocessing that performed the best is with one hot encoding all of the categorical features.

#### TODO: make a test for outputting scores for each type of ordinal encoding vs onehot encoding?
- test `jobType` and `industry` as ordinal encoding, the rest one hot
- test each of those as single ordinal, the rest one hot
- test all one hot

include an output table and graphic for porfolio purposes

#### after linear regression
- test out ridge/lasso with the best preprocessing
- try with feature selection with the best of ridge/lasso
- test against SGDRegressor
- the best model from the above test against gradient boosting and tree methods

- hyperperameter tune


### Test EvaluateModel

EvaluatePreprocessors - two different preprocessing pipelines and a linear regression
EvaluateEstimators    - one preprocessing pipeline with linear and ridge regression

In [9]:
all_one_hot_pp = ColumnTransformer([
    ('one_hot_encoding', one_hot_encoder, ['major', 'degree', 'industry', 'jobType']),
    ('std_scaler', scaler, ['yearsExperience', 'milesFromMetropolis'])
], remainder='passthrough')

jobType_ordinal_pp = ColumnTransformer([
    ('ordinal_encoding', OrdinalEncoder(categories = [jobtype_levels]), ['jobType']),
    ('one_hot_encoding', one_hot_encoder, ['major', 'degree', 'industry']),
    ('std_scaler', scaler, ['yearsExperience', 'milesFromMetropolis'])
], remainder='passthrough')

In [10]:
test_preprocessors = [
    ('all_OH', all_one_hot_pp),
    ('jobType_ordinal', jobType_ordinal_pp)
]

test_estimators = [
    ('linreg', LinearRegression()),
    ('ridge', Ridge())
]



In [11]:
# Test EvaluatePreprocessors
eval_pp = EvaluatePreprocessors(preprocessors = test_preprocessors, estimator = LinearRegression(), scoring = 'neg_mean_squared_error')
eval_pp.run(train_df, target, verbose = True)

------------------------------
Finished training: all_OH
Test score  : -384.40224675594015
Train score : -384.3778228669311

------------------------------
Finished training: jobType_ordinal
Test score  : -394.09005815381425
Train score : -394.07396037246053

::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
Best model found:
Pipeline(steps=[('all_OH',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('one_hot_encoding',
                                                  OneHotEncoder(),
                                                  ['major', 'degree',
                                                   'industry', 'jobType']),
                                                 ('std_scaler',
                                                  StandardScaler(),
                                                  ['yearsExperience',
                                                   'milesFromMetropolis'])])),
          

Unnamed: 0,test_score,train_score,fit_time,score_time
all_OH,-384.402247,-384.377823,1.290892,0.193774
jobType_ordinal,-394.090058,-394.07396,1.299882,0.209155


In [12]:
# Test EvaluateEstimators
eval_est = EvaluateEstimators(estimators = test_estimators, preprocessing=all_one_hot_pp, scoring = 'neg_mean_squared_error')
eval_est.run(train_df, target, verbose = True)

------------------------------
Finished training: linreg
Test score  : -384.40224675594015
Train score : -384.3778228669311

------------------------------
Finished training: ridge
Test score  : -384.4117328851539
Train score : -384.3874225396973

::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
Best model found:
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('one_hot_encoding',
                                                  OneHotEncoder(),
                                                  ['major', 'degree',
                                                   'industry', 'jobType']),
                                                 ('std_scaler',
                                                  StandardScaler(),
                                                  ['yearsExperience',
                                                   'milesFromMetropolis'])])),
               

Unnamed: 0,test_score,train_score,fit_time,score_time
linreg,-384.402247,-384.377823,1.296286,0.198968
ridge,-384.411733,-384.387423,1.151055,0.194772
