In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Custom utility methods
from src.EvaluateModels import EvaluatePreprocessors, EvaluateEstimators, EvaluatePipelines
from src.model_utils import make_categorical_encoding

# Models
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost.sklearn import XGBRegressor


# Pipelines and preprocessing
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

# Model selection and metrics
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn import set_config
set_config(display = 'diagram')


# Dev
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Import data

In [2]:
# Read the processed training set, using the first CSV column as the row index
train_df = pd.read_csv('../data/processed/training_data.csv', index_col=0)

# Split off the label: `pop` removes `salary` from the frame and returns it
target = train_df.pop('salary')

In [3]:
# Preview the feature frame (the `salary` column has been split off into `target`)
train_df

Unnamed: 0,jobType,degree,major,industry,yearsExperience,milesFromMetropolis
58414,JUNIOR,HIGH_SCHOOL,NONE,WEB,14,84
902618,SENIOR,MASTERS,COMPSCI,SERVICE,24,86
778824,JANITOR,NONE,NONE,WEB,4,86
187593,SENIOR,HIGH_SCHOOL,NONE,WEB,0,42
574438,VICE_PRESIDENT,MASTERS,BUSINESS,AUTO,16,97
...,...,...,...,...,...,...
259179,VICE_PRESIDENT,DOCTORAL,BUSINESS,OIL,9,93
365839,CFO,DOCTORAL,BUSINESS,HEALTH,15,88
131933,SENIOR,MASTERS,COMPSCI,HEALTH,18,42
671158,SENIOR,DOCTORAL,PHYSICS,SERVICE,5,23


## Variables

In [4]:
# Ordered levels for the categorical variables that can be ordinal-encoded
jobtype_ord_levels = [
    'JANITOR',
    'JUNIOR',
    'SENIOR',
    'MANAGER',
    'VICE_PRESIDENT',
    'CFO',
    'CTO',
    'CEO',
]
degree_ord_levels = [
    'NONE',
    'HIGH_SCHOOL',
    'BACHELORS',
    'MASTERS',
    'DOCTORAL',
]
# Industries are sorted by average salary, lowest first
industry_ord_levels = [
    'EDUCATION',
    'SERVICE',
    'AUTO',
    'HEALTH',
    'WEB',
    'FINANCE',
    'OIL',
]

# Metric name handed to the `scoring` argument of every model evaluation
SCORING_METRIC = 'neg_mean_squared_error'

# Basic Model - Linear Regression

I want to start modeling with something a little more basic and see how a linear regression measures up to the best baseline score that was found without ML.

**Best baseline score (MSE):** `371.22`

Plan:
1. Encode categorical variables
    - Ordinal encoding for `jobType` and `degree` variables
    - One hot encoding for `industry` and `major` variables
2. Scale numeric variables
    - use StandardScaler for `yearsExperience` and `milesFromMetropolis`
    
Starting out treating `jobType` and `degree` as ordinal. Depending on how the scores measure up, I can try other encodings and preprocessing. 

In [5]:
# Column-wise preprocessing: ordinal codes for jobType/degree, one hot columns
# for major/industry, and standardised numeric features
basic_lr_preprocessing = ColumnTransformer(
    transformers=[
        (
            'ordinal_encoding',
            # level lists are given in the same order as the columns they encode
            OrdinalEncoder(categories=[jobtype_ord_levels, degree_ord_levels]),
            ['jobType', 'degree'],
        ),
        (
            'one_hot_encoding',
            OneHotEncoder(),
            ['major', 'industry'],
        ),
        (
            'std_scaler',
            StandardScaler(),
            ['yearsExperience', 'milesFromMetropolis'],
        ),
    ],
    remainder='passthrough',
)

# Full model: preprocessing followed by an ordinary least squares fit
basic_lr_model = Pipeline(
    steps=[
        ('preprocess', basic_lr_preprocessing),
        ('linear regression', LinearRegression()),
    ]
)
basic_lr_model

In [22]:
# 5-fold cross validation of the first linear model; train scores are kept so
# they can be compared against the test scores
basic_lr_scores = cross_validate(
    basic_lr_model,
    train_df,
    target,
    scoring=SCORING_METRIC,
    cv=5,
    return_train_score=True,
)

# Headline number first, then the full per-fold results as the cell output
mean_test_score = np.mean(basic_lr_scores['test_score'])
print(f"Mean test score: {mean_test_score}", end = '\n\n')
basic_lr_scores

Mean test score: -394.38290117364784



{'fit_time': array([1.46032476, 1.46531892, 1.45282078, 1.46182418, 1.43984365]),
 'score_time': array([0.21175289, 0.1837852 , 0.18528795, 0.18529058, 0.18878269]),
 'test_score': array([-393.30608198, -396.3105107 , -394.56362714, -395.18530224,
        -392.54898381]),
 'train_score': array([-394.63886445, -393.88634722, -394.32206648, -394.16714741,
        -394.82603238])}

This linear regression's error (MSE of about `394.38`) is quite a bit higher than our best baseline score (`371.22`). I wonder if changing the encoding of the categorical variables will improve the results.

## Test different categorical encodings

Plan:  
Test ordinal encoding for different combinations of variables, and also test one hot encoding for all of them

Candidates for ordinal encoding:
- jobType
- degree
- industry

I think `jobType` and `degree` are the variables with the strongest case for a natural ordering.
- The different levels of `jobType` can be interpreted as needing different levels of experience, qualifications, or responsibility on the job (especially between the levels 'junior', 'senior', and 'manager').
- Similarly with `degree` the natural ordering can be interpreted as the number of years of schooling. In this case it makes sense to order the levels as: `'none' < 'high school' < 'bachelors' < 'masters' < 'doctoral'`

As far as `industry` is concerned, it may be valid to argue that there is no real natural ordering between industries; can you really rank the health industry against the finance industry? But the average salaries of the levels of `industry` are more spread out and cover a wider range of values than those of `degree`. So I can use the data to guess at an order - might as well try it out.

I don't think it makes sense for `major` to be treated as ordinal here, because there isn't a clear natural ordering of the different majors. Additionally, there is no real data to give a direction for guessing/imposing an order on the levels of `major`: the average salaries of the subgroups of `major` are too close together, and the salary distributions of the levels of `major` overlap heavily. Even during EDA we saw that the top-paying salaries in different industries come from different majors, so it is hard to think of an ordering.
- I will only treat `major` as a nominal variable and use one hot encoding


**Preprocessing pipelines to test:**
1. single ordinal encoding  
    a. jobType  
    b. industry  
    c. degree  
2. double ordinal encoding  
    a. jobType & industry  
    b. industry & degree  
3. all 3 ordinal encoding
4. all one hot
5. first model (degree & jobType as ordinal)

In [37]:
# Candidate categorical encodings, keyed by a descriptive name.
# The dictionary makes it easy to refer back to a specific column transformer
# later; the evaluator classes take the same content as a list of
# (name, transformer) tuples, which is built right after the dictionary.
# NOTE(review): `make_categorical_encoding` is defined in src.model_utils; it is
# assumed to ordinal-encode `ord_cols` with the matching `category_levels`
# (same order) and one hot encode `oh_cols` - confirm against the helper.
categorical_encoding_pipelines = {
    # single ordinal encoding
    'jobType_ordinal': make_categorical_encoding(category_levels = [jobtype_ord_levels],
                                                 ord_cols = ['jobType'],
                                                 oh_cols = ['industry', 'degree', 'major']),
    'industry_ordinal': make_categorical_encoding(category_levels = [industry_ord_levels],
                                                  ord_cols = ['industry'],
                                                  oh_cols = ['jobType', 'degree', 'major']),
    'degree_ordinal': make_categorical_encoding(category_levels = [degree_ord_levels],
                                                ord_cols = ['degree'],
                                                oh_cols = ['industry', 'jobType', 'major']),
    
    # double ordinal encoding
    'jobType-industry_ordinal': make_categorical_encoding(category_levels = [jobtype_ord_levels, industry_ord_levels],
                                                          ord_cols = ['jobType', 'industry'],
                                                          oh_cols = ['degree', 'major']),
    'industry-degree_ordinal': make_categorical_encoding(category_levels = [industry_ord_levels, degree_ord_levels],
                                                         ord_cols = ['industry', 'degree'],
                                                         oh_cols = ['jobType', 'major']),
    
    # All 3 ordinal 
    'all_ordinal': make_categorical_encoding(category_levels = [jobtype_ord_levels, industry_ord_levels, degree_ord_levels],
                                             ord_cols = ['jobType', 'industry', 'degree'],
                                             oh_cols = ['major']),
    
    # All one hot
    'all_one_hot': make_categorical_encoding(category_levels = None, ord_cols = None,
                                             oh_cols = ['jobType', 'degree', 'industry', 'major']),
    
    # First model - basic lr (jobType & degree ordinal) 
    'basic_lr_(first model)': basic_lr_preprocessing
    
}

# (name, transformer) tuples in dictionary insertion order
categorical_encoding_list = list(categorical_encoding_pipelines.items())

In [8]:
# Compare every candidate encoding using the same estimator and scoring metric
find_best_categorical_encoding = EvaluatePreprocessors(
    preprocessors=categorical_encoding_list,
    estimator=LinearRegression(),
    scoring=SCORING_METRIC,
)

# Run the comparison on the training data
find_best_categorical_encoding.run(train_df, target)

::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
Best model found:
Pipeline(steps=[('all_one_hot',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('one_hot_encoding',
                                                  OneHotEncoder(),
                                                  ['jobType', 'degree',
                                                   'industry', 'major']),
                                                 ('std_scaler',
                                                  StandardScaler(),
                                                  ['yearsExperience',
                                                   'milesFromMetropolis'])])),
                ('estimator', LinearRegression())])

Model score (using 'neg_mean_squared_error')
-384.40224675582164



Unnamed: 0,test_score,train_score,fit_time,score_time
all_one_hot,-384.402247,-384.377823,1.412372,0.209958
degree_ordinal,-384.580781,-384.557766,1.410672,0.221447
industry_ordinal,-386.794762,-386.774068,1.485694,0.21585
industry-degree_ordinal,-386.973211,-386.953954,1.442635,0.215551
jobType_ordinal,-394.090058,-394.07396,1.441339,0.222244
basic_lr_(first model),-394.382901,-394.368092,1.51746,0.200868
jobType-industry_ordinal,-396.491995,-396.478983,1.471216,0.208557
all_ordinal,-396.784029,-396.772258,1.293513,0.19038


The categorical encoding that worked best is one hot encoding for all of the variables, with an MSE of `384.40`. That is still not quite as low as the best baseline score of `371.22`.

#### Next models to test 

SGDRegressor, random forest, xgboost

In [None]:
# Fixed seed so the stochastic estimators give the same scores on every run
RANDOM_STATE = 42

# Gradient boosting candidate on top of the all-one-hot preprocessing
# TODO: this pipeline is defined but not evaluated yet - only the random forest
# is cross-validated below
test_xgb = Pipeline([
    ('categorical_encoding', categorical_encoding_pipelines['all_one_hot']),
    ('xgb', XGBRegressor(random_state = RANDOM_STATE))
])

# Random forest candidate; n_jobs = -1 uses all available cores
test_rf = Pipeline([
    ('categorical_encoding', categorical_encoding_pipelines['all_one_hot']),
    ('rf', RandomForestRegressor(n_jobs = -1, random_state = RANDOM_STATE))
])

# 5-fold cross validation (cv made explicit to match the linear regression run);
# the result is kept in a variable so it can be compared with other models later
test_rf_scores = cross_validate(test_rf, train_df, target, cv = 5,
                                return_train_score = True, scoring = SCORING_METRIC)
test_rf_scores

#### after linear regression
- test out ridge/lasso with the best preprocessing
- try with feature selection with the best of ridge/lasso
- test against SGDRegressor
- the best model from the above test against gradient boosting and tree methods

- hyperparameter tuning
