# Model improvement

The best model, using xgboost and no feature engineering, has been deployed with our web app. 

But thinking about the best baseline model performance and the calculations it used, has given some inspiration to engineer new features.

#### Plan

Implement a function transformer to fit in our sklearn pipeline.
    - Perform the same calculations used during baseline model fitting, and add those new features to the dataset

In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from src.eda_utils import salary_per_category_table

from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

from sklearn.base import BaseEstimator, TransformerMixin

# Load data and best model

In [2]:
train_df = pd.read_csv('../data/processed/training_data.csv', index_col = 0)
target = train_df.salary
train_df.drop(columns = 'salary', inplace=True)

with open('../models/salary_prediction_xgboost_v1.pkl', 'rb') as file:
    best_model = pickle.load(file)

In [6]:
cross_validate(best_model, train_df, target, return_train_score=True, scoring='neg_mean_squared_error')

{'fit_time': array([10.63006973, 10.67477298, 10.49173927, 10.50003529, 12.06633186]),
 'score_time': array([0.40099883, 0.4429996 , 0.40099931, 0.45800042, 0.454     ]),
 'test_score': array([-354.24281216, -356.5772109 , -355.26574855, -356.41306827,
        -353.69040894]),
 'train_score': array([-352.63938146, -351.9898229 , -352.15599984, -352.28042562,
        -352.54611842])}

In [174]:
ordinal_no_scaling_pp = ColumnTransformer([('ordinal encoding', OrdinalEncoder(), ['jobType', 'degree', 'industry', 'major'])], remainder = 'passthrough')
ordinal_no_scaling_pp

ColumnTransformer(remainder='passthrough',
                  transformers=[('ordinal encoding', OrdinalEncoder(),
                                 ['jobType', 'degree', 'industry', 'major'])])

In [258]:
class GroupedAverages(BaseEstimator, TransformerMixin):
    def __init__(self, category_vars, numeric_vars=[], add_diff = True, extra_stats = True, target_col = 'salary'):
        self.category_vars = category_vars
        self.numeric_vars = numeric_vars
        self.add_diff = add_diff
        self.extra_stats = extra_stats
        self.target_col = target_col
        
        if self.numeric_vars:
            self.fitted_numeric_values = {column:None for column in self.numeric_vars}
        
    def fit(self, X, y):
        data = X.join(y)
        
        if self.extra_stats:
            self.averages = (
                data
                .groupby(self.category_vars)[self.target_col]
                .agg(['mean', 'median', 'min', 'max', 'std'])
                .rename(lambda x: f'group_{x}', axis = 1)
            )
            
        else:
            self.averages = salary_per_category_table(self.category_vars, data).set_index(self.category_vars)
            self.averages.rename(columns={self.target_col: 'grouped_avg'}, inplace=True)
        
        if self.numeric_vars:
            overall_average = y.mean()
            for col in self.fitted_numeric_values.keys():
                diff_from_average = data.groupby(col)[self.target_col].mean() - overall_average
                self.fitted_numeric_values[col] = diff_from_average.rename(f'{col}_diff')
        
        return self

    def transform(self, X):
        data = X.copy()
        
        data = data.join(self.averages, on = self.category_vars)
        
        if self.numeric_vars:
            for col in self.fitted_numeric_values.keys():
                data = data.join(self.fitted_numeric_values[col], on = col)
                
        if self.add_diff:
            diff_cols = [col for col in data.columns if col.endswith('_diff')]
            data['combined_diff'] = data[diff_cols].sum(axis = 1)
        
        # debug drop all diff cols to see if i can get under 320
        drop_these = [col for col in data.columns if col.endswith('_diff')]
        data.drop(columns=drop_these, inplace=True)
        
        return data

In [259]:
testClass = GroupedAverages(['jobType', 'industry', 'degree', 'major'], numeric_vars=['yearsExperience', 'milesFromMetropolis'])

In [260]:
testClass.fit_transform(train_df, target)

Unnamed: 0,jobType,degree,major,industry,yearsExperience,milesFromMetropolis,group_mean,group_median,group_min,group_max,group_std
58414,JUNIOR,HIGH_SCHOOL,NONE,WEB,14,84,90.407844,87.0,39,182,23.428463
902618,SENIOR,MASTERS,COMPSCI,SERVICE,24,86,107.146907,105.0,55,175,25.075113
778824,JANITOR,NONE,NONE,WEB,4,86,75.117009,71.0,32,169,22.486631
187593,SENIOR,HIGH_SCHOOL,NONE,WEB,0,42,100.592676,97.0,50,198,24.382293
574438,VICE_PRESIDENT,MASTERS,BUSINESS,AUTO,16,97,127.309045,127.0,69,220,24.930583
...,...,...,...,...,...,...,...,...,...,...,...
259179,VICE_PRESIDENT,DOCTORAL,BUSINESS,OIL,9,93,160.070218,157.0,94,260,31.318012
365839,CFO,DOCTORAL,BUSINESS,HEALTH,15,88,146.112219,143.0,81,240,28.686144
131933,SENIOR,MASTERS,COMPSCI,HEALTH,18,42,113.183938,109.0,64,207,25.018951
671158,SENIOR,DOCTORAL,PHYSICS,SERVICE,5,23,105.412621,102.0,60,199,24.822575


In [265]:
test_pipeline = Pipeline(
    [
        ('feature_engineer', GroupedAverages(['jobType', 'industry', 'degree', 'major'], numeric_vars=['yearsExperience', 'milesFromMetropolis'])),
        ('categorical_encoding', ordinal_no_scaling_pp),
        ('gbr', GradientBoostingRegressor(n_estimators=40, max_depth=7))
    ]
)

In [256]:
test_pipeline.set_params(**{'xgb__gamma': 0.0001, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 6, 'xgb__min_child_weight': 200, 'xgb__n_estimators': 275})

Pipeline(steps=[('feature_engineer',
                 GroupedAverages(category_vars=['jobType', 'industry', 'degree',
                                                'major'],
                                 numeric_vars=['yearsExperience',
                                               'milesFromMetropolis'])),
                ('categorical_encoding',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ordinal encoding',
                                                  OrdinalEncoder(),
                                                  ['jobType', 'degree',
                                                   'industry', 'major'])])),
                ('xgb',
                 XGBRegressor(base_sc...
                              importance_type='gain',
                              interaction_constraints=None, learning_rate=0.1,
                              max_delta_step=None, max_depth=6,
                              min_c

In [266]:
cross_validate(test_pipeline, train_df, target, scoring='neg_mean_squared_error', return_train_score=True)

{'fit_time': array([114.14201069, 117.21793866, 114.68600011, 116.53899026,
        114.57807064]),
 'score_time': array([0.65200233, 0.65799832, 0.65300155, 0.65400362, 0.66200542]),
 'test_score': array([-355.54642579, -358.20509329, -356.89621123, -357.5305354 ,
        -355.57838076]),
 'train_score': array([-351.96536259, -351.3042918 , -351.54127866, -351.38599988,
        -351.93647829])}

In [2]:
test_score = np.mean([-355.54642579, -358.20509329, -356.89621123, -357.5305354, -355.57838076])
print(f'Mean test score = {test_score}')

Mean test score = -356.751329294
