Temporary notebook to check whether preprocessors and models can be saved and re-loaded.

In [16]:
import pandas as pd
import numpy as np
import re
import pickle
from joblib import dump, load

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor

from feature_eng import *
from drop import *
from create_apply_imputer import *

In [17]:
# Load the full training data from the zipped CSV (path is relative to this notebook)
df = pd.read_csv('../data/train_data.zip')

In [18]:
# Separate the predictors (X) from the target column `unacast_session_count` (y)
X = df.drop('unacast_session_count', axis=1)
y = df.loc[:, 'unacast_session_count']

In [19]:
# Hold out 20% of the rows for validation; the fixed `random_state` makes the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=0.2,
                                                      random_state=2020)

In [28]:
# Sanity check: both splits expose the same columns in the same order
X_train.columns.to_list() == X_valid.columns.to_list()

True

Check whether a fitted `ColumnTransformer` can be saved and re-loaded:

In [29]:
def create_save_transformer(X_train, filename='imputer.joblib'):

    """
    Fit a column-wise imputer on `X_train` and save it in
    .joblib format

    The imputer is a `ColumnTransformer` holding nine `SimpleImputer`s:
    one zero fill, one mean fill, and one constant fill for each of the
    seven vote-count columns. Columns that are not listed are passed
    through unchanged.

    Parameters
    ----------
    X_train: pd.DataFrame
        Training set. The columns to impute are looked up in this frame
        (by label ranges and by name), so it must contain all of them.

    filename: str
        Filename to use to save transformer

    Returns
    -------
    None
        The fitted imputer is written to `filename` as a side effect.

    """

    def columns_between(first, last):
        # Label-based slice: every column from `first` through `last`, inclusive,
        # in the order they appear in `X_train`
        return X_train.loc[:, first:last].columns.to_list()

    #======================================
    # IDENTIFY COLUMNS TO IMPUTE
    #======================================

    # The column lists are derived from `X_train` itself rather than from a
    # re-read of the raw CSV, so the function only depends on its arguments.

    # Impute with 0
    zero_imp_features = (
        columns_between('monthly_count_slide_single', 'monthly_count_climber')
        + columns_between('historic_number_of_sessions', 'historic_avg_mod_plus_vig')
        + columns_between('historic_hour_0', 'historic_hour_23')
        + columns_between('historic_count_bridge', 'historic_count_zipline')
        + columns_between('historic_cloudy', 'historic_snow')
        + columns_between('n', 'streets_per_node_proportion_7_osid')
        + ['days_since_first_sess', 'perfect_days', 'Green_2016', 'Number_of_holidays']
    )

    # Impute with mean
    mean_imp_features = (
        columns_between('weather_clear', 'avg_wind_12_above')
        + ['walk_score', 'bike_score', 'Poor_physical_health_days',
           'Poor_mental_health_days', 'Adult_smoking']
    )

    # Impute with a fixed value per column: (transformer name, column, fill value)
    # NOTE(review): the origin of these fill values is not shown in this notebook;
    # presumably they were precomputed from the training data -- TODO confirm.
    constant_fills = [
        ('rep_08_votes', 'Republican_08_Votes', 193841),
        ('dem_08_votes', 'Democrats_08_Votes', 123594),
        ('rep_12_votes', 'Republican_12_Votes', 164676),
        ('dem_12_votes', 'Democrats_12_Votes', 122640),
        ('rep_2016', 'Republicans_2016', 163387),
        ('dem_2016', 'Democrats_2016', 116454),
        ('lib_2016', 'Libertarians_2016', 18725),
    ]

    #======================================
    # CREATE TRANSFORMERS
    #======================================

    transformers = [
        ('zero', SimpleImputer(strategy='constant', fill_value=0), zero_imp_features),
        ('mean', SimpleImputer(strategy='mean'), mean_imp_features),
    ]
    transformers += [
        (name, SimpleImputer(strategy='constant', fill_value=fill_value), [column])
        for name, column, fill_value in constant_fills
    ]

    #======================================
    # PUT IT ALL TOGETHER
    #======================================

    imputer = ColumnTransformer(
        transformers=transformers,
        remainder='passthrough'
    )

    # Check that unspecified columns are passed through
    assert imputer.remainder == 'passthrough'

    # Check that the output is comprised of 9 transformers
    assert len(imputer.transformers) == 9

    #======================================
    # SAVE IMPUTER FOR FUTURE USE
    #======================================

    # Fit the column transformer on X_train
    imputer = imputer.fit(X_train)

    # Save the imputer
    dump(imputer, filename)

    return None

In [30]:
# Check that saving and loading works:
# fit + save the imputer to 'imputer.joblib', then read it back from disk.
# The cell output is the type of the re-loaded object.
create_save_transformer(X_train)

imputer = load('imputer.joblib')
type(imputer)

sklearn.compose._column_transformer.ColumnTransformer

In [31]:
# Apply the re-loaded imputer to both splits.
# The results are displayed in the next cells as plain arrays (dtype=object), without column names.
test_X_train = imputer.transform(X_train)
test_X_valid = imputer.transform(X_valid)

In [32]:
test_X_train

array([[0.0, 0.0, 0.0, ..., 0.2340273398691753, 0.04053192196513618,
        0.1934954179040392],
       [0.0, 0.0, 0.0, ..., 0.08288097777002484, 0.02242467383750538,
        0.06045630393251946],
       [0.0, 0.0, 0.0, ..., 0.13293507506486346, 0.025297853190052573,
        0.10763722187481088],
       ...,
       [0.0, 0.0, 0.0, ..., 0.1203213862874529, 0.010353760723929736,
        0.10996762556352316],
       [0.0, 0.0, 0.0, ..., 0.10379939989472503, 0.016168447536394917,
        0.08763095235833011],
       [0.0, 0.0, 0.0, ..., 0.0947045465862796, 0.008126445714095074,
        0.08657810087218455]], dtype=object)

In [33]:
test_X_valid

array([[0.0, 0.0, 0.0, ..., 0.15797098578774466, 0.031497880607090335,
        0.12647310518065427],
       [0.0, 0.0, 0.0, ..., 0.0734567783353103, 0.02840962017927957,
        0.04504715815603074],
       [0.0, 0.0, 0.0, ..., 0.15747500204282536, 0.009782712550551279,
        0.14769228949227406],
       ...,
       [0.0, 0.0, 0.0, ..., 0.10605070295206934, 0.011513615543877859,
        0.09453708740819147],
       [7.0, 5.0, 3.0, ..., 0.1120380067164298, 0.006893412487466378,
        0.1051445942289634],
       [0.0, 0.0, 0.0, ..., 0.13220730170316675, 0.018518862038046043,
        0.11368843966512072]], dtype=object)

In [34]:
def impute_data(X, filename='imputer.joblib'):
    """
    Load a transformer fit on `X_train`.
    Return the imputed dataframe.

    The transformer returns a bare array whose columns are ordered
    transformer by transformer (passed-through columns last). This
    function rebuilds a data frame from that array with the same index,
    column order and dtypes as `X`.

    Parameters
    ----------
    X: pd.DataFrame
        `X_train`, `X_valid` or `X_test`
    
    filename: str
        Filename of fitted imputer 
    
    Returns
    -------
    pd.DataFrame

    Raises
    ------
    AssertionError
        If the row count changes or the first column of the result
        is not `external_id`.
    
    """

    # load in transformer that's fit on `X_train`
    # NOTE: joblib files are pickle-based; only load files from a trusted source.
    imputer = load(filename)

    # Transform data frame accordingly
    imputed_values = imputer.transform(X)

    # Rebuild the output column names in the order the transformer emits them
    cols = []
    for name, transformer, columns in imputer.transformers_:
        # Dropped column groups contribute nothing to the output
        if isinstance(transformer, str) and transformer == 'drop':
            continue

        if name == 'remainder':
            # Passed-through columns: depending on the scikit-learn version these
            # are stored as integer positions or as column names; accept both
            cols += [c if isinstance(c, str) else X.columns[c] for c in columns]
        else:
            # Imputed features are listed by name
            cols += list(columns)

    # Grab old order of columns
    old_cols = X.columns.to_list()

    # Reshuffle column order of new dataframes to match old one
    imputed_X = pd.DataFrame(imputed_values, index=X.index, columns=cols).reindex(columns=old_cols)

    # Cast each pandas object to its previous dtype
    imputed_X = imputed_X.astype(X.dtypes.to_dict())

    # Check that the number of rows is unchanged
    assert imputed_X.shape[0] == X.shape[0]

    # Check that the first column of `X_train` is `external_id`
    assert imputed_X.columns[0] == 'external_id'

    return imputed_X

In [35]:
# Impute both splits with the imputer saved in 'imputer.joblib' (the default filename)
imputed_X_train = impute_data(X_train)
imputed_X_valid = impute_data(X_valid)

In [36]:
imputed_X_train.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_8_9,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
49288,FM00168297,2,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,211.42,0.234027,0.040532,0.193495
47981,FM00160322,2,2019,0,0,0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,239.22,0.082881,0.022425,0.060456
506,MR00700681,8,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,151.43,0.132935,0.025298,0.107637
27843,1806280,7,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,296.13,0.271691,0.015658,0.256033
30294,MR00112552,10,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,258.93,0.082201,0.024017,0.058184


In [37]:
imputed_X_valid.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_8_9,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
45913,FM00167636,5,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.0,694.59,0.157971,0.031498,0.126473
5319,FM00161313,9,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,307.3,0.073457,0.02841,0.045047
42344,1806831,9,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,177.48,0.157475,0.009783,0.147692
22466,1900178,6,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,231.03,0.082201,0.024017,0.058184
14622,MR00112752,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,781.24,0.271691,0.015658,0.256033


Imputer can be saved and re-loaded ✅

In [38]:
# perform feature engineering
# NOTE(review): `comb_cols` is not defined in this notebook; it comes from one of the
# wildcard imports at the top (presumably `feature_eng`) -- TODO confirm.
# This rebinds `X_train` / `X_valid` to the engineered frames.
X_train = comb_cols(imputed_X_train)
X_valid = comb_cols(imputed_X_valid)

In [39]:
# perform feature selection
# NOTE(review): `drop_columns` is not defined in this notebook; it comes from one of the
# wildcard imports at the top (presumably `drop`) -- TODO confirm.
# This cell overwrites its own inputs, so re-running it applies `drop_columns` a second time.
X_train = drop_columns(X_train)
X_valid = drop_columns(X_valid)

In [40]:
# check the number of categorical columns to OHE:
# count columns per dtype; the `object` count is taken as the number of categorical columns
X_train.dtypes.value_counts()

int64      422
float64    201
object       3
dtype: int64

Check whether a fitted one-hot encoder (`OHE`) can be saved and re-loaded:

In [41]:
def create_save_ohe(X_train, to_encode=['income_class', 'density_class', 'climate'], filename='ohe.joblib'):
    
    """
    Fit a one-hot-encoder on the `to_encode` columns of `X_train`
    and save it in .joblib format.
    
    Parameters
    ----------
    X_train: pd.DataFrame
        Training set
        
    to_encode: list 
        The list of the categorical variables we want to encode
        
    filename: str
        Filename to use to save encoder
    
    Returns
    -------
    None
        The fitted encoder is written to `filename` as a side effect.
    """
    # The dense-output keyword was renamed from `sparse` to `sparse_output` in
    # newer scikit-learn releases; try the new name first and fall back to the
    # old one so the function works with either.
    try:
        ohe = OneHotEncoder(sparse_output=False, dtype=int)
    except TypeError:
        ohe = OneHotEncoder(sparse=False, dtype=int)
    
    ohe.fit(X_train.loc[:, to_encode])
    
    # Save the OHE
    dump(ohe, filename)
    
    return None

In [42]:
# Check that saving and loading works:
# fit + save the encoder to 'ohe.joblib', then read it back from disk.
# The cell output is the type of the re-loaded object.
create_save_ohe(X_train)
ohe = load('ohe.joblib')
type(ohe)

sklearn.preprocessing._encoders.OneHotEncoder

In [19]:
def ohe_data(X, to_encode=['income_class', 'density_class', 'climate'], filename='ohe.joblib'):
    """
    One-hot encode the `to_encode` columns of `X` with an encoder
    that was previously fit on `X_train` and saved to `filename`.

    The encoded columns are named after the category values themselves
    and are appended to the right of the remaining columns; the original
    `to_encode` columns are removed.

    WARNING: `to_encode` must match list passed to create `ohe`

    Parameters
    ----------
    X: pd.DataFrame
        `X_train`, `X_valid` or `X_test`

    to_encode: list
        List of categorical variables to encode

    filename: str
        Filename of fitted encoder

    Returns
    -------
    pd.DataFrame
        A new frame; `X` itself is not modified.

    """
    encoder = load(filename)

    # Work on a copy so the caller's frame is left untouched
    base = X.copy()

    # Encode the categorical columns
    encoded_values = encoder.transform(base.loc[:, to_encode])

    # One output column per category, in the encoder's own ordering
    category_labels = np.concatenate(encoder.categories_).ravel()

    # Wrap the encoded values in a frame that keeps the original row index
    encoded = pd.DataFrame(encoded_values, index=X.index, columns=category_labels)

    # Append the encoded columns, then remove the columns they replace
    result = pd.concat((base, encoded), axis=1).drop(columns=to_encode)

    # The row count must be unchanged
    assert result.shape[0] == base.shape[0]

    # `income_class` must no longer be present
    assert 'income_class' not in result.columns.to_list()

    return result

In [20]:
# One-hot encode both splits with the encoder saved in 'ohe.joblib' (the default filename).
# This cell overwrites its own inputs; a second run would fail because the encoded
# source columns have already been dropped.
X_train = ohe_data(X_train)
X_valid = ohe_data(X_valid)

In [21]:
X_train.head()

Unnamed: 0,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,monthly_count_ramp,...,avg_fertility_rate,HI,LI,MI,HD,LD,MD,A,C,D
49288,2,2018,0,0,0,0.0,0.0,0.0,0.0,0,...,63.82875,1,0,0,0,1,0,0,0,1
47981,2,2019,0,0,0,0.0,0.0,0.0,0.0,0,...,55.431875,0,0,1,1,0,0,0,1,0
506,8,2019,0,0,0,0.0,0.0,0.0,0.0,0,...,66.70625,0,0,1,1,0,0,0,1,0
27843,7,2019,0,0,0,0.0,0.0,0.0,0.0,0,...,73.296875,0,0,1,0,0,1,1,0,0
30294,10,2018,0,0,0,0.0,0.0,0.0,0.0,0,...,54.97,1,0,0,0,1,0,0,1,0


In [22]:
X_valid.head()

Unnamed: 0,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,monthly_count_ramp,...,avg_fertility_rate,HI,LI,MI,HD,LD,MD,A,C,D
45913,5,2018,0,0,0,0.0,0.0,0.0,0.0,0,...,46.800625,0,0,1,1,0,0,0,1,0
5319,9,2019,0,0,0,0.0,0.0,0.0,0.0,0,...,54.676875,1,0,0,0,1,0,0,1,0
42344,9,2019,0,0,0,0.0,0.0,0.0,0.0,0,...,62.876875,0,0,1,0,1,0,0,1,0
22466,6,2018,0,0,0,0.0,0.0,0.0,0.0,0,...,54.876875,1,0,0,1,0,0,0,1,0
14622,9,2018,0,0,0,0.0,0.0,0.0,0.0,0,...,73.296875,1,0,0,0,1,0,1,0,0


Check that an `XGBoost` model can be saved and re-loaded:

In [23]:
# Fit an XGBoost regressor with default hyperparameters on the processed training data
xgb_test = XGBRegressor()
xgb_test.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [24]:
# Persist the fitted model to disk with joblib
dump(xgb_test, 'xgb_test.joblib')

['xgb_test.joblib']

In [25]:
# Reload the model from disk and predict on the validation set.
# NOTE(review): every prediction visible in the output below is 0.5, the same value as
# the `base_score` shown in the fitted model's repr -- confirm the model is not just
# returning a constant before relying on it.
loaded_xgb = load('xgb_test.joblib')
loaded_xgb.predict(X_valid)

array([0.5, 0.5, 0.5, ..., 0.5, 0.5, 0.5], dtype=float32)

# Summary

Imputer, encoder, and regression model can be saved and re-loaded using `joblib`!