# Python Basecamp

### Karthikeyan Sankaran, 17th June, 2018

### End to End ML Pipeline [BigMart Sales]: Notebook 4 - Running Algorithms & Predict on Test Set

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

import pandas as pd
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows',None)

import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline,FeatureUnion

from sklearn.feature_selection import SelectKBest

from sklearn.base import TransformerMixin, BaseEstimator
from generic_transformers import *
from utility_functions import (Find_Optimal_Cutoff,_get_feature_importances)

from category_encoders import (BackwardDifferenceEncoder,BinaryEncoder,HashingEncoder,HelmertEncoder,
                               OneHotEncoder,OrdinalEncoder,SumEncoder,PolynomialEncoder)

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_squared_error as mse

from pprint import pprint
import time

In [2]:
def rmse(act_y, pred_y):
    return np.sqrt(mse(act_y, pred_y))

In [3]:
# Driver parameters

#1. Scoring
#scoring = 'neg_mean_squared_error'
#scoring = 'neg_mean_absolute_error'
#scoring = 'neg_median_absolute_error'
scoring = 'r2'

#2. Folds
cv_indicator = 'kfold' # can be - 'shuffle_split','kfold','time', 'rkfold', 'loo', 'skfold', 's_shuffle_split'
n_splits=5
seed = 10

#3. Tuning
tuning_required = "Yes"
search = 'grid_search'

#4. Target
target = 'Item_Outlet_Sales'

In [4]:
%store -r train_feats
%store -r test_feats
%store -r target_df

In [5]:
train_df = train_feats.copy()
test_df = test_feats.copy()

In [6]:
print (train_df.shape, test_df.shape, target_df.shape)

(8523, 62) (5681, 62) (8523, 4)


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8523 entries, 0 to 8522
Data columns (total 62 columns):
Item_Fat_Content                                 8523 non-null object
Item_Identifier                                  8523 non-null object
Item_MRP                                         8523 non-null float64
Item_Outlet_Sales                                8523 non-null float64
Item_Type                                        8523 non-null object
Item_Visibility                                  8523 non-null float64
Item_Weight                                      7060 non-null float64
Outlet_Establishment_Year                        8523 non-null int64
Outlet_Identifier                                8523 non-null object
Outlet_Location_Type                             8523 non-null object
Outlet_Size                                      6113 non-null object
Outlet_Type                                      8523 non-null object
source                                           8

** Pipeline - Level 0 - For evaluating multiple feature sets & algorithms **

In [8]:
model_regressors = [
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    XGBRegressor(),
    LinearRegression(),
    KNeighborsRegressor()
    ]

In [9]:
features = []

feature_set_0 = []  # Just to ensure that feature set 1 starts with index 1 - for convenience
feature_set_1 = ['Item_Identifier_mean_enc_fold','Outlet_Identifier_mean_enc_fold','Item_MRP_clipped_tf','PriceLevels_lb_enc']
feature_set_2 = ['Item_Identifier_mean_enc_fold','Outlet_Identifier_mean_enc_fold','Item_MRP_clipped','Item_Identifier_freq']

features.append(feature_set_0)
features.append(feature_set_1)
features.append(feature_set_2)

In [10]:
if 1==1:
    model_pipeline = Pipeline([
        ('reg', model_regressors[0])
    ])

In [11]:
if 1==1:
    
    if cv_indicator == "kfold":
        cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    elif cv_indicator == "skfold":
        cv = StratifiedKFold(n_splits=3,shuffle=False,random_state=seed)
    elif cv_indicator == 'shuffle_split':
        cv = ShuffleSplit(n_splits=3, test_size=0.2, random_state=seed)
    elif cv_indicator == 's_shuffle_split':
        cv = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=seed)
    elif cv_indicator == "time":
        cv = TimeSeriesSplit(n_splits=3)
    
    for i in range(1,(len(features))):
        X = train_df[features[i]]
        y = target_df[target]
    
        for reg in model_regressors:
            print("FEATURE SET:%s" %i)
            print("Cross-Validation Type:")
            print(cv_indicator)
            model_pipeline.set_params(reg=reg)
            scores = cross_val_score(model_pipeline, X, y, scoring=scoring, cv=cv)
            print('----------------------')
            print(features[i])
            print('----------------------')
            print(str(reg))
            print('----------------------')
            print(scores)
            print(scores.mean())
            print('----------------------')

FEATURE SET:1
Cross-Validation Type:
kfold
----------------------
['Item_Identifier_mean_enc_fold', 'Outlet_Identifier_mean_enc_fold', 'Item_MRP_clipped_tf', 'PriceLevels_lb_enc']
----------------------
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
----------------------
[0.52891079 0.51530758 0.58376602 0.54510717 0.52931285]
0.5404808806294407
----------------------
FEATURE SET:1
Cross-Validation Type:
kfold
----------------------
['Item_Identifier_mean_enc_fold', 'Outlet_Identifier_mean_enc_fold', 'Item_MRP_clipped_tf', 'PriceLevels_lb_enc']
----------------------
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls'

** Pipeline - Level 1: Evaluating Specific Algos & Datasets before Hyperparameter Tuning **

In [12]:
algos_selected_for_tuning = ['GBM','XGB']
algo_features_dict = {"GBM": [1],"XGB":[2]}

In [13]:
for algo,feat_sets in algo_features_dict.items():
    for i in range(0,len(feat_sets)):
        print(algo, features[feat_sets[i]])

GBM ['Item_Identifier_mean_enc_fold', 'Outlet_Identifier_mean_enc_fold', 'Item_MRP_clipped_tf', 'PriceLevels_lb_enc']
XGB ['Item_Identifier_mean_enc_fold', 'Outlet_Identifier_mean_enc_fold', 'Item_MRP_clipped', 'Item_Identifier_freq']


In [14]:
if 1 == 1:
    
    ## Creating the param grid & the model
    pipelines_to_tune = []
    param_grid_to_tune = []
   
    # 1. GBM
    if 'GBM' in algos_selected_for_tuning:
        gbm_pipeline = Pipeline([
            ('reg', GradientBoostingRegressor())
        ])
        pipelines_to_tune.append(gbm_pipeline)

    # 2. XGB
    if 'XGB' in algos_selected_for_tuning:
        xgb_pipeline = Pipeline([
            ('reg', XGBRegressor())
        ])
        pipelines_to_tune.append(xgb_pipeline)
        
    # 3. RF
    if 'RF' in algos_selected_for_tuning:
        rf_pipeline = Pipeline([
            ('features', tree_all_features_pipeline),
            ('reg', RandomForestRegressor())
        ])
        pipelines_to_tune.append(rf_pipeline)
    
    # 4. Catboost
    if 'CAT' in algos_selected_for_tuning:
        cat_pipeline = Pipeline([
            ('reg', CatBoostRegressor())
        ])
        pipelines_to_tune.append(cat_pipeline)
        
    # 5. Linear Regression
    if 'LR' in algos_selected_for_tuning:
        lr_pipeline = Pipeline([
            ('reg', LinearRegression())
        ])
        pipelines_to_tune.append(lr_pipeline)     
    
    ## Get the parameters for models
    for i in range(0,len(pipelines_to_tune)):
        pipe_to_tune = pipelines_to_tune[i]
        #param_grid = param_grid_to_tune[i]
        print("------------------------")
        print("Algorithm: ",algos_selected_for_tuning[i])
        # Get all hyper-parameters for the pipeline object
        print(sorted(pipe_to_tune.get_params().keys()))
        print("------------------------")

        feat_sets = algo_features_dict[algo]   # Get the feature sets for each algo from dict
        
        for j in range(0,len(feat_sets)):      # Traverse through the feature sets
            X = train_df[features[feat_sets[j]]]
            y = target_df[target]
            
            print("------------------------")
            print(features[feat_sets[j]])
            print("------------------------")
            
            # Creating the train-test split
            X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2,random_state=17)
        
            # use the pipeline object as you would a regular regressor
            pipe_to_tune.fit(X_train,y_train)
            print(pipe_to_tune)
            print("------------------------")

            y_preds = pipe_to_tune.predict(X_val)
            print("Performance before hyperparameter tuning: %0.2f" %rmse(y_val,y_preds))

            model_for_feature_importance = pipe_to_tune.named_steps['reg']
        
            if (algos_selected_for_tuning[i] == "XGB"):
                importances_xgb = model_for_feature_importance.booster().get_score(importance_type='weight')
                print("Feature ranking:")
                print(sorted(((value,key) for (key,value) in importances_xgb.items()),reverse=True))
                print("------------------------")
            else:
                importances = _get_feature_importances(model_for_feature_importance)
                indices = np.argsort(importances)[::-1]
                print(importances,indices)

                # Print the feature ranking
                print("Feature ranking:")
                print(X.transpose().index[indices])
                print("------------------------")

------------------------
Algorithm:  GBM
['reg', 'reg__alpha', 'reg__criterion', 'reg__init', 'reg__learning_rate', 'reg__loss', 'reg__max_depth', 'reg__max_features', 'reg__max_leaf_nodes', 'reg__min_impurity_split', 'reg__min_samples_leaf', 'reg__min_samples_split', 'reg__min_weight_fraction_leaf', 'reg__n_estimators', 'reg__presort', 'reg__random_state', 'reg__subsample', 'reg__verbose', 'reg__warm_start', 'steps']
------------------------
------------------------
['Item_Identifier_mean_enc_fold', 'Outlet_Identifier_mean_enc_fold', 'Item_MRP_clipped', 'Item_Identifier_freq']
------------------------
Pipeline(steps=[('reg', GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=

** Pipeline - Level 2: Hyperparameter Tuning for selected algos & datasets **

In [15]:
## Creating the param grid & the model
pipelines_to_tune = []
param_grid_to_tune = []
best_models = []
best_pipelines = []
final_feature_set = []
algos = []

if tuning_required == "Yes":
   
    # 1. GBM
    if 'GBM' in algos_selected_for_tuning:
        gbm_pipeline = Pipeline([
            ('reg', GradientBoostingRegressor())
        ])
        
        param_grid_gbm = { 
                 'reg__n_estimators': [50],
                 'reg__max_depth': [2,4],
                 'reg__learning_rate':[0.01,0.1],
              }
        pipelines_to_tune.append(gbm_pipeline)
        param_grid_to_tune.append(param_grid_gbm)

    # 2. XGB
    if 'XGB' in algos_selected_for_tuning:
        xgb_pipeline = Pipeline([
            ('reg', XGBRegressor())
        ])
        param_grid_xgb = {
             'reg__n_estimators': [100],
             'reg__max_depth': [2,4],
             'reg__learning_rate':[0.1],
          }
        pipelines_to_tune.append(xgb_pipeline)
        param_grid_to_tune.append(param_grid_xgb)
        
    # 3. RF
    if 'RF' in algos_selected_for_tuning:
        rf_pipeline = Pipeline([
            ('reg', RandomForestRegressor())
        ])
        param_grid_rf = {
             'reg__n_estimators': [50,100,150],
             'reg__max_depth': [2,4],
          }
        pipelines_to_tune.append(rf_pipeline)
        param_grid_to_tune.append(param_grid_rf)
    
    ## Get the parameters for models
    for i in range(0,len(pipelines_to_tune)):
        pipe_to_tune = pipelines_to_tune[i]
        param_grid = param_grid_to_tune[i]
        algo = algos_selected_for_tuning[i]
        print("Algorithm: ",algo)
        
        if cv_indicator == "kfold":
            cv = KFold(n_splits=5, shuffle=True, random_state=seed)
        elif cv_indicator == "skfold":
            cv = StratifiedKFold(n_splits=3,shuffle=False,random_state=seed)
        elif cv_indicator == 'shuffle_split':
            cv = ShuffleSplit(n_splits=3, test_size=0.2, random_state=seed)
        elif cv_indicator == 's_shuffle_split':
            cv = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=seed)
        elif cv_indicator == "time":
            cv = TimeSeriesSplit(n_splits=3)
            
        if search == "grid_search":
            ## Grid Search
            print("[INFO] tuning hyperparameters via grid search")
            #grid = GridSearchCV(estimator=model_to_tune, param_grid=param_grid, scoring=scoring, cv=cv)
            grid = GridSearchCV(estimator=pipe_to_tune, param_grid=param_grid,scoring=scoring,cv=cv)
        elif search == "random_search":
            ## Random Search
            print("[INFO] tuning hyperparameters via random search")
            #grid = RandomizedSearchCV(estimator=model_to_tune, param_distributions=param_grid, scoring=scoring, cv=cv)
            grid = RandomizedSearchCV(estimator=pipe_to_tune, param_distributions=param_grid, scoring=scoring, cv=cv)

        print("pipeline:", [name for name, _ in pipe_to_tune.steps])
        print("parameters:")
        pprint(param_grid)
        start = time.time()
        
        feat_sets = algo_features_dict[algo]   # Get the feature sets for each algo from dict
        
        for j in range(0,len(feat_sets)):      # Traverse through the feature sets
            X = train_df[features[feat_sets[j]]]
            y = target_df[target]
            
            print("------------------------")
            print(features[feat_sets[j]])
            print("------------------------")
        
            # Appending feature set for each algo
            final_feature_set.append(features[feat_sets[j]])  # To be used in finalizing the pipelines
            algos.append(algo)                                # To be used in finalizing the pipelines
            
            grid_result = grid.fit(X, y)  # should it be X_train, Y_train or X,y??
            print("done in %0.3fs" % (time.time() - start))
            print()

            print("Best score: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
            print("Best parameters set:")
            best_parameters = grid_result.best_estimator_.get_params()
            
            # print(grid_result.cv_results_)  # Need to use cv_results_ as grid_scores_ is deprecated in 0.20
            
            for params, mean_score, scores in grid_result.grid_scores_:
                print("%f (%f) with: %r" % (scores.mean(), scores.std(), params))
            print("[INFO] search took {:.2f} seconds".format(time.time() - start))

            #grid_result.refit
            best_estimator = grid_result.best_estimator_
            best_pipelines.append(best_estimator)
            best_models.append(best_estimator.named_steps['reg'])
            #preds = best_estimator.predict(X_val)
            #print("Performance after hyperparameter tuning: %0.2f" %rmse(y_val,preds))
            print("----------------------------------------")

Algorithm:  GBM
[INFO] tuning hyperparameters via grid search
pipeline: ['reg']
parameters:
{'reg__learning_rate': [0.01, 0.1],
 'reg__max_depth': [2, 4],
 'reg__n_estimators': [50]}
------------------------
['Item_Identifier_mean_enc_fold', 'Outlet_Identifier_mean_enc_fold', 'Item_MRP_clipped_tf', 'PriceLevels_lb_enc']
------------------------
done in 3.893s

Best score: 0.634595 using {'reg__n_estimators': 50, 'reg__learning_rate': 0.1, 'reg__max_depth': 4}
Best parameters set:
0.290492 (0.008921) with: {'reg__n_estimators': 50, 'reg__learning_rate': 0.01, 'reg__max_depth': 2}
0.381806 (0.007949) with: {'reg__n_estimators': 50, 'reg__learning_rate': 0.01, 'reg__max_depth': 4}
0.630830 (0.012885) with: {'reg__n_estimators': 50, 'reg__learning_rate': 0.1, 'reg__max_depth': 2}
0.634594 (0.016912) with: {'reg__n_estimators': 50, 'reg__learning_rate': 0.1, 'reg__max_depth': 4}
[INFO] search took 3.89 seconds
----------------------------------------
Algorithm:  XGB
[INFO] tuning hyperparam



done in 2.132s

Best score: 0.634200 using {'reg__n_estimators': 100, 'reg__learning_rate': 0.1, 'reg__max_depth': 2}
Best parameters set:
0.634198 (0.015490) with: {'reg__n_estimators': 100, 'reg__learning_rate': 0.1, 'reg__max_depth': 2}
0.628068 (0.018006) with: {'reg__n_estimators': 100, 'reg__learning_rate': 0.1, 'reg__max_depth': 4}
[INFO] search took 2.13 seconds
----------------------------------------




** Setting the final pipelines and fit to entire training set **

In [16]:
final_pipelines=[]

for i in range(0,len(best_pipelines)):
    pipe_for_final_prediction = best_pipelines[i]
    feature_set_for_final_prediction = final_feature_set[i]
    
    X = train_df[feature_set_for_final_prediction]
    y = target_df[target]
    
    print(pipe_for_final_prediction)
    pipe_for_final_prediction.fit(X,y)
    final_pipelines.append(pipe_for_final_prediction)

Pipeline(steps=[('reg', GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=4, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=50, presort='auto',
             random_state=None, subsample=1.0, verbose=0, warm_start=False))])
Pipeline(steps=[('reg', XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=2,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])


** Predict on Test Set **

In [17]:
# Creating the final predictions from each algo
test_predictions = pd.DataFrame()
#for i in range(0,len(algos_selected_for_tuning)):
for i in range(0,len(final_pipelines)):
    colname_preds = 'preds' + '_' + algos[i] + '_' + str(i)
    feature_set_for_final_prediction = final_feature_set[i]
    
    #preds = np.exp(final_pipelines[i].predict(X_test))
    X_test = test_df[feature_set_for_final_prediction]
    
    preds = final_pipelines[i].predict(X_test)    
    test_predictions[colname_preds] = preds

In [18]:
test_predictions.head()

Unnamed: 0,preds_GBM_0,preds_XGB_1
0,1758.694361,1710.765381
1,1429.756509,1422.193237
2,785.304116,754.769775
3,2074.811273,2106.047363
4,5592.207403,5704.916016


In [19]:
if len(final_pipelines) > 0:
    final_df = pd.DataFrame()
    final_df['Item_Identifier'] = test_df['Item_Identifier']
    final_df['Outlet_Identifier'] = test_df['Outlet_Identifier']
    final_df['Item_Outlet_Sales'] = test_predictions['preds_GBM_0'].values

In [20]:
final_df.head()

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
8523,FDW58,OUT049,1758.694361
8524,FDW14,OUT017,1429.756509
8525,NCN55,OUT010,785.304116
8526,FDQ58,OUT017,2074.811273
8527,FDY38,OUT027,5592.207403


In [21]:
if 1==1:
    print(final_df['Item_Outlet_Sales'].describe().T)
    final_df.to_csv('./submit-bigmartsales-1.csv',index=False)

count    5681.000000
mean     2159.018504
std      1318.360338
min        30.367401
25%      1053.628876
50%      2069.704750
75%      3057.675615
max      7459.409902
Name: Item_Outlet_Sales, dtype: float64
