# Import Libraries

In [130]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import time
import random
random.seed(42)
np.random.seed(42)

**Note:** To speed up running of model fit cells, the n_jobs parameter is set to a global variable. This can be set to -1 to allow access to the entirety of your machine's CPU resources for operations in this notebook (where applicable). This ***will*** make your machine essentially unusable until the operations have finished performing. Set this to None for regular operations

In [131]:
Notebook_n_jobs = 5

# Helper Functions

In [132]:
def getTotalFits(params, cv=5):
    """
    Helps us quickly get a total number of fits a grid search is gonna run for any given configuration of parameters
    """
    if params:
        num_fits =1
        for k in params.keys():
            num_fits *= len(params[k])
        return cv*num_fits
    else:
        return 0

In [133]:
def compareGridCVResults(GSCVModel_ss,GSCVModel_mm):
    '''
    Tabulates results and compares 2 grid search.
    Ranks by accuracy
    Shows all 4 mean test metrics: Accuracy, Precision Macro, Recall Macro, F1-score Macro
    Shows all parameters used for that model
    '''
    
    
    table_ss = pd.DataFrame(GSCVModel_ss.cv_results_)[['params','mean_test_accuracy','mean_test_f1_macro']]
    table_mm = pd.DataFrame(GSCVModel_mm.cv_results_)[['params','mean_test_accuracy','mean_test_f1_macro']]

    table_ss['params'] = table_ss['params'].astype(str) # transorm dict to str
    table_mm['params'] = table_mm['params'].astype(str) # transorm dict to str

    return pd.merge(left = table_ss,
                     right = table_mm,
                     on = 'params',
                    suffixes=('_ss','_mm'))

# Load Data

**Note:** The file names suggest this is only the training set, and this is true. This dataset is originally part of a competition and the test label set was not provided. To help with validating our findings and making our final assessments, the raw training set will be the only data considered since we have labels associated with it.

We will also, reduce the overall dataset. This will reduce the overall complexity but also, even though the primary reason is to have our models run with a reasonable runtime, I wanted to employ some level of sincerity when it comes to paring down the dataset. Emulating a series of decisions a data scientist would probably make prior to modeling.

In [134]:
X_train = pd.read_csv('data/Training_set.csv')
X_train = X_train[X_train.longitude != 0 ]
X_train.isna().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3622
gps_height                   0
installer                 3636
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            2976
recorded_by                  0
scheme_management         3750
scheme_name              26692
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

Lets reduce the overall number of columns. There are a lot of columns that have redundant information (i.e. certain columns are just binned values of other columns). Aside from these we are going to drop a few others because there are either too many zeroes, unknowns, too many unique values to One Hot Encode etc.

In [135]:
X_train.drop(columns=[
    'id',
    'date_recorded',
    'gps_height',
    'amount_tsh',
    'scheme_name',
    'public_meeting',
    'quality_group',
    'quantity_group',
    'waterpoint_type_group',
    'extraction_type_group',
    'extraction_type_class',  
    'construction_year',  
    'funder',
    'installer',
    'wpt_name',
    'subvillage',
    'region_code',
    'num_private',
    'ward',
    'recorded_by',
    'management_group',
    'source_type',
    'source_class',    
    'payment_type'
],inplace=True)

In [136]:
X_train.isna().sum()

longitude               0
latitude                0
basin                   0
region                  0
district_code           0
lga                     0
population              0
scheme_management    3750
permit               3056
extraction_type         0
management              0
payment                 0
water_quality           0
quantity                0
source                  0
waterpoint_type         0
dtype: int64

In [137]:
y_train = pd.read_csv('data/Training_labels.csv').iloc[X_train.index]['status_group']
y_train

0            functional
1            functional
2            functional
3        non functional
4            functional
              ...      
59395        functional
59396        functional
59397        functional
59398        functional
59399        functional
Name: status_group, Length: 57588, dtype: object

Some column still have unknowns and we will employ a simple imputer to get these resolved.

In [138]:
preprocess_pipe = Pipeline(steps=[
    ('SimpleImpute1', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('SimpleImpute2', SimpleImputer(missing_values='unknown', strategy='most_frequent')),
    ('OHE', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

obj_cols = list(X_train.select_dtypes('object').columns)
obj_cols.append('district_code')

# Recording computation timing

We will be using the timeit library to record how long it takes to run a cell using standard scaling and minmaxscaling pipelines.
Note all values will be in seconds

In [10]:
timing_tables = {}

In [61]:
score_tables = {}

# Comparing just scaling

In [11]:
ct = ColumnTransformer(transformers=[('Preprocess', preprocess_pipe, obj_cols)],
                       remainder='passthrough')
X_train_processed = ct.fit_transform(X_train)

In [12]:
ohe_col_names = ct.named_transformers_['Preprocess']['OHE'].get_feature_names_out().tolist()
ohe_col_names

num_col_names = list(X_train.select_dtypes(np.number).columns)
num_col_names.remove('district_code')

col_names = ohe_col_names+num_col_names

In [13]:
X_train_processed = pd.DataFrame(X_train_processed,columns=col_names)
X_train_processed

Unnamed: 0,x0_Internal,x0_Lake Nyasa,x0_Lake Rukwa,x0_Lake Tanganyika,x0_Lake Victoria,x0_Pangani,x0_Rufiji,x0_Ruvuma / Southern Coast,x0_Wami / Ruvu,x1_Arusha,x1_Dar es Salaam,x1_Dodoma,x1_Iringa,x1_Kagera,x1_Kigoma,x1_Kilimanjaro,x1_Lindi,x1_Manyara,x1_Mara,x1_Mbeya,x1_Morogoro,x1_Mtwara,x1_Mwanza,x1_Pwani,x1_Rukwa,x1_Ruvuma,x1_Shinyanga,x1_Singida,x1_Tabora,x1_Tanga,x2_Arusha Rural,x2_Arusha Urban,x2_Babati,x2_Bagamoyo,x2_Bahi,x2_Bariadi,x2_Biharamulo,x2_Bukoba Rural,x2_Bukoba Urban,x2_Bukombe,x2_Bunda,x2_Chamwino,x2_Chato,x2_Chunya,x2_Dodoma Urban,x2_Hai,x2_Hanang,x2_Handeni,x2_Igunga,x2_Ilala,x2_Ileje,x2_Ilemela,x2_Iramba,x2_Iringa Rural,x2_Kahama,x2_Karagwe,x2_Karatu,x2_Kasulu,x2_Kibaha,x2_Kibondo,x2_Kigoma Rural,x2_Kigoma Urban,x2_Kilindi,x2_Kilolo,x2_Kilombero,x2_Kilosa,x2_Kilwa,x2_Kinondoni,x2_Kisarawe,x2_Kishapu,x2_Kiteto,x2_Kondoa,x2_Kongwa,x2_Korogwe,x2_Kwimba,x2_Kyela,x2_Lindi Rural,x2_Lindi Urban,x2_Liwale,x2_Longido,x2_Ludewa,x2_Lushoto,x2_Mafia,x2_Magu,x2_Makete,x2_Manyoni,x2_Masasi,x2_Maswa,x2_Mbarali,x2_Mbeya Rural,x2_Mbinga,x2_Mbozi,x2_Mbulu,x2_Meatu,x2_Meru,x2_Misenyi,x2_Missungwi,x2_Mkinga,x2_Mkuranga,x2_Monduli,x2_Morogoro Rural,x2_Morogoro Urban,x2_Moshi Rural,x2_Moshi Urban,x2_Mpanda,x2_Mpwapwa,x2_Mtwara Rural,x2_Mtwara Urban,x2_Mufindi,x2_Muheza,x2_Muleba,x2_Musoma Rural,x2_Mvomero,x2_Mwanga,x2_Nachingwea,x2_Namtumbo,x2_Nanyumbu,x2_Newala,x2_Ngara,x2_Ngorongoro,x2_Njombe,x2_Nkasi,x2_Nyamagana,x2_Nzega,x2_Pangani,x2_Rombo,x2_Rorya,x2_Ruangwa,x2_Rufiji,x2_Rungwe,x2_Same,x2_Sengerema,x2_Serengeti,x2_Shinyanga Rural,x2_Shinyanga Urban,x2_Siha,x2_Sikonge,x2_Simanjiro,x2_Singida Rural,x2_Singida Urban,x2_Songea Rural,x2_Songea Urban,x2_Sumbawanga Rural,x2_Sumbawanga Urban,x2_Tabora Urban,x2_Tandahimba,x2_Tanga,x2_Tarime,x2_Temeke,x2_Tunduru,x2_Ukerewe,x2_Ulanga,x2_Urambo,x2_Uyui,x3_Company,x3_None,x3_Other,x3_Parastatal,x3_Private operator,x3_SWC,x3_Trust,x3_VWC,x3_WUA,x3_WUG,x3_Water Board,x3_Water authority,x4_False,x4_True,x5_afridev,x5_cemo,x5_climax,x5_gravity,x5_india mark ii,x5_india mark iii,x5_ksb,x5_mono,x5_nira/tanira,x5_other,x5_other - mkulima/shinyanga,x5_other - play pump,x5_other - rope pump,x5_other - swn 81,x5_submersible,x5_swn 80,x5_walimi,x5_windmill,x6_company,x6_other,x6_other - school,x6_parastatal,x6_private operator,x6_trust,x6_vwc,x6_water authority,x6_water board,x6_wua,x6_wug,x7_never pay,x7_other,x7_pay annually,x7_pay monthly,x7_pay per bucket,x7_pay when scheme fails,x8_coloured,x8_fluoride,x8_fluoride abandoned,x8_milky,x8_salty,x8_salty abandoned,x8_soft,x9_dry,x9_enough,x9_insufficient,x9_seasonal,x10_dam,x10_hand dtw,x10_lake,x10_machine dbh,x10_other,x10_rainwater harvesting,x10_river,x10_shallow well,x10_spring,x11_cattle trough,x11_communal standpipe,x11_communal standpipe multiple,x11_dam,x11_hand pump,x11_improved spring,x11_other,x12_0,x12_1,x12_2,x12_3,x12_4,x12_5,x12_6,x12_7,x12_8,x12_13,x12_23,x12_30,x12_33,x12_43,x12_53,x12_60,x12_62,x12_63,x12_67,x12_80,longitude,latitude,population
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.938093,-9.856322,109.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.698766,-2.147466,280.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.460664,-3.821329,250.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,38.486161,-11.155298,58.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31.130847,-1.825359,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57583,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.169807,-3.253847,125.0
57584,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.249991,-9.070629,56.0
57585,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.017087,-8.750434,0.0
57586,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.861315,-6.378573,0.0


In [14]:
standard_scl_pipe = Pipeline(steps=[
    ('Std_pipe', StandardScaler())
])

minmax_scl_pipe = Pipeline(steps=[
    ('MM_pipe', ColumnTransformer(transformers=[('MM_scl',
                                                MinMaxScaler(
                                                    feature_range=(0, 1)),
                                                ['longitude', 'latitude', 'population'])],
                                  remainder='passthrough'))
])

In [15]:
#Arbitrary number of loops
max_count = 200

start_ss = time.time()
for i in range(max_count):
    X_train_ss = standard_scl_pipe.fit_transform(X_train_processed)
end_ss = time.time()
time_ss = end_ss-start_ss


start_mm = time.time()
for i in range(max_count):
    X_train_mm = minmax_scl_pipe.fit_transform(X_train_processed)
end_mm = time.time()
time_mm = end_mm-start_mm


time_ss,time_mm

timing_tables['Just_Scaling'] = {'ss':time_ss,'mm':time_mm}

In [62]:
score_tables['Just_Scaling']  = {'ss':np.nan,'mm':np.nan}

# Comparing just modeling

In [16]:
log_reg_ss_pipe_justmodeling = Pipeline([
    ('log_reg', LogisticRegression(random_state=42, n_jobs=Notebook_n_jobs))
])

log_reg_mm_pipe_justmodeling = Pipeline([
    ('log_reg', LogisticRegression(random_state=42, n_jobs=Notebook_n_jobs))
])
params = {}
params['log_reg__C'] = [0.01, 0.1, 1, 10]
params['log_reg__max_iter'] = [200,300,400]
params['log_reg__solver'] = ['lbfgs', 'sag']

In [17]:
getTotalFits(params)

120

In [18]:
start_ss = time.time()
log_reg_ss_justmodeling = GridSearchCV(estimator=log_reg_ss_pipe_justmodeling,
                           param_grid=params,
                           cv=5,
                           n_jobs=Notebook_n_jobs,
                           scoring=[ 'accuracy','precision_macro','recall_macro','f1_macro'],
                           refit='accuracy',
                           return_train_score=True)
log_reg_ss_justmodeling.fit(X_train_ss, y_train)
end_ss = time.time()
time_ss = end_ss-start_ss

start_mm = time.time()
log_reg_mm_justmodeling = GridSearchCV(estimator=log_reg_mm_pipe_justmodeling,
                           param_grid=params,
                           cv=5,
                           n_jobs=Notebook_n_jobs,
                           scoring=[ 'accuracy','precision_macro','recall_macro','f1_macro'],
                           refit='accuracy',
                           return_train_score=True)
log_reg_mm_justmodeling.fit(X_train_mm, y_train)
end_mm = time.time()
time_mm = end_mm-start_mm



In [19]:
timing_tables['Prescale_LR'] = {'ss':time_ss,'mm':time_mm}

In [20]:
log_reg_ss_justmodeling.score(X_train_ss,y_train)

0.7489754810029867

In [21]:
log_reg_mm_justmodeling.score(X_train_mm,y_train)

0.7489754810029867

In [22]:
compareGridCVResults(log_reg_ss_justmodeling,log_reg_mm_justmodeling)

Unnamed: 0,params,mean_test_accuracy_ss,mean_test_f1_macro_ss,mean_test_accuracy_mm,mean_test_f1_macro_mm
0,"{'log_reg__C': 0.01, 'log_reg__max_iter': 200,...",0.747187,0.561876,0.740832,0.530782
1,"{'log_reg__C': 0.01, 'log_reg__max_iter': 200,...",0.747204,0.561884,0.740814,0.530767
2,"{'log_reg__C': 0.01, 'log_reg__max_iter': 300,...",0.747187,0.561876,0.740832,0.530782
3,"{'log_reg__C': 0.01, 'log_reg__max_iter': 300,...",0.747204,0.561884,0.740814,0.530767
4,"{'log_reg__C': 0.01, 'log_reg__max_iter': 400,...",0.747187,0.561876,0.740832,0.530782
5,"{'log_reg__C': 0.01, 'log_reg__max_iter': 400,...",0.747204,0.561884,0.740814,0.530767
6,"{'log_reg__C': 0.1, 'log_reg__max_iter': 200, ...",0.746631,0.563835,0.746076,0.555008
7,"{'log_reg__C': 0.1, 'log_reg__max_iter': 200, ...",0.74677,0.563956,0.746006,0.554954
8,"{'log_reg__C': 0.1, 'log_reg__max_iter': 300, ...",0.746631,0.563831,0.746093,0.555026
9,"{'log_reg__C': 0.1, 'log_reg__max_iter': 300, ...",0.746718,0.563787,0.746006,0.554954


In [24]:
timing_tables

{'Just_Scaling': {'ss': 32.596790075302124, 'mm': 15.280414819717407},
 'Prescale_LR': {'ss': 746.2070355415344, 'mm': 579.4661359786987}}

In [63]:
score_tables['Prescale_LR']  = {'ss':log_reg_ss_justmodeling.score(X_train_ss,y_train),
                                'mm':log_reg_mm_justmodeling.score(X_train_mm,y_train)}

# Comparing scaling and modeling

## Logistic Regression

In [118]:
log_reg_ss_pipe = Pipeline([
    ('std_scl', standard_scl_pipe),
    ('log_reg', LogisticRegression(random_state=42, n_jobs=Notebook_n_jobs))
])

log_reg_mm_pipe = Pipeline([
    ('mm_scl', minmax_scl_pipe),
    ('log_reg', LogisticRegression(random_state=42, n_jobs=Notebook_n_jobs))
])
params = {}
params['log_reg__C'] = [0.01, 0.1, 1, 10]
params['log_reg__max_iter'] = [200,300,400]
params['log_reg__solver'] = ['lbfgs', 'sag']

In [119]:
getTotalFits(params)

120

In [120]:
start_ss = time.time()
log_reg_ss = GridSearchCV(estimator=log_reg_ss_pipe,
                           param_grid=params,
                           cv=5,
                           n_jobs=Notebook_n_jobs,
                           scoring=[ 'accuracy','precision_macro','recall_macro','f1_macro'],
                           refit='accuracy',
                           return_train_score=True)
log_reg_ss.fit(X_train_processed, y_train)
end_ss = time.time()
time_ss = end_ss-start_ss


start_mm = time.time()
log_reg_mm = GridSearchCV(estimator=log_reg_mm_pipe,
                           param_grid=params,
                           cv=5,
                           n_jobs=Notebook_n_jobs,
                           scoring=[ 'accuracy','precision_macro','recall_macro','f1_macro'],
                           refit='accuracy',
                           return_train_score=True)
log_reg_mm.fit(X_train_processed, y_train)
end_mm = time.time()
time_mm = end_mm-start_mm

In [121]:
timing_tables['Scale_LR'] = {'ss':time_ss,'mm':time_mm}

In [122]:
log_reg_ss.score(X_train_processed,y_train)

0.7490102104605126

In [123]:
log_reg_mm.score(X_train_processed,y_train)

0.749183857748142

In [124]:
score_tables['Scale_LR'] = {'ss': log_reg_ss.score(X_train_processed, y_train),
                            'mm': log_reg_mm.score(X_train_processed, y_train)}

In [125]:
compareGridCVResults(log_reg_ss,log_reg_mm)

Unnamed: 0,params,mean_test_accuracy_ss,mean_test_f1_macro_ss,mean_test_accuracy_mm,mean_test_f1_macro_mm
0,"{'log_reg__C': 0.01, 'log_reg__max_iter': 200,...",0.747187,0.561877,0.740814,0.530765
1,"{'log_reg__C': 0.01, 'log_reg__max_iter': 200,...",0.747152,0.561849,0.740814,0.530765
2,"{'log_reg__C': 0.01, 'log_reg__max_iter': 300,...",0.747187,0.561877,0.740814,0.530765
3,"{'log_reg__C': 0.01, 'log_reg__max_iter': 300,...",0.74717,0.561863,0.740814,0.530765
4,"{'log_reg__C': 0.01, 'log_reg__max_iter': 400,...",0.747187,0.561877,0.740814,0.530765
5,"{'log_reg__C': 0.01, 'log_reg__max_iter': 400,...",0.74717,0.561863,0.740814,0.530765
6,"{'log_reg__C': 0.1, 'log_reg__max_iter': 200, ...",0.746683,0.563883,0.74618,0.555076
7,"{'log_reg__C': 0.1, 'log_reg__max_iter': 200, ...",0.746788,0.563977,0.746197,0.555095
8,"{'log_reg__C': 0.1, 'log_reg__max_iter': 300, ...",0.746683,0.563883,0.746215,0.555112
9,"{'log_reg__C': 0.1, 'log_reg__max_iter': 300, ...",0.746718,0.563774,0.746197,0.555095


## Stochastic Gradient Descent

In [32]:
sgd_std_pipe = Pipeline([
    ('std_scl', standard_scl_pipe),
    ('sgd', SGDClassifier(n_jobs=Notebook_n_jobs,random_state=42))
])

sgd_mm_pipe = Pipeline([
    ('mm_scl', minmax_scl_pipe),
    ('sgd', SGDClassifier(n_jobs=Notebook_n_jobs,random_state=42))
])

params = {}
params['sgd__alpha'] = [0.00001,0.0001,0.001]
params['sgd__l1_ratio'] = [0.1,0.15,0.2]
params['sgd__max_iter'] = [800,1000,1500]
params['sgd__tol'] = [0.001,0.01,0.1]


In [33]:
getTotalFits(params)

405

In [34]:
start_ss = time.time()
sgd_ss = GridSearchCV(estimator=sgd_std_pipe,
                           param_grid=params,
                           cv=5,
                           n_jobs=Notebook_n_jobs,
                           scoring=[ 'accuracy','precision_macro','recall_macro','f1_macro'],
                           refit='accuracy',
                           return_train_score=True)
sgd_ss.fit(X_train_processed, y_train)
end_ss = time.time()
time_ss = end_ss-start_ss


start_mm = time.time()
sgd_mm = GridSearchCV(estimator=sgd_mm_pipe,
                           param_grid=params,
                           cv=5,
                           n_jobs=Notebook_n_jobs,
                           scoring=[ 'accuracy','precision_macro','recall_macro','f1_macro'],
                           refit='accuracy',
                           return_train_score=True)
sgd_mm.fit(X_train_processed, y_train)
end_mm = time.time()
time_mm = end_mm-start_mm

In [35]:
timing_tables['Scale_SGD'] = {'ss':time_ss,'mm':time_mm}

In [36]:
sgd_ss.score(X_train_processed,y_train)

0.7424637077168854

In [37]:
sgd_mm.score(X_train_processed,y_train)

0.7419254011252344

In [65]:
score_tables['Scale_SGD'] = {'ss': sgd_ss.score(X_train_processed, y_train),
                            'mm': sgd_mm.score(X_train_processed, y_train)}

In [38]:
compareGridCVResults(sgd_ss,sgd_mm)

Unnamed: 0,params,mean_test_accuracy_ss,mean_test_f1_macro_ss,mean_test_accuracy_mm,mean_test_f1_macro_mm
0,"{'sgd__alpha': 1e-05, 'sgd__l1_ratio': 0.1, 's...",0.706866,0.540694,0.738418,0.528576
1,"{'sgd__alpha': 1e-05, 'sgd__l1_ratio': 0.1, 's...",0.690786,0.536745,0.719507,0.522457
2,"{'sgd__alpha': 1e-05, 'sgd__l1_ratio': 0.1, 's...",0.685090,0.529242,0.705494,0.551168
3,"{'sgd__alpha': 1e-05, 'sgd__l1_ratio': 0.1, 's...",0.706866,0.540694,0.738418,0.528576
4,"{'sgd__alpha': 1e-05, 'sgd__l1_ratio': 0.1, 's...",0.690786,0.536745,0.719507,0.522457
...,...,...,...,...,...
76,"{'sgd__alpha': 0.001, 'sgd__l1_ratio': 0.2, 's...",0.733330,0.539180,0.734667,0.512429
77,"{'sgd__alpha': 0.001, 'sgd__l1_ratio': 0.2, 's...",0.731837,0.532476,0.733052,0.495200
78,"{'sgd__alpha': 0.001, 'sgd__l1_ratio': 0.2, 's...",0.738209,0.520337,0.732618,0.499870
79,"{'sgd__alpha': 0.001, 'sgd__l1_ratio': 0.2, 's...",0.733330,0.539180,0.734667,0.512429


## XGBoost

In [39]:
xg_std_pipe = Pipeline([
    ('std_scl', standard_scl_pipe),
    ('xg', xgboost.XGBRFClassifier(n_jobs=Notebook_n_jobs))
])

xg_mm_pipe = Pipeline([
    ('mm_scl', minmax_scl_pipe),
    ('xg', xgboost.XGBRFClassifier(n_jobs=Notebook_n_jobs))
])

params = {}
params['xg__learning_rate'] = [0.1,1,10]
params['xg__n_estimators'] = [5,10,15]
params['xg__max_depth'] = [15,20,25]
params['xg__max_leafs'] = [3000,3500]


In [40]:
getTotalFits(params)

270

In [46]:
start_ss = time.time()
xg_ss = GridSearchCV(estimator=xg_std_pipe,
                           param_grid=params,
                           cv=5,
                           n_jobs=Notebook_n_jobs,
                           scoring=[ 'accuracy','precision_macro','recall_macro','f1_macro'],
                           refit='accuracy',
                           return_train_score=True)
xg_ss.fit(X_train_processed, y_train)
end_ss = time.time()
time_ss = end_ss-start_ss



start_mm = time.time()
xg_mm = GridSearchCV(estimator=xg_mm_pipe,
                           param_grid=params,
                           cv=5,
                           n_jobs=Notebook_n_jobs,
                           scoring=[ 'accuracy','precision_macro','recall_macro','f1_macro'],
                           refit='accuracy',
                           return_train_score=True)
xg_mm.fit(X_train_processed, y_train)
end_mm = time.time()
time_mm = end_mm-start_mm

Parameters: { max_leafs } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { max_leafs } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [47]:
timing_tables['Scale_XGB'] = {'ss':time_ss,'mm':time_mm}

In [48]:
xg_ss.score(X_train_processed,y_train)

0.9170486906994513

In [49]:
xg_mm.score(X_train_processed,y_train)

0.9196881294714176

In [66]:
score_tables['Scale_XGB'] = {'ss': xg_ss.score(X_train_processed, y_train),
                             'mm': xg_mm.score(X_train_processed, y_train)}

In [128]:
timing_df = pd.DataFrame.from_dict(timing_tables).T.rename(columns = {'ss':'Execution Time: StandardScaler',
                                                'mm':'Execution Time: MinMaxScaler'})
timing_dfscores_df = pd.DataFrame.from_dict(score_tables).T.rename(columns = {'ss':'GridSearch Scores: StandardScaler',
                                                'mm':'GridSearch Scores: MinMaxScaler'})

final_df = pd.DataFrame.join(timing_df,scores_df)

In [129]:
final_df.to_csv('data/tableExport.csv')