### NoteBook Contents :
        > 1. Loading the Standard Scaled training data
        > 2. HyperParameter Tuning using Randomizaed Grid Search 5 Fold Cross Validation
        > 3. Training Best Classifier on the whole Trainig Data
        > 4. Saving the trained Model into the subdirectory

#### Importing Essential Data handling libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

import time
from math import floor
from time import perf_counter
from tqdm import tqdm

pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import model_selection

from sklearn.metrics import f1_score

In [3]:
from sklearn.ensemble import RandomForestClassifier
import joblib

### 1. Loading the Standard Scaled training data

In [4]:
train_df = pd.read_csv("..\\1_Modeling_Data_After_Transformations_TestTrainSplits\\Training_data_std.csv") ## looking for dataset up one directory

In [5]:
train_df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,-0.286,0.811,0.182,0.847,-0.812,-0.468,-0.398,-0.383,-0.339,-0.309,-0.317,-0.691,-0.589,-0.671,-0.61,-0.654,-0.648,0.101,-0.231,-0.063,-0.27,-0.286,-0.146,0
1,1.414,-1.233,-1.08,0.847,1.787,-0.468,-0.398,-0.383,-0.339,-0.309,-0.317,-0.687,-0.661,-0.666,-0.66,-0.644,-0.566,-0.205,-0.217,-0.243,-0.232,0.029,-0.011,0
2,0.719,-1.233,0.182,0.847,-0.379,0.842,-0.398,-0.383,-0.339,-0.309,-0.317,-0.648,-0.693,-0.677,-0.644,-0.647,-0.654,-0.341,-0.247,-0.184,-0.241,-0.311,-0.292,0
3,0.873,0.811,0.182,0.847,-0.596,-0.468,-0.398,-0.383,-0.339,-0.309,-0.317,0.14,0.06,-0.423,-0.674,-0.574,-0.654,0.13,-0.074,-0.291,0.04,-0.311,-0.273,0
4,-0.672,0.811,0.182,-1.07,-1.137,-0.468,-0.398,-0.383,-0.339,-0.309,-0.317,0.342,0.388,-0.014,0.076,0.129,0.155,-0.204,-0.156,-0.114,-0.178,-0.182,-0.179,0


### 2. HyperParameter Tuning using Randomizaed Search Cross Validation

In [6]:
target = "default payment next month"

In [7]:
X = train_df.drop(target, axis = 1).values
y = train_df[target].values

##### Defining HyperParameter grid space and Fit classifier with 5 fold cross validation

In [8]:
# Number of trees in random forest
n_estimators = [250, 300, 400, 500]

# Maximum number of levels in tree
max_depth = [10, 15 ,20, 25, 30]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

param_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [9]:
param_grid

{'n_estimators': [250, 300, 400, 500],
 'max_depth': [10, 15, 20, 25, 30],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [10]:
classifier = RandomForestClassifier(n_jobs = -1) # utilizing all the cores

model = model_selection.RandomizedSearchCV(estimator = classifier,
                                           param_distributions = param_grid,
                                           n_iter = 10,
                                           scoring = "f1", 
                                           verbose = 1,
                                           cv = 3,
                                           n_jobs = -1)

model.fit(X,y)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(n_jobs=-1), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 15, 20, 25, 30],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [250, 300, 400, 500]},
                   scoring='f1', verbose=1)

In [11]:
print(model.best_score_)
print(model.best_estimator_)

0.4641727161334131
RandomForestClassifier(max_depth=20, min_samples_split=10, n_estimators=500,
                       n_jobs=-1)


### 3. Training Best Classifier on the whole Trainig Data

In [12]:
final_model = model.best_estimator_

In [13]:
final_model.fit(X,y)

RandomForestClassifier(max_depth=20, min_samples_split=10, n_estimators=500,
                       n_jobs=-1)

### 4. Saving the trained Model into the subdirectory

In [16]:
# save the model to disk
model_name = 'Random_Forest_Model_standard_scaled_data_v1.sav'
joblib.dump(final_model, "..\\3_Saved_Models"+"\\"+model_name)

['..\\3_Saved_Models\\Random_Forest_Model_standard_scaled_data_v1.sav']