In [1]:
# All Imports
# The star import stays first so the explicit imports below take precedence
# over any same-named exports from utils.
from utils import *
import locale
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
%matplotlib inline
from sklearn import preprocessing

# model training
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# model evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# classifiers
from sklearn.ensemble import RandomForestClassifier 

# Notebook-wide configuration.
# Silence every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so library deprecation warnings are hidden too.
import warnings
warnings.filterwarnings('ignore')
# An empty locale string selects the user's default locale from the environment.
# NOTE(review): presumably needed by number parsing/formatting in utils — confirm.
locale.setlocale(locale.LC_ALL,'')
# Show all columns when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [2]:
# Process Training Data
# Column sets tried in earlier experiments (kept for reference):
# drop_columns = ['CreateJob','RetainedJob','City','Name','Zip','BankState']
# drop_columns = ['CreateJob','ApprovalFY','ApprovalDate','DisbursementGross','SBA_Appv']

# Current run: no columns are dropped.
drop_columns = list()

# Built once and passed to both get_data calls below, so the train and test
# loads receive the same `le` and `sc` objects.
le = generate_labels()
sc = generate_scaler(le, preprocessing.MinMaxScaler())

# Keyword arguments common to the train and test loads.
shared_load_args = dict(
    scaler=sc,
    le=le,
    get_dummy=True,
    feature_split=False,
    values_only=True,
    drop_columns=drop_columns,
)

# The two loads differ only in `type` and in `dropna` (True for train, False for test).
base_dropna = get_data(type='train', dropna=True, **shared_load_args)
base_test = get_data(type='test', dropna=False, **shared_load_args)

In [3]:
# Separate the target column from the feature columns of the training frame.
target_column = 'ChargeOff'
base_dropna_y = base_dropna[target_column]
base_dropna_x = base_dropna.drop(columns=[target_column])

In [5]:
# Train Model
# Baseline run: the grid is intentionally empty, so GridSearchCV evaluates a single
# candidate (the estimator exactly as constructed below) and reports its 10-fold CV accuracy.
clf = RandomForestClassifier(random_state=1234)

param_grid = {}

model = GridSearchCV(clf, param_grid, scoring='accuracy', cv=10, n_jobs=-1, verbose=10)
model.fit(base_dropna_x, base_dropna_y)

print("Best Accuracy :",model.best_score_)
print("Best Parameters",model.best_params_)
print(model.cv_results_)

# Prediction
test_pred = model.predict(base_test)

# Create the output directory before writing so that a fresh checkout does not
# fail with a missing-directory error after the search has already finished.
pred_path = 'results/rf/y_pred1.csv'
os.makedirs(os.path.dirname(pred_path), exist_ok=True)
pd.DataFrame(test_pred).to_csv(pred_path,header=['ChargeOff'],index_label="Id")

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   10.2s remaining:   23.8s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.2s remaining:   10.2s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   10.3s remaining:    4.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.8s finished


Best Accuracy : 0.9022447589998622
Best Parameters {}
{'mean_fit_time': array([10.02176611]), 'std_fit_time': array([0.09323126]), 'mean_score_time': array([0.10208473]), 'std_score_time': array([0.01522704]), 'params': [{}], 'split0_test_score': array([0.90082313]), 'split1_test_score': array([0.90303152]), 'split2_test_score': array([0.89560329]), 'split3_test_score': array([0.89981931]), 'split4_test_score': array([0.90142542]), 'split5_test_score': array([0.8988155]), 'split6_test_score': array([0.90945593]), 'split7_test_score': array([0.90202771]), 'split8_test_score': array([0.90421687]), 'split9_test_score': array([0.90722892]), 'mean_test_score': array([0.90224476]), 'std_test_score': array([0.00381893]), 'rank_test_score': array([1], dtype=int32)}


In [7]:
# Train Model
# Search over tree depth and forest size: 5 depths x 3 sizes = 15 candidates.
clf = RandomForestClassifier(random_state=1234)

param_grid = {
    'max_depth': [2, 4, 6, 8, 12],
    'n_estimators': [100, 200, 300],
    'min_samples_split': [2],
}

model = GridSearchCV(clf, param_grid, scoring='accuracy', cv=10, n_jobs=-1, verbose=10)
model.fit(base_dropna_x, base_dropna_y)

print("Best Accuracy :",model.best_score_)
print("Best Parameters",model.best_params_)
print(model.cv_results_)

# Prediction
test_pred = model.predict(base_test)

# Create the output directory before writing so that a fresh checkout does not
# fail with a missing-directory error after the search has already finished.
pred_path = 'results/rf/y_pred2.csv'
os.makedirs(os.path.dirname(pred_path), exist_ok=True)
pd.DataFrame(test_pred).to_csv(pred_path,header=['ChargeOff'],index_label="Id")

Fitting 10 folds for each of 15 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:   28.7s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:   37.5s
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 143 out of 150 | elapsed:  1.9min remaining:    5.7s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.0min finished


Best Accuracy : 0.8906603728707239
Best Parameters {'max_depth': 12, 'min_samples_split': 2, 'n_estimators': 200}
{'mean_fit_time': array([ 1.88846743,  3.68350213,  5.49014349,  3.09526651,  6.10848911,
        9.19571376,  4.35539672,  8.6811362 , 12.95212588,  5.56765823,
       11.1856061 , 16.81616201,  7.86299601, 15.59076922, 20.94420762]), 'std_fit_time': array([0.01073888, 0.0260579 , 0.02617536, 0.02500397, 0.01725443,
       0.02255412, 0.03014543, 0.02796782, 0.0649214 , 0.01949433,
       0.04253022, 0.05325811, 0.02594382, 0.04318491, 0.86616855]), 'mean_score_time': array([0.03805962, 0.07497137, 0.10591438, 0.04609506, 0.08486137,
       0.12736497, 0.05089397, 0.10255318, 0.15389981, 0.06054749,
       0.11961012, 0.17846229, 0.08712935, 0.1633512 , 0.17302759]), 'std_score_time': array([0.00197466, 0.00227691, 0.00237838, 0.00378197, 0.00172447,
       0.00266806, 0.00189545, 0.00427772, 0.0062377 , 0.00076395,
       0.00351582, 0.00524032, 0.00602625, 0.00235188, 0.

In [8]:
# Train Model
# Search over depth, split threshold and split criterion with the forest size fixed at 200.
clf = RandomForestClassifier(random_state=1234)

param_grid = {
    'max_depth': [8, 12],
    'n_estimators': [200],
    'min_samples_split': [2, 3, 4, 5, 6],
    # criterion is part of this search: with it the grid has 2 * 2 * 5 = 20 candidates,
    # which is what the recorded run reports (and its best params include 'criterion').
    'criterion': ['gini', 'entropy'],
}

model = GridSearchCV(clf, param_grid, scoring='accuracy', cv=10, n_jobs=-1, verbose=10)
model.fit(base_dropna_x, base_dropna_y)

print("Best Accuracy :",model.best_score_)
print("Best Parameters",model.best_params_)
print(model.cv_results_)

# Prediction
test_pred = model.predict(base_test)

# Create the output directory before writing so that a fresh checkout does not
# fail with a missing-directory error after the search has already finished.
pred_path = 'results/rf/y_pred3.csv'
os.makedirs(os.path.dirname(pred_path), exist_ok=True)
pd.DataFrame(test_pred).to_csv(pred_path,header=['ChargeOff'],index_label="Id")

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   22.5s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   34.2s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   45.3s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   45.7s
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 157 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 198 out of 200 | elapsed:  

Best Accuracy : 0.8918449384770563
Best Parameters {'criterion': 'entropy', 'max_depth': 12, 'min_samples_split': 4, 'n_estimators': 200}
{'mean_fit_time': array([11.08282006, 11.10494573, 11.4078202 , 11.42165523, 11.13847854,
       15.55803359, 15.66525619, 15.67773099, 15.97225955, 16.30375421,
       13.35672383, 13.27889488, 13.19331312, 12.9194659 , 13.07569587,
       18.70015049, 18.41342402, 17.97326682, 18.03508646, 15.77436032]), 'std_fit_time': array([0.05409745, 0.04770592, 0.26654368, 0.2109608 , 0.0412605 ,
       0.03137988, 0.07717905, 0.05504208, 0.30901293, 0.11034367,
       0.11163363, 0.05597797, 0.03584221, 0.09197482, 0.20322454,
       0.08824106, 0.24652355, 0.0816763 , 0.1001311 , 1.45764468]), 'mean_score_time': array([0.11580634, 0.11503289, 0.11885331, 0.12261505, 0.11682463,
       0.16396856, 0.17406642, 0.16913674, 0.17202203, 0.17015524,
       0.12085409, 0.13107765, 0.12121983, 0.12553506, 0.12423136,
       0.17067351, 0.16677403, 0.16019061, 0.162

In [12]:
# Train Model
# Search over deeper trees and larger forests, with the split criterion and
# min_samples_split held fixed: 2 depths x 3 sizes = 6 candidates.
clf = RandomForestClassifier(random_state=1234)

param_grid = {
    'max_depth': [12, 20],
    'n_estimators': [200, 300, 600],
    'min_samples_split': [4],
    'criterion': ['entropy'],
}

model = GridSearchCV(clf, param_grid, scoring='accuracy', cv=10, n_jobs=-1, verbose=10)
model.fit(base_dropna_x, base_dropna_y)

print("Best Accuracy :",model.best_score_)
print("Best Parameters",model.best_params_)
print(model.cv_results_)

# Prediction
test_pred = model.predict(base_test)

# Create the output directory before writing so that a fresh checkout does not
# fail with a missing-directory error after the search has already finished.
pred_path = 'results/rf/y_pred4.csv'
os.makedirs(os.path.dirname(pred_path), exist_ok=True)
pd.DataFrame(test_pred).to_csv(pred_path,header=['ChargeOff'],index_label="Id")

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   45.6s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  44 out of  60 | elapsed:  2.2min remaining:   48.4s
[Parallel(n_jobs=-1)]: Done  51 out of  60 | elapsed:  3.0min remaining:   31.4s
[Parallel(n_jobs=-1)]: Done  58 out of  60 | elapsed:  3.2min remaining:    6.7s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  3.2min finished


Best Accuracy : 0.9025057991451855
Best Parameters {'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 4, 'n_estimators': 600}
{'mean_fit_time': array([17.59815488, 26.42091408, 52.93964019, 22.54674053, 33.93709288,
       60.28432391]), 'std_fit_time': array([0.0365944 , 0.06720157, 0.11752457, 0.22125107, 0.09024865,
       1.50856873]), 'mean_score_time': array([0.15938849, 0.23875387, 0.46894214, 0.21366482, 0.31031485,
       0.48390715]), 'std_score_time': array([0.00275553, 0.00468935, 0.00574189, 0.00482147, 0.00420601,
       0.05668539]), 'param_criterion': masked_array(data=['entropy', 'entropy', 'entropy', 'entropy', 'entropy',
                   'entropy'],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_max_depth': masked_array(data=[12, 12, 12, 20, 20, 20],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_min_samples_spl

In [13]:
# Train Model
# Search over forest size only, with depth, split threshold and criterion held fixed
# (3 candidates).
clf = RandomForestClassifier(random_state=1234)

param_grid = {
    'max_depth': [20],
    'n_estimators': [600, 1000, 2000],
    'min_samples_split': [4],
    'criterion': ['entropy'],
}

model = GridSearchCV(clf, param_grid, scoring='accuracy', cv=10, n_jobs=-1, verbose=10)
model.fit(base_dropna_x, base_dropna_y)

print("Best Accuracy :",model.best_score_)
print("Best Parameters",model.best_params_)
print(model.cv_results_)

# Prediction
test_pred = model.predict(base_test)

# Create the output directory before writing so that a fresh checkout does not
# fail with a missing-directory error after the search has already finished.
pred_path = 'results/rf/y_pred5.csv'
os.makedirs(os.path.dirname(pred_path), exist_ok=True)
pd.DataFrame(test_pred).to_csv(pred_path,header=['ChargeOff'],index_label="Id")

Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  11 out of  30 | elapsed:  1.9min remaining:  3.3min
[Parallel(n_jobs=-1)]: Done  15 out of  30 | elapsed:  3.0min remaining:  3.0min
[Parallel(n_jobs=-1)]: Done  19 out of  30 | elapsed:  3.0min remaining:  1.7min
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:  5.3min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:  6.1min remaining:   40.7s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  6.1min finished


Best Accuracy : 0.9027668030080571
Best Parameters {'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 4, 'n_estimators': 2000}
{'mean_fit_time': array([ 66.76712191, 111.86471729, 194.6479116 ]), 'std_fit_time': array([ 0.15399116,  0.20256414, 13.0831299 ]), 'mean_score_time': array([0.63150406, 1.03912449, 1.68725173]), 'std_score_time': array([0.00784185, 0.01751627, 0.10265465]), 'param_criterion': masked_array(data=['entropy', 'entropy', 'entropy'],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'param_max_depth': masked_array(data=[20, 20, 20],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'param_min_samples_split': masked_array(data=[4, 4, 4],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[600, 1000, 2000],
             mask=[False, False, False],
       fill_value='?',
            dtype=object),