# CS5228 - Team pandas - Gradient Boosting Fine Tuning

A0105650R - Wang Gejing 

A0198889R - Chen Ningshuang 

A0210996X - Zhang Hao 

## Import Libraries

In [4]:
# All Imports
from utils import *
import pandas as pd
import locale
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing

# model training
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# model evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# classifiers
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.linear_model import LogisticRegression # logistic regression
from sklearn.tree import DecisionTreeClassifier # decision Tree
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.externals.joblib import parallel_backend

# ignore warnings
import warnings
warnings.filterwarnings('ignore')
locale.setlocale(locale.LC_ALL,'')
pd.set_option('display.max_columns', None)

## Load training and testing data

In [5]:
# Empty list: presumably no columns are excluded by get_data for this run.
drop_columns = []

# Project helpers build the label encoder(s) and a scaler wrapped around
# scikit-learn's MinMaxScaler.
le = generate_labels()
scaler = generate_scaler(le, preprocessing.MinMaxScaler())

# Train and test data go through get_data with the same encoder, scaler and
# feature options; only `type` and `dropna` differ between the two calls.
# NOTE(review): the meaning of get_dummy / feature_split / values_only is
# defined in utils -- confirm there.
base_dropna = get_data(
    scaler=scaler,
    le=le,
    type='train',
    dropna=True,
    get_dummy=True,
    feature_split=False,
    values_only=True,
    drop_columns=drop_columns,
)
base_test = get_data(
    scaler=scaler,
    le=le,
    type='test',
    dropna=False,
    get_dummy=True,
    feature_split=False,
    values_only=True,
    drop_columns=drop_columns,
)

## Fine tune Gradient Boosting Classifier - Generate 9 Outputs for Voting

In [6]:
# Voting output 1 of 9 (random_state=41).
# Every hyper-parameter is pinned to a single value, so the grid contains one
# candidate; GridSearchCV is used here to report the cross-validated accuracy
# of that configuration before predicting on the test set.
param_grid = dict(
    learning_rate=[0.19],
    loss=['exponential'],
    max_depth=[9],
    max_features=[None],
    n_estimators=[820],
    min_samples_split=[5],
    subsample=[0.9],
    min_samples_leaf=[2],
    random_state=[41],
)
clf = GradientBoostingClassifier()
model = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)

# Run the cross-validation fits on joblib's thread-based backend.
with parallel_backend('threading'):
    model.fit(base_dropna.drop('ChargeOff', axis=1), base_dropna['ChargeOff'])

# Validation
print("Best Accuracy :", model.best_score_)
print("Best Parameter: ", model.best_params_)

# Prediction
test_pred = model.predict(base_test)
pd.DataFrame({'ChargeOff': test_pred}).to_csv('y_pred_1.csv', index_label="Id")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=41, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=41, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=41, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=41, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=41, subsample=0.9 


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=41, subsample=0.9, total= 4.6min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=41, subsample=0.9, total= 4.6min


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  4.6min remaining:  6.9min


[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=41, subsample=0.9, total= 4.8min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=41, subsample=0.9, total= 4.8min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=41, subsample=0.9, total= 4.8min


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.8min finished


Best Accuracy : 0.9343078459717049
Best Parameter:  {'learning_rate': 0.19, 'loss': 'exponential', 'max_depth': 9, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 820, 'random_state': 41, 'subsample': 0.9}


In [7]:
# Voting output 2 of 9 (random_state=57).
# Every hyper-parameter is pinned to a single value, so the grid contains one
# candidate; GridSearchCV is used here to report the cross-validated accuracy
# of that configuration before predicting on the test set.
param_grid = dict(
    learning_rate=[0.19],
    loss=['exponential'],
    max_depth=[9],
    max_features=[None],
    n_estimators=[820],
    min_samples_split=[5],
    subsample=[0.9],
    min_samples_leaf=[2],
    random_state=[57],
)
clf = GradientBoostingClassifier()
model = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)

# Run the cross-validation fits on joblib's thread-based backend.
with parallel_backend('threading'):
    model.fit(base_dropna.drop('ChargeOff', axis=1), base_dropna['ChargeOff'])

# Validation
print("Best Accuracy :", model.best_score_)
print("Best Parameter: ", model.best_params_)

# Prediction
test_pred = model.predict(base_test)
pd.DataFrame({'ChargeOff': test_pred}).to_csv('y_pred_2.csv', index_label="Id")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=57, subsample=0.9 [CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=57, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=57, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=57, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=57, subsample=0.9 



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=57, subsample=0.9, total= 4.7min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=57, subsample=0.9, total= 4.7min


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  4.7min remaining:  7.0min


[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=57, subsample=0.9, total= 4.7min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=57, subsample=0.9, total= 4.7min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=57, subsample=0.9, total= 4.7min


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.7min finished


Best Accuracy : 0.9344885265343514
Best Parameter:  {'learning_rate': 0.19, 'loss': 'exponential', 'max_depth': 9, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 820, 'random_state': 57, 'subsample': 0.9}


In [8]:
# Voting output 3 of 9 (random_state=731).
# Every hyper-parameter is pinned to a single value, so the grid contains one
# candidate; GridSearchCV is used here to report the cross-validated accuracy
# of that configuration before predicting on the test set.
param_grid = dict(
    learning_rate=[0.19],
    loss=['exponential'],
    max_depth=[9],
    max_features=[None],
    n_estimators=[820],
    min_samples_split=[5],
    subsample=[0.9],
    min_samples_leaf=[2],
    random_state=[731],
)
clf = GradientBoostingClassifier()
model = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)

# Run the cross-validation fits on joblib's thread-based backend.
with parallel_backend('threading'):
    model.fit(base_dropna.drop('ChargeOff', axis=1), base_dropna['ChargeOff'])

# Validation
print("Best Accuracy :", model.best_score_)
print("Best Parameter: ", model.best_params_)

# Prediction
test_pred = model.predict(base_test)
pd.DataFrame({'ChargeOff': test_pred}).to_csv('y_pred_3.csv', index_label="Id")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=731, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=731, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=731, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=731, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=731, subsample=0.9 


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=731, subsample=0.9, total= 4.5min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=731, subsample=0.9, total= 4.5min


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  4.5min remaining:  6.8min


[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=731, subsample=0.9, total= 4.5min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=731, subsample=0.9, total= 4.6min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=731, subsample=0.9, total= 4.6min


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.6min finished


Best Accuracy : 0.9338862277598556
Best Parameter:  {'learning_rate': 0.19, 'loss': 'exponential', 'max_depth': 9, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 820, 'random_state': 731, 'subsample': 0.9}


In [9]:
# Voting output 4 of 9 (random_state=1234).
# Every hyper-parameter is pinned to a single value, so the grid contains one
# candidate; GridSearchCV is used here to report the cross-validated accuracy
# of that configuration before predicting on the test set.
param_grid = dict(
    learning_rate=[0.19],
    loss=['exponential'],
    max_depth=[9],
    max_features=[None],
    n_estimators=[820],
    min_samples_split=[5],
    subsample=[0.9],
    min_samples_leaf=[2],
    random_state=[1234],
)
clf = GradientBoostingClassifier()
model = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)

# Run the cross-validation fits on joblib's thread-based backend.
with parallel_backend('threading'):
    model.fit(base_dropna.drop('ChargeOff', axis=1), base_dropna['ChargeOff'])

# Validation
print("Best Accuracy :", model.best_score_)
print("Best Parameter: ", model.best_params_)

# Prediction
test_pred = model.predict(base_test)
pd.DataFrame({'ChargeOff': test_pred}).to_csv('y_pred_4.csv', index_label="Id")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1234, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1234, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1234, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1234, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1234, subsample=0.9 


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1234, subsample=0.9, total= 4.6min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1234, subsample=0.9, total= 4.6min


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  4.6min remaining:  6.9min


[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1234, subsample=0.9, total= 4.6min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1234, subsample=0.9, total= 4.6min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1234, subsample=0.9, total= 4.6min


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.6min finished


Best Accuracy : 0.9340869725194672
Best Parameter:  {'learning_rate': 0.19, 'loss': 'exponential', 'max_depth': 9, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 820, 'random_state': 1234, 'subsample': 0.9}


In [10]:
# Voting output 5 of 9 (random_state=2333).
# Every hyper-parameter is pinned to a single value, so the grid contains one
# candidate; GridSearchCV is used here to report the cross-validated accuracy
# of that configuration before predicting on the test set.
param_grid = dict(
    learning_rate=[0.19],
    loss=['exponential'],
    max_depth=[9],
    max_features=[None],
    n_estimators=[820],
    min_samples_split=[5],
    subsample=[0.9],
    min_samples_leaf=[2],
    random_state=[2333],
)
clf = GradientBoostingClassifier()
model = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)

# Run the cross-validation fits on joblib's thread-based backend.
with parallel_backend('threading'):
    model.fit(base_dropna.drop('ChargeOff', axis=1), base_dropna['ChargeOff'])

# Validation
print("Best Accuracy :", model.best_score_)
print("Best Parameter: ", model.best_params_)

# Prediction
test_pred = model.predict(base_test)
pd.DataFrame({'ChargeOff': test_pred}).to_csv('y_pred_5.csv', index_label="Id")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=2333, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=2333, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=2333, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=2333, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=2333, subsample=0.9 


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=2333, subsample=0.9, total= 4.6min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=2333, subsample=0.9, total= 4.6min


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  4.6min remaining:  6.9min


[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=2333, subsample=0.9, total= 4.6min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=2333, subsample=0.9, total= 4.6min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=2333, subsample=0.9, total= 4.6min


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.6min finished


Best Accuracy : 0.9339464284127088
Best Parameter:  {'learning_rate': 0.19, 'loss': 'exponential', 'max_depth': 9, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 820, 'random_state': 2333, 'subsample': 0.9}


In [11]:
# Voting output 6 of 9 (random_state=63521).
# Every hyper-parameter is pinned to a single value, so the grid contains one
# candidate; GridSearchCV is used here to report the cross-validated accuracy
# of that configuration before predicting on the test set.
param_grid = dict(
    learning_rate=[0.19],
    loss=['exponential'],
    max_depth=[9],
    max_features=[None],
    n_estimators=[820],
    min_samples_split=[5],
    subsample=[0.9],
    min_samples_leaf=[2],
    random_state=[63521],
)
clf = GradientBoostingClassifier()
model = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)

# Run the cross-validation fits on joblib's thread-based backend.
with parallel_backend('threading'):
    model.fit(base_dropna.drop('ChargeOff', axis=1), base_dropna['ChargeOff'])

# Validation
print("Best Accuracy :", model.best_score_)
print("Best Parameter: ", model.best_params_)

# Prediction
test_pred = model.predict(base_test)
pd.DataFrame({'ChargeOff': test_pred}).to_csv('y_pred_6.csv', index_label="Id")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=63521, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=63521, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=63521, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=63521, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=63521, subsample=0.9 


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=63521, subsample=0.9, total= 4.5min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=63521, subsample=0.9, total= 4.6min


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  4.6min remaining:  6.9min


[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=63521, subsample=0.9, total= 4.7min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=63521, subsample=0.9, total= 4.7min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=63521, subsample=0.9, total= 4.7min


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.7min finished


Best Accuracy : 0.9332236900382078
Best Parameter:  {'learning_rate': 0.19, 'loss': 'exponential', 'max_depth': 9, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 820, 'random_state': 63521, 'subsample': 0.9}


In [12]:
# Voting output 7 of 9 (random_state=1111).
# Every hyper-parameter is pinned to a single value, so the grid contains one
# candidate; GridSearchCV is used here to report the cross-validated accuracy
# of that configuration before predicting on the test set.
param_grid = dict(
    learning_rate=[0.19],
    loss=['exponential'],
    max_depth=[9],
    max_features=[None],
    n_estimators=[820],
    min_samples_split=[5],
    subsample=[0.9],
    min_samples_leaf=[2],
    random_state=[1111],
)
clf = GradientBoostingClassifier()
model = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)

# Run the cross-validation fits on joblib's thread-based backend.
with parallel_backend('threading'):
    model.fit(base_dropna.drop('ChargeOff', axis=1), base_dropna['ChargeOff'])

# Validation
print("Best Accuracy :", model.best_score_)
print("Best Parameter: ", model.best_params_)

# Prediction
test_pred = model.predict(base_test)
pd.DataFrame({'ChargeOff': test_pred}).to_csv('y_pred_7.csv', index_label="Id")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1111, subsample=0.9 [CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1111, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1111, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1111, subsample=0.9 

[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1111, subsample=0.9 


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1111, subsample=0.9, total= 4.5min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1111, subsample=0.9, total= 4.6min


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  4.6min remaining:  6.9min


[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1111, subsample=0.9, total= 4.6min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1111, subsample=0.9, total= 4.6min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=1111, subsample=0.9, total= 4.7min


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.7min finished


Best Accuracy : 0.9334043645543861
Best Parameter:  {'learning_rate': 0.19, 'loss': 'exponential', 'max_depth': 9, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 820, 'random_state': 1111, 'subsample': 0.9}


In [13]:
# Voting output 8 of 9 (random_state=4444).
# Every hyper-parameter is pinned to a single value, so the grid contains one
# candidate; GridSearchCV is used here to report the cross-validated accuracy
# of that configuration before predicting on the test set.
param_grid = dict(
    learning_rate=[0.19],
    loss=['exponential'],
    max_depth=[9],
    max_features=[None],
    n_estimators=[820],
    min_samples_split=[5],
    subsample=[0.9],
    min_samples_leaf=[2],
    random_state=[4444],
)
clf = GradientBoostingClassifier()
model = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)

# Run the cross-validation fits on joblib's thread-based backend.
with parallel_backend('threading'):
    model.fit(base_dropna.drop('ChargeOff', axis=1), base_dropna['ChargeOff'])

# Validation
print("Best Accuracy :", model.best_score_)
print("Best Parameter: ", model.best_params_)

# Prediction
test_pred = model.predict(base_test)
pd.DataFrame({'ChargeOff': test_pred}).to_csv('y_pred_8.csv', index_label="Id")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=4444, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=4444, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=4444, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=4444, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=4444, subsample=0.9 


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=4444, subsample=0.9, total= 4.6min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=4444, subsample=0.9, total= 4.7min


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  4.7min remaining:  7.0min


[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=4444, subsample=0.9, total= 4.7min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=4444, subsample=0.9, total= 4.7min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=4444, subsample=0.9, total= 4.7min


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.7min finished


Best Accuracy : 0.9337456816376075
Best Parameter:  {'learning_rate': 0.19, 'loss': 'exponential', 'max_depth': 9, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 820, 'random_state': 4444, 'subsample': 0.9}


In [14]:
# Voting output 9 of 9 (random_state=7777).
# Every hyper-parameter is pinned to a single value, so the grid contains one
# candidate; GridSearchCV is used here to report the cross-validated accuracy
# of that configuration before predicting on the test set.
param_grid = dict(
    learning_rate=[0.19],
    loss=['exponential'],
    max_depth=[9],
    max_features=[None],
    n_estimators=[820],
    min_samples_split=[5],
    subsample=[0.9],
    min_samples_leaf=[2],
    random_state=[7777],
)
clf = GradientBoostingClassifier()
model = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)

# Run the cross-validation fits on joblib's thread-based backend.
with parallel_backend('threading'):
    model.fit(base_dropna.drop('ChargeOff', axis=1), base_dropna['ChargeOff'])

# Validation
print("Best Accuracy :", model.best_score_)
print("Best Parameter: ", model.best_params_)

# Prediction
test_pred = model.predict(base_test)
pd.DataFrame({'ChargeOff': test_pred}).to_csv('y_pred_9.csv', index_label="Id")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=7777, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=7777, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=7777, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=7777, subsample=0.9 
[CV] learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=7777, subsample=0.9 


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=7777, subsample=0.9, total= 4.5min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=7777, subsample=0.9, total= 4.5min


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  4.5min remaining:  6.8min


[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=7777, subsample=0.9, total= 4.6min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=7777, subsample=0.9, total= 4.6min
[CV]  learning_rate=0.19, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=820, random_state=7777, subsample=0.9, total= 4.6min


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.6min finished


Best Accuracy : 0.9336854164890938
Best Parameter:  {'learning_rate': 0.19, 'loss': 'exponential', 'max_depth': 9, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 820, 'random_state': 7777, 'subsample': 0.9}
