# CS 5228


In [2]:
# All Imports
from utils import *
import pandas as pd
import locale
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing

# model training
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# model evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# classifiers
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.linear_model import LogisticRegression # logistic regression
from sklearn.tree import DecisionTreeClassifier # decision Tree
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingClassifier
try:
    from sklearn.externals.joblib import parallel_backend
except ImportError:
    # scikit-learn >= 0.23 no longer ships the vendored joblib; use the standalone package.
    from joblib import parallel_backend

# ignore warnings
import warnings
# Silence every warning for the rest of the session. Note this also hides
# deprecation warnings from the libraries imported above.
warnings.filterwarnings('ignore')
# Use the user's default locale (taken from the environment) for all categories.
locale.setlocale(locale.LC_ALL,'')
# Show all columns when pandas displays a DataFrame instead of eliding the middle ones.
pd.set_option('display.max_columns', None)



In [3]:
# Preprocessing objects shared by the train and test splits.
drop_columns = []
le = generate_labels()
scaler = generate_scaler(le, preprocessing.MinMaxScaler())

# Options that are identical for both splits; only `type` and `dropna` differ.
_shared_options = dict(
    scaler=scaler,
    le=le,
    get_dummy=True,
    feature_split=False,
    values_only=True,
    drop_columns=drop_columns,
)

# Training split is loaded with dropna=True, the test split with dropna=False.
base_dropna = get_data(type='train', dropna=True, **_shared_options)
base_test = get_data(type='test', dropna=False, **_shared_options)

In [6]:
# Final GradientBoostingClassifier fit and test-set prediction.
#
# Hyper-parameter values recorded from earlier tuning runs. The trailing number
# appears to be the best cross-validated accuracy seen at that tuning step
# (TODO confirm against the tuning history):
# Best learning_rate: 0.15
# Best loss: exponential 0.9327217565893049
# Best max_depth: 9 0.9330429973826251
# Best max_features: None 0.9330028488338005
# Best n_estimators: 680 0.9333240593947796
# Best min_samples_split: 5 0.933544926800549
# Best subsample: 0.9 0.9340468582339625
# Best min_samples_leaf: 2
# Best random_state: 41 0.9353720120393237
clf = GradientBoostingClassifier()

# Every value in a GridSearchCV grid must be a sequence of candidates, even when
# only one value is tried; a bare scalar (previously `'random_state': 41`) makes
# GridSearchCV raise before any fit is run.
param_grid = {'learning_rate': [0.15],
              'loss': ['exponential'],
              'max_depth': [9],
              'max_features': [None],
              'n_estimators': [680],
              'min_samples_split': [5],
              'subsample': [0.9],
              'min_samples_leaf': [2],
              'random_state': [41]}

# NOTE(review): the number of CV folds is left at the scikit-learn default here.
# If the accuracies quoted above came from a different fold count, pass `cv=`
# explicitly to reproduce them.
model = GridSearchCV(clf, param_grid, scoring='accuracy', verbose=2, n_jobs=-1)

# Run the CV fits on threads rather than the default joblib backend.
with parallel_backend('threading'):
    model.fit(base_dropna.drop(columns='ChargeOff'), base_dropna['ChargeOff'])

# Validation
print("Best Accuracy :", model.best_score_)
print("Best Parameter: ", model.best_params_)

# Prediction: predict on the test split and write a submission file with an
# "Id" index column and a single "ChargeOff" column.
test_pred = model.predict(base_test)
pd.DataFrame(test_pred).to_csv('y_pred.csv', header=['ChargeOff'], index_label="Id")

Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV] learning_rate=0.15, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=680, random_state=13579, subsample=0.9 
[CV] learning_rate=0.15, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=680, random_state=13579, subsample=0.9 
[CV] learning_rate=0.15, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=680, random_state=13579, subsample=0.9 [CV] learning_rate=0.15, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=680, random_state=13579, subsample=0.9 
[CV] learning_rate=0.15, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=680, random_state=13579, subsample=0.9 
[CV] learning_rate=0.15, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


[CV]  learning_rate=0.15, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=680, random_state=13579, subsample=0.9, total= 6.2min
[CV] learning_rate=0.15, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=680, random_state=13579, subsample=0.9 
[CV]  learning_rate=0.15, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=680, random_state=13579, subsample=0.9, total= 6.2min
[CV] learning_rate=0.15, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=680, random_state=13579, subsample=0.9 
[CV]  learning_rate=0.15, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=680, random_state=13579, subsample=0.9, total= 6.2min
[CV] learning_rate=0.15, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_spl

[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 24.8min


[CV]  learning_rate=0.15, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=680, random_state=731, subsample=0.9, total= 6.3min
[CV] learning_rate=0.15, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=680, random_state=2333, subsample=0.9 
[CV]  learning_rate=0.15, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=680, random_state=731, subsample=0.9, total= 6.3min
[CV] learning_rate=0.15, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=680, random_state=2333, subsample=0.9 
[CV]  learning_rate=0.15, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=680, random_state=731, subsample=0.9, total= 6.3min
[CV] learning_rate=0.15, loss=exponential, max_depth=9, max_features=None, min_samples_leaf=2, min_samples_split=5, n_

[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 31.0min finished


Best Accuracy : 0.9353720120393237
Best Parameter:  {'learning_rate': 0.15, 'loss': 'exponential', 'max_depth': 9, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 680, 'random_state': 41, 'subsample': 0.9}
