In [None]:
## Tuning GBM 
## https://medium.com/all-things-ai/in-depth-parameter-tuning-for-gradient-boosting-3363992e9bae

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from scipy import stats
import itertools
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingClassifier



In [2]:
file_location='C:\\Users\\BOL1KOR\\Desktop\\Pickle_Files_New'
os.chdir('C:\\Users\\BOL1KOR\\Desktop\\Pickle_Files_New') 
dfull = pickle.load(open('dfull_4001_4021.pkl', 'rb'))

In [3]:
dfull['class2'] = dfull['class']
dfull['class2'] = [0 if x == 'wake' else 1 for x in dfull['class2']]

In [4]:
## convert selected columns to categories 
dfull[['class','class2']] = dfull[['class','class2']].astype('str') 
dfull[['class','class2']] = dfull[['class','class2']].astype('category')

In [59]:
### create a smaller subset of data for testing algo
dw = dfull[(dfull['class'] == 'wake')]
d1 = dfull[(dfull['class'] == 'sleep_stage_1')]
d2 = dfull[(dfull['class'] == 'sleep_stage_2')]

# https://stackoverflow.com/questions/15923826/random-row-selection-in-pandas-dataframe
# Randomly sample n elements from your dataframe
d1_elements = d1
d2_elements = d2.sample(n = d1.shape[0], random_state = 42)
dw_elements = dw.sample(n = 2 * d1.shape[0], random_state = 42)

dn = pd.DataFrame()
dn = pd.concat([dw_elements, d1_elements, d2_elements],ignore_index=True)
dn.shape[0]

11216

In [60]:
## create train-test data : 2 class
X = dn[['delta','theta','alpha','beta','gamma']]  # dfull[['delta','theta','alpha','beta','gamma']]
y = dn[['class2']]  # dfull[['class']] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



## normalize the data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)

X_train_scaled.shape

(8972, 5)

In [None]:
## Tuning using GridSearchCV 

In [13]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score,precision_score

In [14]:

skf = StratifiedKFold(n_splits =10)

scorers = { 
     'precision_score': make_scorer(precision_score), 
     'recall_score': make_scorer(recall_score), 
     'accuracy_score': make_scorer(accuracy_score)
 } 

results = []
refit_param = ['precision_score', 'recall_score']

In [15]:
## learning rate

In [16]:
param_grid_1 = {'learning_rate' : [1,0.5,0.25,0.125,0.1,0.05,0.01,0.005,0.001]}

In [17]:
clf = GradientBoostingClassifier()
for refit_score in refit_param:
    grid_search = GridSearchCV(clf,param_grid_1, scoring=scorers, refit = refit_score,cv = skf, n_jobs = -1, return_train_score = True)
    grid_search.fit(X_train_scaled, y_train)
    y_pred_grid = grid_search.predict(X_test_scaled)
    results.append([refit_score,grid_search.best_params_])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [18]:
results

[['precision_score', {'learning_rate': 0.125}],
 ['recall_score', {'learning_rate': 0.5}]]

In [19]:
## n_estimators tuning

In [20]:
param_grid_2 = {
    'n_estimators':[1, 2, 4, 8, 16, 32, 64, 100, 200]
}


In [21]:
clf = GradientBoostingClassifier(learning_rate = 0.125)
refit_score = 'precision_score'
grid_search = GridSearchCV(clf,param_grid_2, scoring=scorers, refit = refit_score,cv = skf, n_jobs = -1, return_train_score = True)
grid_search.fit(X_train_scaled, y_train)
y_pred_grid = grid_search.predict(X_test_scaled)
results[0].append(grid_search.best_params_)

  y = column_or_1d(y, warn=True)


In [22]:
clf = GradientBoostingClassifier(learning_rate = 0.5)
refit_score = 'recall_score'
grid_search = GridSearchCV(clf,param_grid_2, scoring=scorers, refit = refit_score,cv = skf, n_jobs = -1, return_train_score = True)
grid_search.fit(X_train_scaled, y_train)
y_pred_grid = grid_search.predict(X_test_scaled)
results[1].append(grid_search.best_params_)

  y = column_or_1d(y, warn=True)


In [23]:
results

[['precision_score', {'learning_rate': 0.125}, {'n_estimators': 100}],
 ['recall_score', {'learning_rate': 0.5}, {'n_estimators': 100}]]

In [None]:
## max_depth, min_samples_split, min_samples_leaf, max_features

In [26]:
param_grid_4 = {
    'max_depth' : np.linspace(1,16,16, endpoint = True)
}


In [27]:
clf = GradientBoostingClassifier(learning_rate = 0.125,n_estimators=100)
refit_score = 'precision_score'
grid_search = GridSearchCV(clf,param_grid_4, scoring=scorers, refit = refit_score,cv = skf, n_jobs = -1, return_train_score = True)
grid_search.fit(X_train_scaled, y_train)
y_pred_grid = grid_search.predict(X_test_scaled)
results[0].append(grid_search.best_params_)

  y = column_or_1d(y, warn=True)


In [36]:
clf = GradientBoostingClassifier(learning_rate = 0.5,n_estimators=100)
refit_score = 'recall_score'
grid_search = GridSearchCV(clf,param_grid_4, scoring=scorers, refit = refit_score,cv = skf, n_jobs = -1, return_train_score = True)
grid_search.fit(X_train_scaled, y_train)
y_pred_grid = grid_search.predict(X_test_scaled)
results[1].append(grid_search.best_params_)

  y = column_or_1d(y, warn=True)


In [38]:
print(grid_search.best_params_)

{'max_depth': 11.0}


In [29]:
results

[['precision_score',
  {'learning_rate': 0.125},
  {'n_estimators': 100},
  {'max_depth': 2.0}],
 ['recall_score',
  {'learning_rate': 0.5},
  {'n_estimators': 100},
  {'max_depth': 11.0}]]

In [39]:
param_grid_5 ={
    'min_samples_split':[2,4,6,8,10,20,40,60,100], 
    'min_samples_leaf':[1,3,5,7,9]
}

In [40]:
clf = GradientBoostingClassifier(learning_rate = 0.125,n_estimators=100,max_depth = 2)
refit_score = 'precision_score'
grid_search = GridSearchCV(clf,param_grid_5, scoring=scorers, refit = refit_score,cv = skf, n_jobs = -1, return_train_score = True)
grid_search.fit(X_train_scaled, y_train)
y_pred_grid = grid_search.predict(X_test_scaled)
results[0].append(grid_search.best_params_)

  y = column_or_1d(y, warn=True)


In [41]:
clf = GradientBoostingClassifier(learning_rate = 0.5,n_estimators=100,max_depth = 11)
refit_score = 'recall_score'
grid_search = GridSearchCV(clf,param_grid_5, scoring=scorers, refit = refit_score,cv = skf, n_jobs = -1, return_train_score = True)
grid_search.fit(X_train_scaled, y_train)
y_pred_grid = grid_search.predict(X_test_scaled)
results[1].append(grid_search.best_params_)

  y = column_or_1d(y, warn=True)


In [42]:
results

[['precision_score',
  {'learning_rate': 0.125},
  {'n_estimators': 100},
  {'max_depth': 2.0},
  {'min_samples_leaf': 7, 'min_samples_split': 60}],
 ['recall_score',
  {'learning_rate': 0.5},
  {'n_estimators': 100},
  {'max_depth': 11.0},
  {'min_samples_leaf': 9, 'min_samples_split': 100}]]

In [None]:
## Train GradientBoostingModel with optimized Parameter

In [64]:
model_gb = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, 
                                      max_depth = 11, min_samples_leaf = 9, min_samples_split = 100)
model_gb.fit(X_train_scaled,y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=11,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=9, min_samples_split=100,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [65]:
y_pred = model_gb.predict(X_test_scaled)

In [66]:
class_names = ['0','1']
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, target_names=class_names))

             precision    recall  f1-score   support

          0       0.84      0.83      0.83      1146
          1       0.82      0.83      0.83      1098

avg / total       0.83      0.83      0.83      2244

