# Gradient Boosting Machine

In [1]:
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
# Dataframe
path_df = "Pickles/df.pickle"
with open(path_df, 'rb') as data:
    df = pickle.load(data)

# features_train
path_features_train = "Pickles/features_train.pickle"
with open(path_features_train, 'rb') as data:
    features_train = pickle.load(data)

# labels_train
path_labels_train = "Pickles/labels_train.pickle"
with open(path_labels_train, 'rb') as data:
    labels_train = pickle.load(data)

# features_test
path_features_test = "Pickles/features_test.pickle"
with open(path_features_test, 'rb') as data:
    features_test = pickle.load(data)

# labels_test
path_labels_test = "Pickles/labels_test.pickle"
with open(path_labels_test, 'rb') as data:
    labels_test = pickle.load(data)

In [3]:
print(features_train.shape)
print(features_test.shape)

(2041, 300)
(876, 300)


## Cross-Validation for Hyperparameter tuning

In [4]:
gb_0 = GradientBoostingClassifier(random_state = 10)

print('Parameters currently in use:\n')
pprint(gb_0.get_params())

Parameters currently in use:

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 10,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}


### Randomized Search Cross Validation

In [5]:
# n_estimators
n_estimators = [200, 800]

# max_features
max_features = ['auto', 'sqrt']

# max_depth
max_depth = [10, 40]
max_depth.append(None)

# min_samples_split
min_samples_split = [10, 30, 50]

# min_samples_leaf
min_samples_leaf = [1, 2, 4]

# learning rate
learning_rate = [.1, .5]

# subsample
subsample = [.5, 1.]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'learning_rate': learning_rate,
               'subsample': subsample}

pprint(random_grid)

{'learning_rate': [0.1, 0.5],
 'max_depth': [10, 40, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [10, 30, 50],
 'n_estimators': [200, 800],
 'subsample': [0.5, 1.0]}


Then, we'll perform the Random Search:

In [6]:
# First create the base model to tune
gbc = GradientBoostingClassifier(random_state=10)

# Definition of the random search
random_search = RandomizedSearchCV(estimator=gbc,
                                   param_distributions=random_grid,
                                   n_iter=5,
                                   scoring='accuracy',
                                   cv=2, 
                                   verbose=1, 
                                   random_state=10)

# Fit the random search model
random_search.fit(features_train, labels_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits


RandomizedSearchCV(cv=2, estimator=GradientBoostingClassifier(random_state=10),
                   n_iter=5,
                   param_distributions={'learning_rate': [0.1, 0.5],
                                        'max_depth': [10, 40, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [10, 30, 50],
                                        'n_estimators': [200, 800],
                                        'subsample': [0.5, 1.0]},
                   random_state=10, scoring='accuracy', verbose=1)

In [7]:
print(random_search.best_params_)
print("")
print(random_search.best_score_)

{'subsample': 0.5, 'n_estimators': 200, 'min_samples_split': 50, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'learning_rate': 0.1}

0.6001992471817326


### Grid Search Cross Validation

In [8]:
# Create the parameter grid based on the results of random search 
max_depth = [35, 40, 45]
max_features = ['auto']
min_samples_leaf = [1]
min_samples_split = [20, 40]
n_estimators = [800]
learning_rate = [.05, .15]
subsample = [.5]

param_grid = {
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'n_estimators': n_estimators,
    'learning_rate': learning_rate,
    'subsample': subsample

}

# Create a base model
gbc = GradientBoostingClassifier(random_state=8)

# Manually create the splits in CV in order to be able to fix a random_state (GridSearchCV doesn't have that argument)
cv_sets = ShuffleSplit(n_splits = 1, test_size = .33, random_state = 8)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=gbc, 
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=cv_sets,
                           verbose=1)

# Fit the grid search to the data
grid_search.fit(features_train, labels_train)

Fitting 1 folds for each of 12 candidates, totalling 12 fits


GridSearchCV(cv=ShuffleSplit(n_splits=1, random_state=8, test_size=0.33, train_size=None),
             estimator=GradientBoostingClassifier(random_state=8),
             param_grid={'learning_rate': [0.05, 0.15],
                         'max_depth': [35, 40, 45], 'max_features': ['auto'],
                         'min_samples_leaf': [1], 'min_samples_split': [20, 40],
                         'n_estimators': [800], 'subsample': [0.5]},
             scoring='accuracy', verbose=1)

In [9]:
print(grid_search.best_params_)
print("")
print(grid_search.best_score_)

{'learning_rate': 0.05, 'max_depth': 35, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 40, 'n_estimators': 800, 'subsample': 0.5}

0.5890207715133531


In [10]:
best_gbc = grid_search.best_estimator_

In [11]:
best_gbc

GradientBoostingClassifier(learning_rate=0.05, max_depth=35,
                           max_features='auto', min_samples_split=40,
                           n_estimators=800, random_state=8, subsample=0.5)

## Model fit and performance

In [12]:
best_gbc.fit(features_train, labels_train)

GradientBoostingClassifier(learning_rate=0.05, max_depth=35,
                           max_features='auto', min_samples_split=40,
                           n_estimators=800, random_state=8, subsample=0.5)

In [13]:
gbc_pred = best_gbc.predict(features_test)

#### Training accuracy

In [14]:
# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(labels_train, best_gbc.predict(features_train)))

The training accuracy is: 
0.9965703086722195


#### Test accuracy

In [15]:
# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(labels_test, gbc_pred))

The test accuracy is: 
0.615296803652968


#### Classification report

In [16]:
# Classification report
print("Classification report")
print(classification_report(labels_test,gbc_pred))

Classification report
              precision    recall  f1-score   support

           0       0.62      0.54      0.58       291
           1       0.61      0.72      0.66       289
           2       0.62      0.58      0.60       296

    accuracy                           0.62       876
   macro avg       0.62      0.62      0.61       876
weighted avg       0.62      0.62      0.61       876



#### Confusion matrix

In [17]:
base_model = GradientBoostingClassifier(random_state = 8)
base_model.fit(features_train, labels_train)
accuracy_score(labels_test, base_model.predict(features_test))

0.6198630136986302

In [18]:
best_gbc.fit(features_train, labels_train)
accuracy_score(labels_test, best_gbc.predict(features_test))

0.615296803652968

We'll create a dataset with a model summary to compare models:

In [19]:
d = {
     'Model': 'Gradient Boosting',
     'Training Set Accuracy': accuracy_score(labels_train, best_gbc.predict(features_train)),
     'Test Set Accuracy': accuracy_score(labels_test, gbc_pred)
}

df_models_gbc = pd.DataFrame(d, index=[0])

In [20]:
df_models_gbc

Unnamed: 0,Model,Training Set Accuracy,Test Set Accuracy
0,Gradient Boosting,0.99657,0.615297


Let's save the model and this dataset:

In [21]:
with open('Models/best_gbc.pickle', 'wb') as output:
    pickle.dump(best_gbc, output)
    
with open('Models/df_models_gbc.pickle', 'wb') as output:
    pickle.dump(df_models_gbc, output)