In [108]:

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, RandomizedSearchCV 
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score
import xgboost as xgb
from scipy.stats import uniform, randint

RNG = 42

In [40]:

# Make some synthetic imbalanced binary classification data
X, y = make_classification(n_classes=2, class_sep=2, 
    weights=[0.95, 0.05], 
    n_informative=2, n_redundant=0, n_features=50,
    flip_y=0.05,
    n_clusters_per_class=1, 
    n_samples=15000, random_state=RNG)


In [41]:
X.shape

(15000, 50)

In [42]:
np.unique(y, return_counts=True)

(array([0, 1]), array([13918,  1082]))

In [43]:
metric_names = ['f1', 'roc_auc', 'average_precision', 'accuracy', 'precision', 'recall']
cv = KFold(n_splits=3)
scv = StratifiedKFold(n_splits=3)

In [44]:
# metric_names = ['f1', 'roc_auc', 'average_precision', 'accuracy', 'precision', 'recall']
scores_df = pd.DataFrame(index=metric_names, columns=['Random-CV', 'Stratified-CV']) # to store the scores
clf = GaussianNB()
for metric in metric_names:
    score1 = cross_val_score(clf, X, y, scoring=metric, cv=cv).mean()
    score2 = cross_val_score(clf, X, y, scoring=metric, cv=scv).mean()
    scores_df.loc[metric] = [score1, score2]

In [45]:
scores_df

Unnamed: 0,Random-CV,Stratified-CV
f1,0.785002,0.785101
roc_auc,0.832357,0.83265
average_precision,0.705496,0.705934
accuracy,0.973467,0.973467
precision,0.944575,0.944384
recall,0.671682,0.671909


# Logistic Regression (without assigning higher weights to classes)

In [46]:
scores_df1 = pd.DataFrame(index=metric_names, columns=['Random-CV', 'Stratified-CV']) # to store the scores
class_weights_1 = compute_class_weight(None, np.unique(y), y)

clf1 = LogisticRegression(solver='lbfgs')

for metric in metric_names:
    score1 = cross_val_score(clf1, X, y, scoring=metric, cv=cv).mean()
    score2 = cross_val_score(clf1, X, y, scoring=metric, cv=scv).mean()
    scores_df1.loc[metric] = [score1, score2]

In [47]:
scores_df1

Unnamed: 0,Random-CV,Stratified-CV
f1,0.774393,0.775351
roc_auc,0.835768,0.835858
average_precision,0.703357,0.703825
accuracy,0.972933,0.973
precision,0.969318,0.969476
recall,0.644795,0.646032


In [48]:
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight('balanced', np.unique(y), y)
print(class_weights)

[0.53887053 6.93160813]


# Logistic Regression (assigning higher weights to minority class)

In [49]:
scores_df2 = pd.DataFrame(index=metric_names, columns=['Random-CV', 'Stratified-CV']) # to store the scores
clf2 = LogisticRegression(solver='lbfgs', class_weight={0: class_weights[0], 1: class_weights[1]})

for metric in metric_names:
    score1 = cross_val_score(clf2, X, y, scoring=metric, cv=cv).mean()
    score2 = cross_val_score(clf2, X, y, scoring=metric, cv=scv).mean()
    scores_df2.loc[metric] = [score1, score2]

In [50]:
scores_df2

Unnamed: 0,Random-CV,Stratified-CV
f1,0.572225,0.57531
roc_auc,0.838024,0.837889
average_precision,0.704692,0.705569
accuracy,0.926,0.926733
precision,0.491687,0.495588
recall,0.685333,0.686704


In [51]:
# Make some synthetic imbalanced binary classification data
# X1, y1 = make_classification(n_classes=2, class_sep=2, 
#     weights=[0.05, 0.95], 
#     n_informative=2, n_redundant=5, 
#     flip_y=0.05, n_features=50, 
#     n_clusters_per_class=1, 
#     n_samples=1000, random_state=RNG)

# print(np.unique(y1, return_counts=True))

# class_weights1 = compute_class_weight(None, np.unique(y1), y1)
# print(class_weights1)

# Random Forest (without balancing)

In [52]:
scores_df3 = pd.DataFrame(index=metric_names, columns=['Random-CV', 'Stratified-CV']) # to store the scores
clf3 = RandomForestClassifier()

for metric in metric_names:
    score1 = cross_val_score(clf3, X, y, scoring=metric, cv=cv).mean()
    score2 = cross_val_score(clf3, X, y, scoring=metric, cv=scv).mean()
    scores_df3.loc[metric] = [score1, score2]

In [53]:
scores_df3

Unnamed: 0,Random-CV,Stratified-CV
f1,0.790297,0.789489
roc_auc,0.839251,0.837363
average_precision,0.695396,0.699981
accuracy,0.9744,0.974533
precision,0.969071,0.967966
recall,0.667896,0.66729


# Random Forest (with balancing)

In [54]:
scores_df4 = pd.DataFrame(index=metric_names, columns=['Random-CV', 'Stratified-CV']) # to store the scores
clf4 = RandomForestClassifier(class_weight='balanced', oob_score = True)

for metric in metric_names:
    score1 = cross_val_score(clf4, X, y, scoring=metric, cv=cv).mean()
    score2 = cross_val_score(clf4, X, y, scoring=metric, cv=scv).mean()
    scores_df4.loc[metric] = [score1, score2]

In [55]:
scores_df4

Unnamed: 0,Random-CV,Stratified-CV
f1,0.791165,0.791006
roc_auc,0.833493,0.834847
average_precision,0.70161,0.695482
accuracy,0.974467,0.974533
precision,0.967701,0.967833
recall,0.667896,0.668213


# Gradient Boosting Classifier  (without balancing)

In [61]:
scores_df5 = pd.DataFrame(index=metric_names, columns=['Random-CV', 'Stratified-CV']) # to store the scores
clf5 = GradientBoostingClassifier()

for metric in metric_names:
    score1 = cross_val_score(clf5, X, y, scoring=metric, cv=cv).mean()
    score2 = cross_val_score(clf5, X, y, scoring=metric, cv=scv).mean()
    scores_df5.loc[metric] = [score1, score2]

In [62]:
scores_df5

Unnamed: 0,Random-CV,Stratified-CV
f1,0.765803,0.765321
roc_auc,0.836325,0.830744
average_precision,0.701678,0.702769
accuracy,0.970533,0.970667
precision,0.884769,0.898505
recall,0.66981,0.669139


# Gradient Boosting Classifier  (balancing)

In [78]:
scores_df6 = pd.DataFrame(index=metric_names, columns=['Random-CV', 'Stratified-CV']) # to store the scores
clf6 = GradientBoostingClassifier()
# we cannot assign class weight to gradient boosting classifier, but we 
# can assign weight to each observation (pass weights to fit method)

sample_weights = np.zeros(len(y))
# print(sample_weights[30:40])
# print(y)
sample_weights[y == 0] = class_weights[0]
sample_weights[y == 1] = class_weights[1]

for metric in metric_names:
    score1 = cross_val_score(clf6, X, y, scoring=metric, cv=cv, 
                                     fit_params  = {'sample_weight': sample_weights}).mean()
    score2 = cross_val_score(clf6, X, y, scoring=metric, cv=scv, 
                                     fit_params = {'sample_weight': sample_weights}).mean()
    scores_df6.loc[metric] = [score1, score2]

In [80]:
scores_df6

Unnamed: 0,Random-CV,Stratified-CV
f1,0.782308,0.779516
roc_auc,0.828563,0.826315
average_precision,0.700718,0.698258
accuracy,0.973067,0.972733
precision,0.944219,0.933098
recall,0.667996,0.668213


# XGBoost

XGBoost stands for eXtreme Gradient Boosting.

Note **we will use `xgboost` library to implement this.**

## Why Use XGBoost?
The two reasons to use XGBoost are also the two goals of the project:

1. Execution Speed.
2. Model Performance.

For more library details: https://xgboost.readthedocs.io/en/latest/python/python_intro.html#setting-parameters

https://www.kaggle.com/stuarthallows/using-xgboost-with-scikit-learn

# In order for XGBoost to be able to use our data, we’ll need to transform it into a specific format that XGBoost can handle. That format is called DMatrix

In [113]:
# train, test = scv.split(X, y)

def train_predict_xgboost(param):
    
    f1_scores = []
    accuracy_scores = []
    roc_auc_scores = []
    precision_scores = []
    recall_scores = []

    steps = 20  # The number of training iterations

    for train_index, test_index in scv.split(X, y): 
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        xgb_model = xgb.XGBClassifier(objective="binary:logistic",
                                        param=param, random_state=42)
        xgb_model.fit(X_train, y_train)

        y_pred = xgb_model.predict(X_test)

        f1_scores.append(f1_score(y_test, y_pred))
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        roc_auc_scores.append(roc_auc_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))

    print("f1 ", np.mean(f1_scores))
    print("roc auc ", np.mean(roc_auc_scores))
    print("accuracy ", np.mean(accuracy_scores))
    print("precision ", np.mean(precision_scores))
    print("recall ", np.mean(recall_scores))

param = {
'eta': 0.3, 
'max_depth': 10,  
'objective': "binary:logistic",  
'num_class': 2} 

train_predict_xgboost(param)

f1  0.7892873523678477
roc auc  0.8331366812401578
accuracy  0.9742666666666667
precision  0.9639972978723241
recall  0.6682132963988922


# XGboost hyper-parameter searching

In [110]:
def report_best_scores(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

params = {
    "colsample_bytree": uniform(0.7, 0.3),
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3), # default 0.1 
    "max_depth": randint(2, 6), # default 3
    "n_estimators": randint(100, 150), # default 100
    "subsample": uniform(0.6, 0.4)
}
xgb_model = xgb.XGBClassifier(objective="binary:logistic", eval_metric="auc")

search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=42, n_iter=200, cv=3, verbose=1, n_jobs=1, return_train_score=True)

search.fit(X, y)
report_best_scores(search.cv_results_, 1)


Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed: 16.8min finished


Model with rank: 1
Mean validation score: 0.975 (std: 0.001)
Parameters: {'colsample_bytree': 0.8835558684167137, 'gamma': 0.06974693032602092, 'learning_rate': 0.11764339456056544, 'max_depth': 5, 'n_estimators': 114, 'subsample': 0.7824279936868144}

Model with rank: 1
Mean validation score: 0.975 (std: 0.001)
Parameters: {'colsample_bytree': 0.9040922615763338, 'gamma': 0.2252496259847715, 'learning_rate': 0.033979488347959955, 'max_depth': 2, 'n_estimators': 113, 'subsample': 0.9233589392465844}

Model with rank: 1
Mean validation score: 0.975 (std: 0.001)
Parameters: {'colsample_bytree': 0.9486212527455787, 'gamma': 0.17837666334679464, 'learning_rate': 0.11428035290621423, 'max_depth': 5, 'n_estimators': 144, 'subsample': 0.6563696899899051}

Model with rank: 1
Mean validation score: 0.975 (std: 0.001)
Parameters: {'colsample_bytree': 0.722213395520227, 'gamma': 0.1792328642721363, 'learning_rate': 0.06476071785753891, 'max_depth': 4, 'n_estimators': 108, 'subsample': 0.849319250

In [112]:
# op
params = {'colsample_bytree': 0.7101837935757396, 
             'gamma': 0.34871013362341996, 
             'learning_rate': 0.11920470221176523, 
                 'max_depth': 5, 
             'n_estimators': 107, 
             'subsample': 0.9884232980661471}

train_predict_xgboost(params)

f1  0.7892873523678477
roc auc  0.8331366812401578
accuracy  0.9742666666666667
precision  0.9639972978723241
recall  0.6682132963988922


In [115]:
xgb.XGBClassifier()

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              objective='binary:logistic', random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=None,
              tree_method=None, validate_parameters=False, verbosity=None)