In [1]:
import warnings

import numpy as np
import pandas as pd
from hyperopt import fmin, tpe, hp
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

from classification_model.config import numerical_features, categorical_features,\
                                        date_features, features, new_numerical_features
from classification_model.evaluation import generate_report, confusion_matrix,\
                                            calculate_metrics, metrics_summary
from classification_model.custom_pipeline import CalculateAntiquity, NumberChannels, ConvertDtypes,\
                                                 ColumnSelector, GetDummies, GetDataFrame

# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides library warnings raised while
# fitting (e.g. convergence or deprecation notices) — confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
# Load the train/test feature tables and their label tables.
# All four CSV files are semicolon-separated and are read relative to the
# notebook's working directory.
train_data = pd.read_csv('data/train_data.csv', sep=';')
# The label files are read as whole DataFrames at this point.
y_train = pd.read_csv('data/train_label.csv', sep=';')
test_data = pd.read_csv('data/test_data.csv', sep=';')
y_test = pd.read_csv('data/test_label.csv', sep=';')

In [3]:
# Reduce each label table to its 'event' column, which is used as the target
# vector by the estimators below.
# The isinstance guard keeps this cell idempotent: once the name is bound to a
# Series, a second `.pop('event')` would look 'event' up as an index label and
# raise KeyError, so the cell could not be re-run without reloading the data.
if isinstance(y_train, pd.DataFrame):
    y_train = y_train['event']
if isinstance(y_test, pd.DataFrame):
    y_test = y_test['event']

In [4]:
# Steps applied to the whole input frame before it is split by feature type.
# The three transformers are project classes from classification_model.custom_pipeline.
general_transformations = Pipeline([
    ('dtypes', ConvertDtypes(
        numerical=numerical_features,
        categorical=categorical_features,
        date=date_features,
    )),
    ('number_channels', NumberChannels(columns=['mobile', 'web', 'social', 'email'])),
    # Only the first configured date feature is passed to CalculateAntiquity.
    ('antiquity', CalculateAntiquity(column=date_features[0])),
])

# Numerical branch: select the columns, standardise them, then GetDataFrame
# re-attaches the column names (StandardScaler returns a bare array).
numerical_transformations = Pipeline([
    ('selector', ColumnSelector(columns=new_numerical_features)),
    ('scaler', StandardScaler()),
    ('df', GetDataFrame(columns=new_numerical_features)),
])

# Categorical branch: select the columns and encode them with GetDummies.
categorical_transformations = Pipeline([
    ('selector', ColumnSelector(columns=categorical_features)),
    ('ohe', GetDummies(columns=categorical_features)),
])

# Full preprocessing: general steps, both branches side by side, and a final
# GetDataFrame labelled with `features`.
# NOTE(review): assumes `features` lists the numerical columns first and the
# dummy columns second, matching the FeatureUnion order — confirm in config.
preprocessor = Pipeline([
    ('transformations', general_transformations),
    ('features', FeatureUnion([
        ('numerical', numerical_transformations),
        ('categorical', categorical_transformations),
    ])),
    ('df', GetDataFrame(columns=features)),
])

In [5]:
def _with_preprocessing(estimator):
    """Return a Pipeline that runs the shared `preprocessor` and then `estimator`.

    Every pipeline built here references the same `preprocessor` object; the
    step names ('transformer', 'estimator') are the ones used throughout the
    notebook.
    """
    return Pipeline([('transformer', preprocessor),
                     ('estimator', estimator)])


# Four candidate classifiers, all seeded with random_state=42.
lr = _with_preprocessing(
    LogisticRegression(class_weight='balanced', random_state=42)
)
rf = _with_preprocessing(
    RandomForestClassifier(max_depth=4, oob_score=True, random_state=42)
)
gb = _with_preprocessing(
    GradientBoostingClassifier(learning_rate=0.08, subsample=0.8,
                               max_depth=4, random_state=42)
)
xgb = _with_preprocessing(
    XGBClassifier(max_depth=4, learning_rate=0.08,
                  subsample=0.8, random_state=42)
)

In [6]:
# Candidate pipelines keyed by the short labels used in the result tables,
# and the metrics collected for each of them.
cv_models = {'lr': lr, 'rf': rf, 'gb': gb, 'xgb': xgb}
cv_scoring = ['roc_auc', 'recall', 'precision', 'f1']

# Per-metric results: {model label: list of 10 fold scores}.
roc = {}
recall = {}
precision = {}
f_score = {}

# One 10-fold run per model evaluates all four metrics on the same fitted
# folds. Calling cross_val_score once per (model, metric) pair would refit
# every model four times on identical splits just to change the scorer.
for model_name, model in cv_models.items():
    fold_scores = cross_validate(estimator=model, X=train_data, y=y_train,
                                 scoring=cv_scoring, cv=10)
    # cross_validate reports each metric under the key 'test_<scorer name>'.
    roc[model_name] = fold_scores['test_roc_auc'].tolist()
    recall[model_name] = fold_scores['test_recall'].tolist()
    precision[model_name] = fold_scores['test_precision'].tolist()
    f_score[model_name] = fold_scores['test_f1'].tolist()

In [7]:
# Mean of the per-fold scores for every model (rows) and metric (columns).
# Rows are labelled from the score dictionaries' own keys, so the table stays
# correctly labelled if a model is added, removed or reordered; a hard-coded
# label list would silently mislabel rows in that case.
cv_scores = {'roc_auc': roc, 'precision': precision,
             'recall': recall, 'f1': f_score}
model_names = list(roc)

mean_df = pd.DataFrame(
    {metric: [np.mean(scores[name]) for name in model_names]
     for metric, scores in cv_scores.items()},
    index=model_names,
)
mean_df

Unnamed: 0,roc_auc,precision,recall,f1
lr,0.679487,0.492965,0.769353,0.600884
rf,0.706334,0.655246,0.161867,0.259412
gb,0.769036,0.63995,0.554073,0.593808
xgb,0.76913,0.640458,0.547776,0.590385


In [8]:
# Standard deviation of the per-fold scores for every model (rows) and metric
# (columns). np.std is used with its default ddof=0 (population std).
# Rows are labelled from the score dictionaries' own keys, so the table stays
# correctly labelled if a model is added, removed or reordered; a hard-coded
# label list would silently mislabel rows in that case.
cv_scores = {'roc_auc': roc, 'precision': precision,
             'recall': recall, 'f1': f_score}
model_names = list(roc)

std_df = pd.DataFrame(
    {metric: [np.std(scores[name]) for name in model_names]
     for metric, scores in cv_scores.items()},
    index=model_names,
)
std_df

Unnamed: 0,roc_auc,precision,recall,f1
lr,0.002813,0.001993,0.008237,0.003113
rf,0.004399,0.015307,0.008486,0.010447
gb,0.004829,0.008244,0.01346,0.00833
xgb,0.004468,0.009064,0.015087,0.010217


## Hyperparameter tuning

In [9]:
# Fit the preprocessing pipeline on the full training set and keep the
# transformed features so the tuning objective does not redo this work on
# every trial.
# NOTE(review): the StandardScaler inside `preprocessor` is therefore fitted on
# all training rows before the tuning objective splits them into CV folds, so
# validation folds share scaling statistics with the training folds — confirm
# this is acceptable for model selection.
data_train = preprocessor.fit_transform(train_data)

In [17]:
# Search space explored by hyperopt. hp.quniform yields floats rounded to a
# multiple of its last argument; hp.uniform yields a continuous value.
space = dict(
    n_estimators=hp.quniform('n_estimators', 50, 200, 50),
    max_depth=hp.quniform('max_depth', 3, 11, 2),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.9),
    subsample=hp.quniform('subsample', 0.25, 1.0, 0.25),
    colsample_bytree=hp.quniform('colsample_bytree', 0.25, 1.0, 0.25),
)


def objective(params):
    """Return the loss hyperopt minimises: 1 - mean 5-fold CV recall.

    `params` is one sample from `space`. The quantised values arrive as
    floats, so the integer-valued hyperparameters are cast to int before the
    XGBClassifier is built. The model is scored on the preprocessed training
    matrix `data_train` against `y_train`.
    """
    integer_keys = ('n_estimators', 'max_depth')
    float_keys = ('learning_rate', 'subsample', 'colsample_bytree')

    model_kwargs = {key: int(params[key]) for key in integer_keys}
    model_kwargs.update({key: params[key] for key in float_keys})

    model = XGBClassifier(**model_kwargs)
    fold_recall = cross_val_score(model, data_train, y_train,
                                  scoring='recall', cv=5, n_jobs=-1)
    return 1 - fold_recall.mean()


# 20 TPE trials with a fixed random state for the search itself.
# NOTE(review): recent hyperopt releases expect a numpy Generator
# (np.random.default_rng) for `rstate` — confirm against the installed version.
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=20,
            rstate=np.random.RandomState(42))
best

100%|██████████| 20/20 [04:31<00:00, 13.55s/trial, best loss: 0.4253192426243946]


{'colsample_bytree': 0.75,
 'learning_rate': 0.1776985096712328,
 'max_depth': 4.0,
 'n_estimators': 200.0,
 'subsample': 0.75}

In [12]:
# Hyperparameters copied from the tuning output above, with the integer-valued
# ones written as ints (hyperopt reports them as floats).
hyperparams = dict(
    colsample_bytree=0.75,
    learning_rate=0.1776985096712328,
    max_depth=4,
    n_estimators=200,
    subsample=0.75,
)

# Rebinds `xgb` to the tuned pipeline and fits it on the raw training frame;
# preprocessing runs inside the pipeline.
xgb = Pipeline([
    ('transformer', preprocessor),
    ('estimator', XGBClassifier(**hyperparams)),
])
xgb.fit(train_data, y_train)

# Evaluate on the held-out test set.
y_pred = xgb.predict(test_data)
metrics = calculate_metrics(y_test, y_pred)

generate_report(y_test, y_pred)

Unnamed: 0,precision,recall,f1-score,support
0,0.74,0.78,0.76,14958.0
1,0.64,0.58,0.61,9734.0
accuracy,0.7,0.7,0.7,0.7
macro avg,0.69,0.68,0.68,24692.0
weighted avg,0.7,0.7,0.7,24692.0


In [13]:
# Report the test-set metrics computed by calculate_metrics, then display the
# confusion matrix for the same predictions.
# `confusion_matrix` is the project helper imported from
# classification_model.evaluation, not sklearn.metrics.confusion_matrix.
# NOTE(review): the rendered matrix looks row-normalised (each row sums to 1) —
# confirm in the helper.
metrics_summary(metrics)
confusion_matrix(y_test, y_pred)



The accuracy is: 0.7
The precision is: 0.64
The recall is: 0.58
The F1 score is: 0.61


Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.78,0.22
1,0.42,0.58
