In [1]:
%pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install -U threadpoolctl


Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import time

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# For resampling
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN, ADASYN, RandomOverSampler
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.utils import resample

# Ensemble Classifiers
from sklearn.ensemble import VotingClassifier, StackingClassifier

# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, KFold

# Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score, log_loss, classification_report, confusion_matrix

# from xgboost import plot_importance, to_graphviz

# from google.colab import files

In [4]:
# from google.colab import auth, drive
# from google.auth import default

In [5]:
# drive.mount('/content/drive',force_remount=True)

# Functions

In [114]:
def print_classification_metrics(y_true, y_pred):
    # Compute the accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print("Accuracy: {:.3f}".format(accuracy))
    # Compute the precision
    precision = precision_score(y_true, y_pred)
    print("Precision: {:.3f}".format(precision))
    # Compute the recall or sensitivity
    recall = recall_score(y_true, y_pred)
    print("Recall: {:.3f}".format(recall))
    # Compute the F1 score
    f1 = f1_score(y_true, y_pred)
    print("F1 score: {:.3f}".format(f1))
    # Compute the roc auc score
    auc = roc_auc_score(y_true,y_pred)
    print("AUC score: {:.3f}".format(auc))
    # Compute the log lossscore
    loss = log_loss(y_true,y_pred)
    print("Log loss: {:.3f}".format(loss))

    # Assume y_true and y_pred are the true and predicted labels for a binary classification problem
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    # Calculate TPR and TNR
    tpr = tp / (tp + fn) # Sensitivity
    tnr = tn / (tn + fp) # Specificity
    # Calculate G-mean
    gmean = np.sqrt(tpr * tnr)
    print("G-mean: {:.3f}".format(gmean))

    print("Specificity: {:.3f}".format(tnr))

    # Print classification report and G-mean
    print(classification_report(y_true, y_pred))
    
    print("{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}".format(accuracy,precision,recall,f1,auc,loss,tnr))

    print(confusion_matrix(y_true,y_pred))

In [115]:
def train_and_evaluate_model(model, resampling_method, X, y, random_state):
    ### SPLITTING THE DATA ###

    # Split the data into training, validation, and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state = random_state)

    # Separate minority and majority classes
    minority_class = df[df['label'] == 1]
    majority_class = df[df['label'] == 0]
    
    print(len(majority_class), len(minority_class))
    print(len(X_train), len(X_val), len(X_test))
    print(len(y_train), len(y_val), len(y_test))
    # Resample
    start_sampling_time = time.time()
    X_resampled, y_resampled = resampling_method.fit_resample(X_train, y_train)
    stop_sampling_time = time.time()
    sampling_time = stop_sampling_time - start_sampling_time
    
    # Print the number of examples in each set
    print("Number of examples in the training set: ", len(X_train))
    print("Number of examples in the resampled dataset (label -1):", len(y_resampled[y_resampled == 0]))
    print("Number of examples in the resampled dataset (label 1):", len(y_resampled[y_resampled == 1]))
      
    print("Number of examples in the validation set: ", len(X_val))
    print("Number of examples in the test set: ", len(X_test))
  
    # Calculate the desired number of samples for each class based on the proportion in the test dataset
    desired_majority_samples = int(len(X_test) * 0.90)
    desired_minority_samples = int(len(X_test) * 0.10)

    # Resample the majority class in the test dataset
    resampled_majority_class = resample(majority_class,
                                        replace=True,
                                        n_samples=desired_majority_samples, 
                                        random_state = random_state
                                        
                                        )

    # Sample the minority class in the test dataset
    sampled_minority_class = resample(minority_class,
                                      replace=True,
                                      n_samples=desired_minority_samples, 
                                      random_state = random_state
                                      )

    test_imbalanced = pd.concat([resampled_majority_class, sampled_minority_class])
    X_test_imbalanced = test_imbalanced.drop(['label', 'account_id'], axis=1)
    y_test_imbalanced= test_imbalanced['label']

    # Print the number of examples in each class in the imbalanced test dataset
    print("Number of examples in the imbalanced test dataset (label -1):", len(y_test_imbalanced[y_test_imbalanced == 0]))
    print("Number of examples in the imbalanced test dataset (label 1):", len(y_test_imbalanced[y_test_imbalanced == 1]))
      
    ### MODELLING ###
    start_training_time = time.time()
    model.fit(X_resampled, y_resampled)
    stop_training_time = time.time()
    training_time = stop_training_time - start_training_time

    ### RESULTS ###
    ## Val
    start_pred_val = time.time()
    y_pred_val = model.predict(X_val)
    stop_pred_val = time.time()
    pred_val_time = stop_pred_val - start_pred_val

    accuracy_val = accuracy_score(y_val, y_pred_val)
    precision_val = precision_score(y_val, y_pred_val)
    recall_val = recall_score(y_val, y_pred_val)
    f1_val = f1_score(y_val, y_pred_val)
    auc_val = roc_auc_score(y_val,y_pred_val)
    loss_val = log_loss(y_val,y_pred_val)
    # Assume y_val and y_pred are the true and predicted labels for a binary classification problem
    tn_val, fp_val, fn_val, tp_val = confusion_matrix(y_val, y_pred_val).ravel()
    # Calculate TPR and TNR
    tpr_val = tp_val / (tp_val + fn_val) # Sensitivity
    tnr_val = tn_val / (tn_val + fp_val) # Specificity
    # Calculate G-mean
    gmean_val = np.sqrt(tpr_val * tnr_val) 
    ## Test
    start_pred_test = time.time()
    y_pred_test = model.predict(X_test)
    stop_pred_test = time.time()
    pred_test_time = stop_pred_test - start_pred_test

    accuracy_test = accuracy_score(y_test, y_pred_test)
    precision_test = precision_score(y_test, y_pred_test)
    recall_test = recall_score(y_test, y_pred_test)
    f1_test = f1_score(y_test, y_pred_test)
    auc_test = roc_auc_score(y_test,y_pred_test)
    loss_test = log_loss(y_test,y_pred_test)
    # Assume y_test and y_pred are the true and predicted labels for a binary classification problem
    tn_test, fp_test, fn_test, tp_test = confusion_matrix(y_test, y_pred_test).ravel()
    # Calculate TPR and TNR
    tpr_test = tp_test / (tp_test + fn_test) # Sensitivity
    tnr_test = tn_test / (tn_test + fp_test) # Specificity
    # Calculate G-mean
    gmean_test = np.sqrt(tpr_test * tnr_test)
    
    ## test imbalanced
    start_pred_test_imbalanced = time.time()
    y_pred_test_imbalanced = model.predict(X_test_imbalanced)
    stop_pred_test_imbalanced = time.time()
    pred_test_imbalanced_time = stop_pred_test_imbalanced - start_pred_test_imbalanced

    accuracy_test_imbalanced = accuracy_score(y_test_imbalanced, y_pred_test_imbalanced)
    precision_test_imbalanced = precision_score(y_test_imbalanced, y_pred_test_imbalanced)
    recall_test_imbalanced = recall_score(y_test_imbalanced, y_pred_test_imbalanced)
    f1_test_imbalanced = f1_score(y_test_imbalanced, y_pred_test_imbalanced)
    auc_test_imbalanced = roc_auc_score(y_test_imbalanced,y_pred_test_imbalanced)
    loss_test_imbalanced = log_loss(y_test_imbalanced,y_pred_test_imbalanced)
    # Assume y_test_imbalanced and y_pred are the true and predicted labels for a binary classification problem
    tn_test_imbalanced, fp_test_imbalanced, fn_test_imbalanced, tp_test_imbalanced = confusion_matrix(y_test_imbalanced, y_pred_test_imbalanced).ravel()
    # Calculate TPR and TNR
    tpr_test_imbalanced = tp_test_imbalanced / (tp_test_imbalanced + fn_test_imbalanced) # Sensitivity
    tnr_test_imbalanced = tn_test_imbalanced / (tn_test_imbalanced + fp_test_imbalanced) # Specificity
    # Calculate G-mean
    gmean_test_imbalanced = np.sqrt(tpr_test_imbalanced * tnr_test_imbalanced) 
    
    print(confusion_matrix(y_test_imbalanced,y_pred_test_imbalanced))
    val_results = [accuracy_val, precision_val, recall_val, f1_val, auc_val, loss_val, tnr_val]
    test_results = [accuracy_test, precision_test, recall_test, f1_test, auc_test, loss_test, tnr_test]
    test_imbalanced_results = [accuracy_test_imbalanced, precision_test_imbalanced, recall_test_imbalanced, f1_test_imbalanced, auc_test_imbalanced, loss_test_imbalanced, tnr_test_imbalanced]

    time_results = [training_time, pred_val_time, pred_test_time, pred_test_imbalanced_time, sampling_time]
    return val_results, test_results, test_imbalanced_results, time_results
    # return accuracy, precision, recall, f1, auc, loss, tnr   

In [116]:
def run_iterations(model, resampling_method, X, y, iterations=5):
    results = []
    random_state = [12,23,34,45,56]
    for i in range(len(random_state)):
        val_results, test_results, test_imbalanced_results, time_results = train_and_evaluate_model(model, resampling_method, X, y, random_state[i])
        concatted_results = val_results + test_results + test_imbalanced_results + time_results
    
        results.append(concatted_results)

    columns = ['Val Accuracy', 'Val Precision', 'Val Recall', 'Val F1', 'Val AUC', 'Val Loss', 'Val Specificity',
               'Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1', 'Test AUC', 'Test Loss', 'Test Specificity',
               'Test Imbalanced Accuracy', 'Test Imbalanced Precision', 'Test Imbalanced Recall', 'Test Imbalanced F1', 'Test Imbalanced AUC', 'Test Imbalanced Loss', 'Test Imbalanced Specificity',
               'Training Time', 'Pred Val Time', 'Pred Test Time', 'Pred Test Imbalanced Time', 'Sampling Time'
               ]
    df = pd.DataFrame(results, columns=columns)
    print(df)
    stats = df.describe().loc[['mean']]
    # stats = df.describe()
    print(stats)
    return stats.to_csv(sep='\t')

In [117]:
def plot_auc(y_true, y_scores):
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    roc_auc = auc(fpr, tpr)
    
    sns.set_style("darkgrid")
    plt.figure(figsize=(8, 6))
    sns.lineplot(fpr, tpr, label='AUC = %0.2f' % roc_auc)
    plt.title('ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.show()

In [118]:
def plot_feature_importance(rfc, feature_names):
    importances = rfc.feature_importances_
    indices = np.argsort(importances)[::-1]
    names = [feature_names[i] for i in indices]
    plt.figure()
    plt.title("Feature Importance")
    plt.bar(range(len(feature_names)), importances[indices])
    plt.xticks(range(len(feature_names)), names, rotation=90)
    plt.show()

### Dataset

In [136]:
# drive_path = '/content/drive/MyDrive/TA/Dataset/'
drive_path = '../dataset/'
dataset_name = 'data_e'
df = pd.read_csv(drive_path + dataset_name + '.csv')

In [137]:
target_col = 'label'

# df[target_col] = df[target_col].map({1: 1, 0: -1})  
# # Identify columns with boolean data type
# bool_columns = df.select_dtypes(include=['bool']).columns
# # Convert boolean columns to integer
# df[bool_columns] = df[bool_columns].astype(int)

In [138]:
df.head()

Unnamed: 0,account_id,label,order_count_with_promo_category_0,order_count_with_promo_category_1,order_count_with_promo_category_> 1,price_amount_category_0-280,price_amount_category_281-870,price_amount_category_871-2775,price_amount_category_> 2775,promo_amount_category_0-16,...,similar_email_category_1,similar_email_category_2,similar_email_category_3,similar_email_category_4,similar_email_category_5,similar_email_category_> 5,similar_device_category_0,similar_device_category_1,similar_device_category_2,similar_device_category_> 2
0,34634632,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,27536039,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,29164748,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,28115239,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,33491857,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [139]:
df['label'].value_counts()

0    68416
1     3128
Name: label, dtype: int64

In [140]:
df['label'].value_counts(normalize=True)

0    0.956279
1    0.043721
Name: label, dtype: float64

## Splitting the data

In [141]:
# Separate features (X) and target variable (y)
X = df.drop(['label','account_id'], axis=1)
y = df['label']

In [142]:
# print(len(X_train.columns))
# print(len(X_val.columns))
# print(len(X_test.columns))
# print(len(X_test_imbalanced.columns))

In [143]:
# print(len(y_train))
# print(len(y_val))
# print(len(y_test))
# print(len(y_test_imbalanced))

In [144]:
# y_train.value_counts()

In [145]:
# y_val.value_counts()

In [146]:
# y_test.value_counts()

In [147]:
# y_test_imbalanced.value_counts()

# Modeling

## SMOTE

In [148]:
# Define the model
rf = RandomForestClassifier(
    random_state=42
    )

smote = SMOTE()

print(run_iterations(rf, smote, X, y))

68416 3128
42926 14309 14309
42926 14309 14309
Number of examples in the training set:  42926
Number of examples in the resampled dataset (label -1): 41037
Number of examples in the resampled dataset (label 1): 41037
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label -1): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[11290  1588]
 [  755   675]]
68416 3128
42926 14309 14309
42926 14309 14309
Number of examples in the training set:  42926
Number of examples in the resampled dataset (label -1): 41091
Number of examples in the resampled dataset (label 1): 41091
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label -1): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[11257  1621]
 [  778   652]]
68416 3128
42926 14309 14309
42926 14309 14309
Numbe

## ADASYN

In [149]:
# Define the model
rf = RandomForestClassifier(
    random_state=42
    )

adasyn = ADASYN()

print(run_iterations(rf, adasyn, X, y))

68416 3128
42926 14309 14309
42926 14309 14309
Number of examples in the training set:  42926
Number of examples in the resampled dataset (label -1): 41037
Number of examples in the resampled dataset (label 1): 41669
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label -1): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[11292  1586]
 [  756   674]]
68416 3128
42926 14309 14309
42926 14309 14309
Number of examples in the training set:  42926
Number of examples in the resampled dataset (label -1): 41091
Number of examples in the resampled dataset (label 1): 41253
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label -1): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[11217  1661]
 [  777   653]]
68416 3128
42926 14309 14309
42926 14309 14309
Numbe

## ROS

In [150]:
# Define the model
rf = RandomForestClassifier(
    random_state=42
    )

ros = RandomOverSampler()

print(run_iterations(rf, ros, X, y))

68416 3128
42926 14309 14309
42926 14309 14309
Number of examples in the training set:  42926
Number of examples in the resampled dataset (label -1): 41037
Number of examples in the resampled dataset (label 1): 41037
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label -1): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[11233  1645]
 [  739   691]]
68416 3128
42926 14309 14309
42926 14309 14309
Number of examples in the training set:  42926
Number of examples in the resampled dataset (label -1): 41091
Number of examples in the resampled dataset (label 1): 41091
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label -1): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[11119  1759]
 [  741   689]]
68416 3128
42926 14309 14309
42926 14309 14309
Numbe

## RUS

In [151]:
# Define the model
rf = RandomForestClassifier(
    random_state=42
    )

rus = RandomUnderSampler()

print(run_iterations(rf, rus, X, y))

68416 3128
42926 14309 14309
42926 14309 14309
Number of examples in the training set:  42926
Number of examples in the resampled dataset (label -1): 1889
Number of examples in the resampled dataset (label 1): 1889
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label -1): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[11197  1681]
 [  736   694]]
68416 3128
42926 14309 14309
42926 14309 14309
Number of examples in the training set:  42926
Number of examples in the resampled dataset (label -1): 1835
Number of examples in the resampled dataset (label 1): 1835
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label -1): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[11159  1719]
 [  741   689]]
68416 3128
42926 14309 14309
42926 14309 14309
Number of

## SMOTEENN

In [152]:
# Define the model
rf = RandomForestClassifier(
    random_state=42
    )

smoteenn = SMOTEENN()

print(run_iterations(rf, smoteenn, X, y))

68416 3128
42926 14309 14309
42926 14309 14309
Number of examples in the training set:  42926
Number of examples in the resampled dataset (label -1): 39905
Number of examples in the resampled dataset (label 1): 1893
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label -1): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[12651   227]
 [ 1134   296]]
68416 3128
42926 14309 14309
42926 14309 14309
Number of examples in the training set:  42926
Number of examples in the resampled dataset (label -1): 40039
Number of examples in the resampled dataset (label 1): 1822
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label -1): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[12828    50]
 [ 1360    70]]
68416 3128
42926 14309 14309
42926 14309 14309
Number 