In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import time

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# For resampling
from collections import Counter
from sklearn.datasets import make_classification
# from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN, ADASYN, RandomOverSampler
# from imblearn.under_sampling import NearMiss, RandomUnderSampler
# from imblearn.combine import SMOTEENN
from sklearn.utils import resample

# Ensemble Classifiers
from sklearn.ensemble import VotingClassifier, StackingClassifier

# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, KFold

# Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score, log_loss, classification_report, confusion_matrix

# from xgboost import plot_importance, to_graphviz

# from google.colab import files

In [2]:
# from google.colab import auth, drive
# from google.auth import default

In [3]:
# drive.mount('/content/drive',force_remount=True)

### Dataset

In [4]:
# drive_path = '/content/drive/MyDrive/TA/Dataset/'
drive_path = '../dataset/'
dataset_name = 'data_b'
df = pd.read_csv(drive_path + dataset_name + '.csv')

In [5]:
df.head()

Unnamed: 0,account_id,label,is_phone_number_verified,is_email_verified,is_reseller_flag,email_category_gmail,email_category_hotmail,email_category_icloud,email_category_live,email_category_others,...,promo_amount,category_f_order_count,category_f_order_count_with_promo,category_f_price_amount,category_f_promo_amount,similar_name_count,similar_email_count,similar_phone_number_count,similar_device_count,similar_birth_date_count
0,34634632,1,1,0,False,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,27536039,0,1,0,False,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.088889
2,29164748,0,0,0,False,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28115239,0,1,0,False,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,33491857,0,1,0,False,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71544 entries, 0 to 71543
Data columns (total 33 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   account_id                         71544 non-null  int64  
 1   label                              71544 non-null  int64  
 2   is_phone_number_verified           71544 non-null  int64  
 3   is_email_verified                  71544 non-null  int64  
 4   is_reseller_flag                   71544 non-null  bool   
 5   email_category_gmail               71544 non-null  float64
 6   email_category_hotmail             71544 non-null  float64
 7   email_category_icloud              71544 non-null  float64
 8   email_category_live                71544 non-null  float64
 9   email_category_others              71544 non-null  float64
 10  email_category_outlook             71544 non-null  float64
 11  email_category_rocketmail          71544 non-null  flo

In [7]:
df['label'].value_counts()

0    68416
1     3128
Name: label, dtype: int64

In [8]:
df['label'].value_counts(normalize=True)

0    0.956279
1    0.043721
Name: label, dtype: float64

## Splitting the data

In [9]:
# Separate features (X) and target variable (y)
X = df.drop(['label','account_id'], axis=1)
y = df['label']


In [10]:
# print(len(X_train.columns))
# print(len(X_val.columns))
# print(len(X_test.columns))
# print(len(X_test_imbalanced.columns))

In [11]:
# print(len(y_train))
# print(len(y_val))
# print(len(y_test))
# print(len(y_test_imbalanced))

In [12]:
# y_train.value_counts()

In [13]:
# y_val.value_counts()

In [14]:
# y_test.value_counts()

In [15]:
# y_test_imbalanced.value_counts()

# Functions

In [16]:
def print_classification_metrics(y_true, y_pred):
    # Compute the accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print("Accuracy: {:.3f}".format(accuracy))
    # Compute the precision
    precision = precision_score(y_true, y_pred)
    print("Precision: {:.3f}".format(precision))
    # Compute the recall or sensitivity
    recall = recall_score(y_true, y_pred)
    print("Recall: {:.3f}".format(recall))
    # Compute the F1 score
    f1 = f1_score(y_true, y_pred)
    print("F1 score: {:.3f}".format(f1))
    # Compute the roc auc score
    auc = roc_auc_score(y_true,y_pred)
    print("AUC score: {:.3f}".format(auc))
    # Compute the log lossscore
    loss = log_loss(y_true,y_pred)
    print("Log loss: {:.3f}".format(loss))

    # Assume y_true and y_pred are the true and predicted labels for a binary classification problem
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    # Calculate TPR and TNR
    tpr = tp / (tp + fn) # Sensitivity
    tnr = tn / (tn + fp) # Specificity
    # Calculate G-mean
    gmean = np.sqrt(tpr * tnr)
    print("G-mean: {:.3f}".format(gmean))

    print("Specificity: {:.3f}".format(tnr))

    # Print classification report and G-mean
    print(classification_report(y_true, y_pred))
    
    print("{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}".format(accuracy,precision,recall,f1,auc,loss,tnr))

    print(confusion_matrix(y_true,y_pred))

In [17]:
def train_and_evaluate_model(model, X, y, random_state):
    ### SPLITTING THE DATA ###
    # Separate minority and majority classes
    minority_class = df[df['label'] == 1]
    majority_class = df[df['label'] == 0]

    # # Undersample majority class
    # undersampled_majority_class = resample(majority_class, 
    #                                       replace=False, 
    #                                       n_samples=len(minority_class),
    #                                       )

    # # Combine minority class with undersampled majority class
    # undersampled_data = pd.concat([minority_class, undersampled_majority_class])

    # # Split the undersampled data into training, validation, and test sets
    # X_undersampled = undersampled_data.drop(['label','account_id'], axis=1)
    # y_undersampled = undersampled_data['label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state = random_state)

    # Print the number of examples in each set
    print("Number of examples in the training set: ", len(X_train))
    print("Number of examples in the validation set: ", len(X_val))
    print("Number of examples in the test set: ", len(X_test))
  
    # Calculate the desired number of samples for each class based on the proportion in the test dataset
    desired_majority_samples = int(len(X_test) * 0.90)
    desired_minority_samples = int(len(X_test) * 0.10)

    # Resample the majority class in the test dataset
    resampled_majority_class = resample(majority_class,
                                        replace=True,
                                        n_samples=desired_majority_samples,
                                        random_state = random_state
                                        )

    # Sample the minority class in the test dataset
    sampled_minority_class = resample(minority_class,
                                      replace=True,
                                      n_samples=desired_minority_samples,
                                      random_state = random_state
                                      )

    test_imbalanced = pd.concat([resampled_majority_class, sampled_minority_class])
    X_test_imbalanced = test_imbalanced.drop(['label', 'account_id'], axis=1)
    y_test_imbalanced= test_imbalanced['label']

    # Print the number of examples in each class in the imbalanced test dataset
    print("Number of examples in the imbalanced test dataset (label 0):", len(y_test_imbalanced[y_test_imbalanced == 0]))
    print("Number of examples in the imbalanced test dataset (label 1):", len(y_test_imbalanced[y_test_imbalanced == 1]))
      
    ### MODELLING ###
    start_training_time = time.time()
    model.fit(X_train, y_train)
    stop_training_time = time.time()
    training_time = stop_training_time - start_training_time

    ### RESULTS ###
    ## Val
    start_pred_val = time.time()
    y_pred_val = model.predict(X_val)
    stop_pred_val = time.time()
    pred_val_time = stop_pred_val - start_pred_val

    accuracy_val = accuracy_score(y_val, y_pred_val)
    precision_val = precision_score(y_val, y_pred_val)
    recall_val = recall_score(y_val, y_pred_val)
    f1_val = f1_score(y_val, y_pred_val)
    auc_val = roc_auc_score(y_val,y_pred_val)
    loss_val = log_loss(y_val,y_pred_val)
    # Assume y_val and y_pred are the true and predicted labels for a binary classification problem
    tn_val, fp_val, fn_val, tp_val = confusion_matrix(y_val, y_pred_val).ravel()
    # Calculate TPR and TNR
    tpr_val = tp_val / (tp_val + fn_val) # Sensitivity
    tnr_val = tn_val / (tn_val + fp_val) # Specificity
    # Calculate G-mean
    gmean_val = np.sqrt(tpr_val * tnr_val) 
    ## Test
    start_pred_test = time.time()
    y_pred_test = model.predict(X_test)
    stop_pred_test = time.time()
    pred_test_time = stop_pred_test - start_pred_test

    accuracy_test = accuracy_score(y_test, y_pred_test)
    precision_test = precision_score(y_test, y_pred_test)
    recall_test = recall_score(y_test, y_pred_test)
    f1_test = f1_score(y_test, y_pred_test)
    auc_test = roc_auc_score(y_test,y_pred_test)
    loss_test = log_loss(y_test,y_pred_test)
    # Assume y_test and y_pred are the true and predicted labels for a binary classification problem
    tn_test, fp_test, fn_test, tp_test = confusion_matrix(y_test, y_pred_test).ravel()
    # Calculate TPR and TNR
    tpr_test = tp_test / (tp_test + fn_test) # Sensitivity
    tnr_test = tn_test / (tn_test + fp_test) # Specificity
    # Calculate G-mean
    gmean_test = np.sqrt(tpr_test * tnr_test)
    
    ## test imbalanced
    start_pred_test_imbalanced = time.time()
    y_pred_test_imbalanced = model.predict(X_test_imbalanced)
    stop_pred_test_imbalanced = time.time()
    pred_test_imbalanced_time = stop_pred_test_imbalanced - start_pred_test_imbalanced

    accuracy_test_imbalanced = accuracy_score(y_test_imbalanced, y_pred_test_imbalanced)
    precision_test_imbalanced = precision_score(y_test_imbalanced, y_pred_test_imbalanced)
    recall_test_imbalanced = recall_score(y_test_imbalanced, y_pred_test_imbalanced)
    f1_test_imbalanced = f1_score(y_test_imbalanced, y_pred_test_imbalanced)
    auc_test_imbalanced = roc_auc_score(y_test_imbalanced,y_pred_test_imbalanced)
    loss_test_imbalanced = log_loss(y_test_imbalanced,y_pred_test_imbalanced)
    # Assume y_test_imbalanced and y_pred are the true and predicted labels for a binary classification problem
    tn_test_imbalanced, fp_test_imbalanced, fn_test_imbalanced, tp_test_imbalanced = confusion_matrix(y_test_imbalanced, y_pred_test_imbalanced).ravel()
    # Calculate TPR and TNR
    tpr_test_imbalanced = tp_test_imbalanced / (tp_test_imbalanced + fn_test_imbalanced) # Sensitivity
    tnr_test_imbalanced = tn_test_imbalanced / (tn_test_imbalanced + fp_test_imbalanced) # Specificity
    # Calculate G-mean
    gmean_test_imbalanced = np.sqrt(tpr_test_imbalanced * tnr_test_imbalanced) 

    print(confusion_matrix(y_test_imbalanced,y_pred_test_imbalanced))
    val_results = [accuracy_val, precision_val, recall_val, f1_val, auc_val, loss_val, tnr_val]
    test_results = [accuracy_test, precision_test, recall_test, f1_test, auc_test, loss_test, tnr_test]
    test_imbalanced_results = [accuracy_test_imbalanced, precision_test_imbalanced, recall_test_imbalanced, f1_test_imbalanced, auc_test_imbalanced, loss_test_imbalanced, tnr_test_imbalanced]

    time_results = [training_time, pred_val_time, pred_test_time, pred_test_imbalanced_time]
    return val_results, test_results, test_imbalanced_results, time_results
    # return accuracy, precision, recall, f1, auc, loss, tnr   

In [18]:
def run_iterations(model, X, y, iterations=5):
    results = []
    random_state = [12,23,34,45,56]
    for i in range(len(random_state)):
        val_results, test_results, test_imbalanced_results, time_results = train_and_evaluate_model(model, X, y, random_state[i])
        concatted_results = val_results + test_results + test_imbalanced_results + time_results
    
        results.append(concatted_results)

    columns = ['Val Accuracy', 'Val Precision', 'Val Recall', 'Val F1', 'Val AUC', 'Val Loss', 'Val Specificity',
               'Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1', 'Test AUC', 'Test Loss', 'Test Specificity',
               'Test Imbalanced Accuracy', 'Test Imbalanced Precision', 'Test Imbalanced Recall', 'Test Imbalanced F1', 'Test Imbalanced AUC', 'Test Imbalanced Loss', 'Test Imbalanced Specificity',
               'Training Time', 'Pred Val Time', 'Pred Test Time', 'Pred Test Imbalanced Time'
               ]
    df = pd.DataFrame(results, columns=columns)
#     print(df)
    stats = df.describe().loc[['mean']]
    # stats = df.describe()
    print(stats)
    return stats.to_csv(sep='\t')

In [19]:
# def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     accuracy = accuracy_score(y_test, y_pred)
#     precision = precision_score(y_test, y_pred)
#     recall = recall_score(y_test, y_pred)
#     f1 = f1_score(y_test, y_pred)
#     auc = roc_auc_score(y_test,y_pred)
#     loss = log_loss(y_test,y_pred)
#     # Assume y_test and y_pred are the true and predicted labels for a binary classification problem
#     tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
#     # Calculate TPR and TNR
#     tpr = tp / (tp + fn) # Sensitivity
#     tnr = tn / (tn + fp) # Specificity
#     # Calculate G-mean
#     gmean = np.sqrt(tpr * tnr)
#     return accuracy, precision, recall, f1, auc, loss, tnr    

In [20]:
# def run_iterations(model, X_train, y_train, X_test, y_test, iterations=5):
#     results = []
#     for i in range(iterations):
#         accuracy, precision, recall, f1, auc, loss, tnr = train_and_evaluate_model(model, X_train, y_train, X_test, y_test)
#         results.append([accuracy, precision, recall, f1, auc, loss, tnr])
#     columns = ['Accuracy', 'Precision', 'Recall', 'F1', 'AUC', 'Loss', 'Specificity']
#     df = pd.DataFrame(results, columns=columns)
#     print(df)
#     median = df.median()
#     return median.to_csv(sep='\t')

In [21]:
def plot_auc(y_true, y_scores):
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    roc_auc = auc(fpr, tpr)
    
    sns.set_style("darkgrid")
    plt.figure(figsize=(8, 6))
    sns.lineplot(fpr, tpr, label='AUC = %0.2f' % roc_auc)
    plt.title('ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.show()

In [22]:
def plot_feature_importance(rfc, feature_names):
    importances = rfc.feature_importances_
    indices = np.argsort(importances)[::-1]
    names = [feature_names[i] for i in indices]
    plt.figure()
    plt.title("Feature Importance")
    plt.bar(range(len(feature_names)), importances[indices])
    plt.xticks(range(len(feature_names)), names, rotation=90)
    plt.show()

In [23]:
# df_2_train = pd.concat([X_train, y_train], axis=1)
# df_2_test = pd.concat([X_test, y_test],axis=1)

In [24]:
# from google.colab import files

# # Save the DataFrame to a CSV file
# df_2_test.to_csv('df_2_test.csv', index=False)

# # Download the CSV file to your local machine
# files.download('df_2_test.csv')

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71544 entries, 0 to 71543
Data columns (total 33 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   account_id                         71544 non-null  int64  
 1   label                              71544 non-null  int64  
 2   is_phone_number_verified           71544 non-null  int64  
 3   is_email_verified                  71544 non-null  int64  
 4   is_reseller_flag                   71544 non-null  bool   
 5   email_category_gmail               71544 non-null  float64
 6   email_category_hotmail             71544 non-null  float64
 7   email_category_icloud              71544 non-null  float64
 8   email_category_live                71544 non-null  float64
 9   email_category_others              71544 non-null  float64
 10  email_category_outlook             71544 non-null  float64
 11  email_category_rocketmail          71544 non-null  flo

In [26]:
counts = df.isna().sum()
selected_data = df.columns.isin(counts[(counts > 0 )].index)
selected_data

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False])

In [27]:
len(df)

71544

In [28]:
# X[X['day_to_first_transaction'].isna()]

# Modeling

## Base Model (Random Forest Classifier)

In [35]:
# Define the model
rf = RandomForestClassifier(
    
    )

print(run_iterations(rf, X, y))

Number of examples in the training set:  42926
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label 0): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[12836    42]
 [  497   933]]
Number of examples in the training set:  42926
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label 0): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[12839    39]
 [  561   869]]
Number of examples in the training set:  42926
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label 0): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[12848    30]
 [  526   904]]
Number of examples in the training set:  42926
Number of examples in the validation set:  1

## Using Class Weight Balanced

In [30]:
# Define the model
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
print(run_iterations(rf, X, y))

Number of examples in the training set:  42926
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label 0): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[12818    60]
 [  479   951]]
Number of examples in the training set:  42926
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label 0): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[12821    57]
 [  545   885]]
Number of examples in the training set:  42926
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label 0): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[12816    62]
 [  504   926]]
Number of examples in the training set:  42926
Number of examples in the validation set:  1

## With hyperparameter

In [31]:
# Define the model
rf = RandomForestClassifier(random_state=42, class_weight='balanced', 
                            max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=200)
print(run_iterations(rf, X, y))

Number of examples in the training set:  42926
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label 0): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[12731   147]
 [  427  1003]]
Number of examples in the training set:  42926
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label 0): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[12739   139]
 [  484   946]]
Number of examples in the training set:  42926
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label 0): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[12734   144]
 [  459   971]]
Number of examples in the training set:  42926
Number of examples in the validation set:  1

In [32]:
# # define the hyperparameter grid to search over

# # Number of trees in random forest
# # n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# n_estimators = [50,100,200]
# # Number of features to consider at every split
# # max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [2,4,6,8,None]
# # max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# # max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]

# param_grid = {
#     'n_estimators': n_estimators,
#     # 'max_features': max_features,
#     'max_depth': max_depth,
#     'min_samples_split': min_samples_split,
#     'min_samples_leaf': min_samples_leaf,
#     'bootstrap': bootstrap
# }

# # create a random forest classifier object
# rf = RandomForestClassifier(random_state=42)

# # create a grid search object
# grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='f1', verbose=2)

# # fit the grid search object to the data
# grid_search.fit(X_train, y_train)

# # print the best hyperparameters and corresponding f1 score
# print('Best hyperparameters:', grid_search.best_params_)
# print('Best F1 score:', grid_search.best_score_)

In [33]:
# data_testing_01
# Best hyperparameters: {'bootstrap': False, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}


# data_full_1
# Best hyperparameters: {'bootstrap': False, 'max_depth': 6, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}


## Hyperparameter 2

In [34]:
# Define the model
rf = RandomForestClassifier(random_state=42, class_weight='balanced', 
                            bootstrap = False, max_depth= 6, min_samples_leaf = 2, min_samples_split = 2, n_estimators = 100
                              # bootstrap = False, max_depth= 10, min_samples_leaf = 4, min_samples_split = 2, n_estimators = 100
)
print(run_iterations(rf, X, y))

Number of examples in the training set:  42926
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label 0): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[11526  1352]
 [  710   720]]
Number of examples in the training set:  42926
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label 0): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[11522  1356]
 [  734   696]]
Number of examples in the training set:  42926
Number of examples in the validation set:  14309
Number of examples in the test set:  14309
Number of examples in the imbalanced test dataset (label 0): 12878
Number of examples in the imbalanced test dataset (label 1): 1430
[[11556  1322]
 [  734   696]]
Number of examples in the training set:  42926
Number of examples in the validation set:  1