In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score, average_precision_score
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, PowerTransformer
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
import seaborn as sns
from sklearn import preprocessing

In [2]:
# Load the dataset
data = pd.read_csv('creditcard.csv')

# Preprocessing
X = data.drop(['Class', 'Time'], axis=1)
y = data['Class']
scaler = StandardScaler()
X[['Amount']] = scaler.fit_transform(X[['Amount']])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=False, random_state=None)

In [3]:
# Skewness correction
pt = preprocessing.PowerTransformer(method='yeo-johnson', copy=True)
pt.fit(X_train)
X_train_pt = pt.transform(X_train)
X_test_pt = pt.transform(X_test)


In [4]:
# Resampling techniques
resampling_methods = {
    'None': (X_train, y_train),
    'SMOTE': SMOTE(random_state=0).fit_resample(X_train, y_train),
    'SMOTE_PT': SMOTE(random_state=0).fit_resample(X_train_pt, y_train),
    'RandomUnderSampler': RandomUnderSampler(random_state=0).fit_resample(X_train, y_train),
    'ADASYN': ADASYN(random_state=0).fit_resample(X_train, y_train)
}

In [5]:
# Classifiers
classifiers = {
    'XGBoost': (XGBClassifier(n_jobs=-1, objective='binary:logistic', eval_metric='auc'), {
        'max_depth': [5, 10],
        'learning_rate': [0.1, 0.3, 0.8],
        'subsample': [0.9],
        'n_estimators': [200,250,300],
    }),
    'RandomForest': (RandomForestClassifier(n_jobs=-1), {
        'n_estimators': [100, 300],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 4, 6],
        'min_samples_leaf': [1, 3],
    }),
}

In [6]:
# DataFrame to store the results
results_df = pd.DataFrame(columns=['Classifier', 'Resampling', 'AUPRC', 'F1', 'Recall', 'Precision', 'Best Parameters', 'Time'])


In [7]:
# Train and evaluate classifiers
row_index = 0
for resampling_name, (X_train_resampled, y_train_resampled) in resampling_methods.items():
    for classifier_name, (classifier, params) in classifiers.items():
        start_time = time.time()
        print(f"Training {classifier_name} classifier using {resampling_name} resampling...")
        grid_search = GridSearchCV(classifier, params, scoring='average_precision', cv=stratified_kfold, verbose=2, n_jobs=-1)
        grid_search.fit(X_train_resampled, y_train_resampled)
        y_pred_proba = grid_search.predict_proba(X_test)[:, 1]
        y_pred = grid_search.predict(X_test)
        auprc = average_precision_score(y_true=y_test, y_score=y_pred_proba)
        f1 = f1_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Finished training {classifier_name} classifier using {resampling_name} resampling. Elapsed time: {elapsed_time:.2f} seconds.")
        results_df.loc[row_index] = [classifier_name, resampling_name, auprc, f1, recall, precision, grid_search.best_params_, elapsed_time]
        row_index += 1

Training XGBoost classifier using None resampling...
Fitting 3 folds for each of 18 candidates, totalling 54 fits
Finished training XGBoost classifier using None resampling. Elapsed time: 281.75 seconds.
Training RandomForest classifier using None resampling...
Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] END learning_rate=0.8, max_depth=10, n_estimators=200, subsample=0.9; total time= 1.2min
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 1.7min




[CV] END learning_rate=0.3, max_depth=5, n_estimators=250, subsample=0.9; total time= 2.6min
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=6, n_estimators=100; total time= 1.9min
[CV] END learning_rate=0.8, max_depth=5, n_estimators=300, subsample=0.9; total time= 1.9min
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time= 2.0min
[CV] END learning_rate=0.3, max_depth=5, n_estimators=250, subsample=0.9; total time= 2.2min
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=100; total time= 2.0min
[CV] END learning_rate=0.3, max_depth=5, n_estimators=300, subsample=0.9; total time= 2.4min
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=4, n_estimators=100; total time= 2.1min
[CV] END learning_rate=0.3, max_depth=10, n_estimators=200, subsample=0.9; total time= 2.2min
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=100; total time= 2.1min
[CV] END learning_rate=0.8, max_de



[CV] END max_depth=10, min_samples_leaf=3, min_samples_split=2, n_estimators=100; total time= 1.7min
[CV] END learning_rate=0.1, max_depth=10, n_estimators=250, subsample=0.9; total time=13.1min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 6.9min
[CV] END max_depth=10, min_samples_leaf=3, min_samples_split=2, n_estimators=100; total time= 1.6min
[CV] END learning_rate=0.1, max_depth=10, n_estimators=250, subsample=0.9; total time=13.1min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 6.8min
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time= 5.2min
[CV] END max_depth=10, min_samples_leaf=3, min_samples_split=4, n_estimators=100; total time= 3.5min
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=4, n_estimators=100; total time= 4.5min
[CV] END max_depth=10, min_samples_leaf=3, min_samples_split=2, n_estimators=100; total time= 4.4min
[C



[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time= 8.4min
[CV] END learning_rate=0.1, max_depth=10, n_estimators=250, subsample=0.9; total time=12.9min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.6min
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=300; total time= 8.6min
[CV] END learning_rate=0.1, max_depth=10, n_estimators=300, subsample=0.9; total time=15.9min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time= 8.3min
[CV] END max_depth=None, min_samples_leaf=3, min_samples_split=2, n_estimators=100; total time= 8.2min
[CV] END learning_rate=0.1, max_depth=10, n_estimators=200, subsample=0.9; total time=13.2min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 8.6min
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=100; total time= 4.2min
[CV



Finished training RandomForest classifier using SMOTE_PT resampling. Elapsed time: 816.33 seconds.
Training XGBoost classifier using RandomUnderSampler resampling...
Fitting 3 folds for each of 18 candidates, totalling 54 fits
Finished training XGBoost classifier using RandomUnderSampler resampling. Elapsed time: 4.88 seconds.
Training RandomForest classifier using RandomUnderSampler resampling...
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Finished training RandomForest classifier using RandomUnderSampler resampling. Elapsed time: 2.73 seconds.
Training XGBoost classifier using ADASYN resampling...
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END max_depth=None, min_samples_leaf=3, min_samples_split=6, n_estimators=100; total time= 9.4min
[CV] END learning_rate=0.1, max_depth=10, n_estimators=300, subsample=0.9; total time=   0.5s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=6, n_estimators=300; total time=   1.5s
[CV] END max_dep

In [8]:
# Print the results
print("Classifier performance (AUPRC, F1 Score, Recall, Precision), best parameters, and time:")
results_df.head(len(resampling_methods) * len(classifiers))

Classifier performance (AUPRC, F1 Score, Recall, Precision), best parameters, and time:


Unnamed: 0,Classifier,Resampling,AUPRC,F1,Recall,Precision,Best Parameters,Time
0,XGBoost,,0.879011,0.854054,0.806122,0.908046,"{'learning_rate': 0.3, 'max_depth': 5, 'n_esti...",281.747022
1,RandomForest,,0.86576,0.864865,0.816327,0.91954,"{'max_depth': None, 'min_samples_leaf': 3, 'mi...",397.724832
2,XGBoost,SMOTE,0.882779,0.835821,0.857143,0.815534,"{'learning_rate': 0.1, 'max_depth': 10, 'n_est...",1034.077762
3,RandomForest,SMOTE,0.877281,0.827225,0.806122,0.849462,"{'max_depth': None, 'min_samples_leaf': 1, 'mi...",753.278762
4,XGBoost,SMOTE_PT,0.825749,0.528875,0.887755,0.376623,"{'learning_rate': 0.3, 'max_depth': 5, 'n_esti...",1032.819158
5,RandomForest,SMOTE_PT,0.784074,0.804348,0.755102,0.860465,"{'max_depth': None, 'min_samples_leaf': 1, 'mi...",816.326142
6,XGBoost,RandomUnderSampler,0.445876,0.054507,0.928571,0.028078,"{'learning_rate': 0.3, 'max_depth': 10, 'n_est...",4.88482
7,RandomForest,RandomUnderSampler,0.826145,0.130909,0.918367,0.070478,"{'max_depth': 5, 'min_samples_leaf': 3, 'min_s...",2.730231
8,XGBoost,ADASYN,0.870232,0.83,0.846939,0.813725,"{'learning_rate': 0.3, 'max_depth': 10, 'n_est...",1084.468952
9,RandomForest,ADASYN,0.860551,0.79397,0.806122,0.782178,"{'max_depth': None, 'min_samples_leaf': 3, 'mi...",720.259642


In [11]:
results_df.iloc[2]['Best Parameters']

{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 300, 'subsample': 0.9}

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time= 9.6min
[CV] END max_depth=None, min_samples_leaf=3, min_samples_split=4, n_estimators=100; total time= 9.6min
[CV] END max_depth=None, min_samples_leaf=3, min_samples_split=2, n_estimators=100; total time= 9.6min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time= 9.6min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time= 9.8min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=4, n_estimators=300; total time=10.0min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=6, n_estimators=300; total time=10.1min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=10.4min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=6, n_estimators=300; total time=10.7min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300;