In [1]:
# %load_ext cuml.accel

In [2]:
# !pip install imblearn

In [3]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, EditedNearestNeighbours
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score,
    confusion_matrix, balanced_accuracy_score
)
from imblearn.metrics import geometric_mean_score
import time


In [2]:
!curl -L -o ../data/creditcardfraud.zip https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
100 65.9M  100 65.9M    0     0  2343k      0  0:00:28  0:00:28 --:--:-- 2883k


In [None]:
!unzip ../data/creditcardfraud.zip 

Archive:  ../data/creditcardfraud.zip
caution: filename not matched:  ../data/


In [4]:
df = pd.read_csv('../data/creditcard.csv')

In [5]:
scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(df['Amount'].values.reshape(-1, 1))
df['Time'] = scaler.fit_transform(df['Time'].values.reshape(-1, 1))

In [6]:
X = df.drop('Class', axis=1)
y = df['Class']

In [7]:
RANDOM_SEED = 42
CV_FOLDS = 5

In [8]:
samplers = {
    'None': None,
    'Random OverSampling': RandomOverSampler(random_state=RANDOM_SEED),
    'SMOTE': SMOTE(random_state=RANDOM_SEED),
    'ADASYN': ADASYN(random_state=RANDOM_SEED),
    'SMOTETomek': SMOTETomek(random_state=RANDOM_SEED)
}

In [9]:
# Define standard classification models (without built-in class balancing)
models = {
    'XGBoost': xgb.XGBClassifier(objective='binary:logistic', n_jobs=-1, random_state=42),
    'XGBoost_b': xgb.XGBClassifier(objective='binary:logistic', n_jobs=-1, random_state=42)
}

In [10]:
def evaluate_models_with_sampling(samplers, models, X, y):
    """
    Evaluate combinations of sampling methods and classification models using cross-validation.

    Parameters:
    -----------
    samplers : dict
        Dictionary of sampling methods with name as key and sampler object as value
    std_models : dict
        Dictionary of standard classification models
    bal_models : dict
        Dictionary of classification models with built-in class balancing
    X : DataFrame
        Feature matrix
    y : Series
        Target vector

    Returns:
    --------
    detailed_results : DataFrame
        Detailed results for each fold, sampler, and model
    summary_results : DataFrame
        Aggregated results across folds with mean and std for each metric
    """
    # Initialize stratified K-fold cross-validation
    stratified_kfold = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_SEED)
    detailed_results = []

    # Print experiment setup information
    print(f"Starting evaluation with {CV_FOLDS}-fold cross-validation")
    print(f"Number of samples: {len(X)}")
    print(f"Class distribution: {dict(y.value_counts())}")
    print(f"Class imbalance ratio: 1:{y.value_counts()[0]/y.value_counts()[1]:.1f}")
    print("-" * 50)

    # Loop through each fold
    for fold_idx, (train_idx, test_idx) in enumerate(stratified_kfold.split(X, y), 1):
        fold_start_time = time.time()
        print(f"\n[FOLD {fold_idx}/{CV_FOLDS}]")

        # Split data for this fold
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")
        print(f"Training class distribution: {dict(y_train.value_counts())}")

        # Process each sampling method
        for sampler_name, sampler in samplers.items():
            print(f"\n[Sampler: {sampler_name}]")

            # Apply sampling method if not None
            if sampler_name == 'None':
                X_resampled, y_resampled = X_train, y_train
                print("No resampling applied")
                # For no resampling, use both standard and balanced models
                models_to_use = models
            else:
                sampler_start_time = time.time()
                X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
                print(f"Resampling completed in {time.time() - sampler_start_time:.2f}s")
                print(f"Resampled class distribution: {dict(pd.Series(y_resampled).value_counts())}")
                # For explicit resampling, use only standard models
                models_to_use = models

            # Train and evaluate each model
            for model_name, model in models_to_use.items():
                model_start_time = time.time()
                print(f"  Training {model_name}...", end=" ")
                if model_name == 'XGBoost_b':
                    scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
                    model = xgb.XGBClassifier(objective='binary:logistic', random_state=42, n_jobs=-1, scale_pos_weight=scale_pos_weight)
                # Train the model
                model.fit(X_resampled, y_resampled)

                # Make predictions
                y_pred = model.predict(X_test)
                y_prob = model.predict_proba(X_test)[:, 1]
                model_training_time = time.time() - model_start_time

                # Calculate performance metrics
                precision = precision_score(y_test, y_pred, zero_division=0)
                recall = recall_score(y_test, y_pred, zero_division=0)
                f1 = f1_score(y_test, y_pred, zero_division=0)
                auc_roc = roc_auc_score(y_test, y_prob)
                auc_pr = average_precision_score(y_test, y_prob)
                gmean = geometric_mean_score(y_test, y_pred, average='binary')
                bal_acc = balanced_accuracy_score(y_test, y_pred)
                conf_matrix = confusion_matrix(y_test, y_pred)

                print(f"done in {model_training_time:.2f}s")
                print(f"  Metrics: F1={f1:.4f}, Recall={recall:.4f}, AUC-ROC={auc_roc:.4f}")

                # Store results
                detailed_results.append({
                    'fold': fold_idx,
                    'sampler': sampler_name,
                    'model': model_name,
                    'precision': precision,
                    'recall': recall,
                    'f1_score': f1,
                    'auc_roc': auc_roc,
                    'auc_pr': auc_pr,
                    'gmean': gmean,
                    'balanced_accuracy': bal_acc,
                    'confusion_matrix': conf_matrix,
                    'training_time': model_training_time,
                    'fold_time': None  # Will be filled later
                })

        # Calculate and store fold completion time
        fold_total_time = time.time() - fold_start_time
        print(f"\nFold {fold_idx} completed in {fold_total_time:.2f}s")

        # Update fold time for all results from this fold
        for result in detailed_results:
            if result['fold'] == fold_idx and result['fold_time'] is None:
                result['fold_time'] = fold_total_time

    print("\n" + "=" * 50)
    print("Cross-validation completed!")

    # Convert results to DataFrame
    results_df = pd.DataFrame(detailed_results)

    # Aggregate results across folds
    summary_results = results_df.groupby(['sampler', 'model'])[
        ['precision', 'recall', 'f1_score', 'auc_roc', 'auc_pr',
         'gmean', 'balanced_accuracy', 'training_time']
    ].agg(['mean', 'std'])

    print(f"Total models evaluated: {len(results_df['model'].unique())}")
    print(f"Total samplers evaluated: {len(results_df['sampler'].unique())}")

    return results_df, summary_results

In [11]:
# Execute the evaluation
detailed_results, summary_results = evaluate_models_with_sampling(
    samplers, models, X, y)

Starting evaluation with 5-fold cross-validation
Number of samples: 284807
Class distribution: {0: 284315, 1: 492}
Class imbalance ratio: 1:577.9
--------------------------------------------------

[FOLD 1/5]
Training set size: 227845, Test set size: 56962
Training class distribution: {0: 227452, 1: 393}

[Sampler: None]
No resampling applied
  Training XGBoost... done in 1.70s
  Metrics: F1=0.8000, Recall=0.7273, AUC-ROC=0.9650
  Training XGBoost_b... done in 2.59s
  Metrics: F1=0.8466, Recall=0.8081, AUC-ROC=0.9804

[Sampler: Random OverSampling]
Resampling completed in 0.39s
Resampled class distribution: {0: 227452, 1: 227452}
  Training XGBoost... done in 3.41s
  Metrics: F1=0.8438, Recall=0.8182, AUC-ROC=0.9793
  Training XGBoost_b... done in 3.51s
  Metrics: F1=0.8122, Recall=0.8081, AUC-ROC=0.9586

[Sampler: SMOTE]
Resampling completed in 0.66s
Resampled class distribution: {0: 227452, 1: 227452}
  Training XGBoost... done in 3.01s
  Metrics: F1=0.8155, Recall=0.8485, AUC-ROC=0.

In [12]:
summary_results

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,precision,recall,recall,f1_score,f1_score,auc_roc,auc_roc,auc_pr,auc_pr,gmean,gmean,balanced_accuracy,balanced_accuracy,training_time,training_time
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
sampler,model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
ADASYN,XGBoost,0.695698,0.058332,0.839394,0.022328,0.760125,0.041198,0.977066,0.008427,0.851556,0.0303,0.915825,0.012199,0.919375,0.011211,3.484787,0.549184
ADASYN,XGBoost_b,0.469927,0.032106,0.843434,0.02683,0.602979,0.027522,0.967161,0.007384,0.834959,0.04473,0.917534,0.014461,0.920889,0.013398,4.261705,0.927137
,XGBoost,0.896964,0.043135,0.770398,0.032725,0.828476,0.031495,0.944568,0.021605,0.795835,0.038207,0.877497,0.01864,0.885122,0.016375,2.133231,0.418297
,XGBoost_b,0.913402,0.050737,0.819068,0.024587,0.863221,0.031184,0.979543,0.009114,0.859004,0.034371,0.90488,0.013613,0.909466,0.01231,2.237164,0.346303
Random OverSampling,XGBoost,0.91968,0.041203,0.831251,0.02779,0.873107,0.032346,0.976356,0.01132,0.860496,0.025552,0.911571,0.015252,0.915562,0.013921,3.482533,0.552251
Random OverSampling,XGBoost_b,0.842009,0.040902,0.825211,0.01656,0.833105,0.02261,0.962919,0.014353,0.841472,0.024093,0.908252,0.009132,0.91247,0.008282,3.679497,0.428498
SMOTE,XGBoost,0.751159,0.04586,0.845475,0.01872,0.795162,0.032726,0.974582,0.007672,0.855723,0.026337,0.919228,0.010213,0.922493,0.009399,4.005474,0.974521
SMOTE,XGBoost_b,0.515257,0.027571,0.849577,0.019595,0.641057,0.022267,0.96574,0.013687,0.839124,0.026029,0.921036,0.010581,0.924094,0.009782,3.689637,0.373891
SMOTETomek,XGBoost,0.751159,0.04586,0.845475,0.01872,0.795162,0.032726,0.974582,0.007672,0.855723,0.026337,0.919228,0.010213,0.922493,0.009399,4.205989,0.378815
SMOTETomek,XGBoost_b,0.515257,0.027571,0.849577,0.019595,0.641057,0.022267,0.96574,0.013687,0.839124,0.026029,0.921036,0.010581,0.924094,0.009782,4.146834,0.55532


In [13]:
detailed_results

Unnamed: 0,fold,sampler,model,precision,recall,f1_score,auc_roc,auc_pr,gmean,balanced_accuracy,confusion_matrix,training_time,fold_time
0,1,,XGBoost,0.888889,0.727273,0.8,0.96497,0.800955,0.852735,0.863557,"[[56854, 9], [27, 72]]",1.697522,187.739557
1,1,,XGBoost_b,0.888889,0.808081,0.846561,0.980448,0.843396,0.898854,0.903952,"[[56853, 10], [19, 80]]",2.594723,187.739557
2,1,Random OverSampling,XGBoost,0.870968,0.818182,0.84375,0.979334,0.847629,0.904439,0.908985,"[[56851, 12], [18, 81]]",3.414865,187.739557
3,1,Random OverSampling,XGBoost_b,0.816327,0.808081,0.812183,0.958619,0.823839,0.898791,0.903882,"[[56845, 18], [19, 80]]",3.505185,187.739557
4,1,SMOTE,XGBoost,0.785047,0.848485,0.815534,0.963538,0.854276,0.920946,0.92404,"[[56840, 23], [15, 84]]",3.009512,187.739557
5,1,SMOTE,XGBoost_b,0.535948,0.828283,0.650794,0.944815,0.833633,0.909532,0.913517,"[[56792, 71], [17, 82]]",3.168658,187.739557
6,1,ADASYN,XGBoost,0.700855,0.828283,0.759259,0.971318,0.836635,0.90982,0.913834,"[[56828, 35], [17, 82]]",2.804614,187.739557
7,1,ADASYN,XGBoost_b,0.515723,0.828283,0.635659,0.955039,0.834989,0.909484,0.913464,"[[56786, 77], [17, 82]]",3.044012,187.739557
8,1,SMOTETomek,XGBoost,0.785047,0.848485,0.815534,0.963538,0.854276,0.920946,0.92404,"[[56840, 23], [15, 84]]",4.720524,187.739557
9,1,SMOTETomek,XGBoost_b,0.535948,0.828283,0.650794,0.944815,0.833633,0.909532,0.913517,"[[56792, 71], [17, 82]]",4.617362,187.739557


In [14]:
slice = detailed_results.groupby(['sampler', 'model'])[['recall', 'auc_pr', 'f1_score']].mean()
slice

Unnamed: 0_level_0,Unnamed: 1_level_0,recall,auc_pr,f1_score
sampler,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ADASYN,XGBoost,0.839394,0.851556,0.760125
ADASYN,XGBoost_b,0.843434,0.834959,0.602979
,XGBoost,0.770398,0.795835,0.828476
,XGBoost_b,0.819068,0.859004,0.863221
Random OverSampling,XGBoost,0.831251,0.860496,0.873107
Random OverSampling,XGBoost_b,0.825211,0.841472,0.833105
SMOTE,XGBoost,0.845475,0.855723,0.795162
SMOTE,XGBoost_b,0.849577,0.839124,0.641057
SMOTETomek,XGBoost,0.845475,0.855723,0.795162
SMOTETomek,XGBoost_b,0.849577,0.839124,0.641057


In [17]:
detailed_results.to_csv('../results/detailed_xgboost_results.csv')

In [18]:
summary_results.to_csv('../results/summary_xgboost_results.csv')