In [None]:
# ! pip install scikit-learn==1.5.2

In [None]:
# ! pip install imbalanced-learn 

In [None]:
# ! pip install optuna

In [None]:
# %load_ext cuml.accel

In [None]:
# import sklearn
# print(sklearn.__version__)

In [None]:
# import imblearn
# print(imblearn.__version__)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import(confusion_matrix,
                            recall_score,
                            precision_score,
                            f1_score,
                            accuracy_score,
                            average_precision_score, 
                            make_scorer,
                            fbeta_score)
import optuna
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
import pickle

In [2]:
df = pd.read_csv('../data/creditcard.csv')

In [3]:
df.drop_duplicates(inplace=True)

In [4]:
X = df.drop(columns=['Time', 'Class'], axis=1)
y = df['Class']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    test_size=0.3,
                                                    random_state=42)

In [6]:
scaler = StandardScaler()
X_train['Amount'] = scaler.fit_transform(X_train['Amount'].values.reshape(-1, 1))
X_test['Amount'] = scaler.transform(X_test['Amount'].values.reshape(-1, 1))

In [7]:
# X_test.to_csv('../data/X_test.csv', index=False)
# y_test.to_csv('../data/y_test.csv', index=False)

In [8]:
kf = StratifiedKFold(n_splits=5, shuffle=False)

In [9]:
def create_sampler(name):
    if name == 'smote':
        return SMOTE()
    elif name == 'adasyn':
        return ADASYN()
    elif name == 'ros':
        return RandomOverSampler()
    return None

In [16]:
def objective(trial):
    sampler_name = trial.suggest_categorical('sampler', ['smote', 'adasyn', 'ros'])
    sampler = create_sampler(sampler_name)
    
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 100.0),  # Important for imbalance
        'eval_metric': 'aucpr'
    }

    model =  XGBClassifier(**param,  n_jobs=-1, random_state=42)
    
    pipeline = Pipeline([
        ('sampler', sampler),
        ('classifier', model)
    ])
    
    # ap_scorer = make_scorer(average_precision_score, needs_proba=True)
    f2_scorer = make_scorer(fbeta_score, beta=2)
    score = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring=f2_scorer, n_jobs=-1)
    
    return np.mean(score)

In [17]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, show_progress_bar=True, n_jobs=-1)

[I 2025-04-23 21:46:11,019] A new study created in memory with name: no-name-5ce8c59b-a546-4e5a-b153-d6390d3a57f8


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-23 21:46:40,164] Trial 11 finished with value: 0.7047109005303214 and parameters: {'sampler': 'smote', 'n_estimators': 68, 'max_depth': 7, 'learning_rate': 0.2183500822960661, 'subsample': 0.7155427254113613, 'colsample_bytree': 0.773045145968121, 'scale_pos_weight': 78.44724359570425}. Best is trial 11 with value: 0.7047109005303214.
[I 2025-04-23 21:47:03,919] Trial 14 finished with value: 0.7521565217484111 and parameters: {'sampler': 'adasyn', 'n_estimators': 62, 'max_depth': 9, 'learning_rate': 0.27131119392321634, 'subsample': 0.7515496626265852, 'colsample_bytree': 0.92192418102339, 'scale_pos_weight': 82.25186316963257}. Best is trial 14 with value: 0.7521565217484111.
[I 2025-04-23 21:47:17,485] Trial 7 finished with value: 0.05658519045747858 and parameters: {'sampler': 'smote', 'n_estimators': 68, 'max_depth': 3, 'learning_rate': 0.10202449792347754, 'subsample': 0.8421337808136082, 'colsample_bytree': 0.8342033722901179, 'scale_pos_weight': 70.15027387574595}. Be

In [18]:
study.best_trial.params

{'sampler': 'ros',
 'n_estimators': 223,
 'max_depth': 8,
 'learning_rate': 0.179168559852206,
 'subsample': 0.7811228540129369,
 'colsample_bytree': 0.7127860643930117,
 'scale_pos_weight': 70.40383670651536}

In [19]:
best_sampler = study.best_trial.params.get('sampler')
best_param = {k: v for k, v in study.best_trial.params.items() if k != 'sampler'}

In [20]:
best_sampler

'ros'

In [21]:
best_param

{'n_estimators': 223,
 'max_depth': 8,
 'learning_rate': 0.179168559852206,
 'subsample': 0.7811228540129369,
 'colsample_bytree': 0.7127860643930117,
 'scale_pos_weight': 70.40383670651536}

In [24]:
sampler = create_sampler(best_sampler)
model =  XGBClassifier(**best_param, n_jobs=-1, random_state=42)

pipeline = Pipeline([
    ('sampler', sampler),
    ('classifier', model)
])

In [25]:
pipeline.fit(X_train, y_train)

In [26]:
y_pred = pipeline.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
recall = recall_score(y_test, y_pred, zero_division=0)
precision = precision_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
accuracy = accuracy_score(y_test, y_pred)

print(f'{cm}\n')

df = [(recall, precision, f1, accuracy)]

score = pd.DataFrame(df, columns=['Recall', 'Precision', 'F1 Score', 'Accuracy'])
score.insert(0, 'XGBoost with', best_sampler)

score

[[84962    14]
 [   34   108]]



Unnamed: 0,XGBoost with,Recall,Precision,F1 Score,Accuracy
0,ros,0.760563,0.885246,0.818182,0.999436


In [27]:
pipeline.named_steps['classifier'].save_model('../models/xgboost_ros.json')
