In [None]:
# ! pip install scikit-learn==1.5.2

In [None]:
# ! pip install imbalanced-learn 

In [2]:
%load_ext cuml.accel

[2025-04-23 20:45:16.207] [CUML] [info] cuML: Installed accelerator for sklearn.
[2025-04-23 20:45:16.208] [CUML] [info] cuML: Successfully initialized accelerator.


In [3]:
# import sklearn
# print(sklearn.__version__)

In [4]:
# import imblearn
# print(imblearn.__version__)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import(confusion_matrix,
                            recall_score,
                            precision_score,
                            f1_score,
                            accuracy_score,
                            average_precision_score, 
                            make_scorer,
                            fbeta_score)
import optuna
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
import pickle

In [48]:
df = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv')

In [49]:
df.drop_duplicates(inplace=True)

In [50]:
X = df.drop(columns=['Time', 'Class'], axis=1)
y = df['Class']

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    test_size=0.3,
                                                    random_state=42)

In [52]:
def standardize(df, col_names):
    scaler = StandardScaler()
    df[col_names] = scaler.fit_transform(df[col_names])
    return df

In [53]:
col_names = ['Amount']
X_train = standardize(X_train, col_names)

In [54]:
kf = StratifiedKFold(n_splits=5, shuffle=False)

In [55]:
kf = StratifiedKFold(n_splits=5, shuffle=False)

In [56]:
def create_sampler(name):
    if name == 'smote':
        return SMOTE()
    elif name == 'adasyn':
        return ADASYN()
    elif name == 'ros':
        return RandomOverSampler()
    return None

In [71]:
def objective(trial):
    sampler_name = trial.suggest_categorical('sampler', ['smote', 'adasyn', 'ros'])
    sampler = create_sampler(sampler_name)
    
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 100.0),  # Important for imbalance
        'use_label_encoder': False,
        'eval_metric': 'aucpr'
    }

    model =  XGBClassifier(**param,  device='cuda', random_state=42)
    
    pipeline = Pipeline([
        ('sampler', sampler),
        ('classifier', model)
    ])
    
    # ap_scorer = make_scorer(average_precision_score, needs_proba=True)
    f2_scorer = make_scorer(fbeta_score, beta=2)
    score = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring=f2_scorer)
    
    return np.mean(score)

In [72]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, show_progress_bar=True)

[I 2025-04-23 22:08:05,107] A new study created in memory with name: no-name-0b1f1413-8fbe-4193-a8b9-75a05d58735d


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-23 22:08:17,257] Trial 0 finished with value: 0.3581652738593869 and parameters: {'sampler': 'ros', 'n_estimators': 274, 'max_depth': 6, 'learning_rate': 0.013436424119979884, 'subsample': 0.7617946451076392, 'colsample_bytree': 0.6578953473870924, 'scale_pos_weight': 44.013793189949375}. Best is trial 0 with value: 0.3581652738593869.
[I 2025-04-23 22:08:29,857] Trial 1 finished with value: 0.7950043478006262 and parameters: {'sampler': 'adasyn', 'n_estimators': 287, 'max_depth': 6, 'learning_rate': 0.2134842815914033, 'subsample': 0.9278158076279701, 'colsample_bytree': 0.7196270137065284, 'scale_pos_weight': 65.34008344139994}. Best is trial 1 with value: 0.7950043478006262.
[I 2025-04-23 22:08:40,533] Trial 2 finished with value: 0.8131966187647756 and parameters: {'sampler': 'ros', 'n_estimators': 254, 'max_depth': 5, 'learning_rate': 0.06419583466448442, 'subsample': 0.8276569813608401, 'colsample_bytree': 0.7358475702343159, 'scale_pos_weight': 61.27167364959251}. Bes

In [75]:
study.best_trial.params

{'sampler': 'ros',
 'n_estimators': 200,
 'max_depth': 10,
 'learning_rate': 0.10609394911963685,
 'subsample': 0.9937061428620857,
 'colsample_bytree': 0.791039998596758,
 'scale_pos_weight': 90.10453740110084}

In [76]:
best_sampler = study.best_trial.params.get('sampler')
best_param = {k: v for k, v in study.best_trial.params.items() if k != 'sampler'}

In [77]:
best_sampler

'ros'

In [78]:
best_param

{'n_estimators': 200,
 'max_depth': 10,
 'learning_rate': 0.10609394911963685,
 'subsample': 0.9937061428620857,
 'colsample_bytree': 0.791039998596758,
 'scale_pos_weight': 90.10453740110084}

In [79]:
sampler = create_sampler(best_sampler)
model =  XGBClassifier(**best_param, device='cuda', random_state=42)

pipeline = Pipeline([
    ('sampler', sampler),
    ('classifier', model)
])

In [80]:
pipeline.fit(X_train, y_train)

In [81]:
y_pred = pipeline.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
recall = recall_score(y_test, y_pred, zero_division=0)
precision = precision_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
accuracy = accuracy_score(y_test, y_pred)

print(f'{cm}\n')

df = [(recall, precision, f1, accuracy)]

score = pd.DataFrame(df, columns=['Recall', 'Precision', 'F1 Score', 'Accuracy'])
score.insert(0, 'XGBoost with', best_sampler)

score

[[84959    17]
 [   31   111]]



Unnamed: 0,XGBoost with,Recall,Precision,F1 Score,Accuracy
0,ros,0.78169,0.867188,0.822222,0.999436


In [None]:
trained_model = pipeline.named_steps['classifier']

In [None]:
with open('trained_model_xgboost_ros.pkl', 'wb') as f:
    pickle.dump(trained_model, f)