In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
pip install scikit-optimize



In [3]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix


from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline



from skopt import BayesSearchCV
from skopt.space import Real, Categorical



In [4]:
RANDOM_STATE = 42
CSV_PATH     = "/content/drive/MyDrive/441/creditcard.csv"     # adapt if needed

Feature Engineering

In [5]:
df = (
    pd.read_csv(CSV_PATH)
      .sort_values("Time")
      .reset_index(drop=True)
)

In [6]:
df["Hour"] = ((df["Time"] // 3600) % 24).astype(np.int8)

# Time since previous recorded transaction
df["TimeSincePrev"] = df["Time"].diff().fillna(df["Time"]).astype(np.float32)

# Binary “night time” flag (00:00–06:00)
df["IsNight"] = (df["Hour"] < 6).astype(np.int8)

In [7]:
X = df.drop(columns="Class")
y = df["Class"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

In [8]:

base_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(
        max_iter=1000,
        solver="liblinear",
        class_weight="balanced",
        random_state=RANDOM_STATE,
    ))
])

In [9]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

def cv_auc(pipe, name):
    scores = cross_val_score(pipe, X_train, y_train,
                             scoring="roc_auc", cv=cv, n_jobs=-1)
    print(f"{name:12s} – mean AUC: {scores.mean():.4f} ± {scores.std():.4f}")

cv_auc(base_pipe,  "ClassWeight")

ClassWeight  – mean AUC: 0.9809 ± 0.0057


In [10]:
search_space = {
    "logreg__C": Real(1e-3, 1e+2, prior="log-uniform"),
    "logreg__penalty": Categorical(["l1", "l2"]),
}

bayes_tuner = BayesSearchCV(
    estimator=base_pipe,
    search_spaces=search_space,
    scoring="roc_auc",
    cv=cv,
    n_iter=12,
    n_jobs=-1,
    random_state=RANDOM_STATE,
    verbose=10,
)

bayes_tuner.fit(X_train, y_train)
print("Best AUC (cv):", bayes_tuner.best_score_)
print("Best params :", bayes_tuner.best_params_)
best_model = bayes_tuner.best_estimator_

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best AUC (cv): 0.9829275504080858
Best params : OrderedDict([('logreg__C', 0.0010426811836320535), ('logreg__penalty', 'l2')])


In [11]:
y_prob = best_model.predict_proba(X_test)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)        # tune threshold later

print("Test AUC :", roc_auc_score(y_test, y_prob))
print("Report   :\n", classification_report(y_test, y_pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Test AUC : 0.9759914095069655
Report   :
               precision    recall  f1-score   support

           0     0.9999    0.9783    0.9890     56864
           1     0.0680    0.9184    0.1266        98

    accuracy                         0.9782     56962
   macro avg     0.5339    0.9483    0.5578     56962
weighted avg     0.9983    0.9782    0.9875     56962

Confusion matrix:
 [[55630  1234]
 [    8    90]]


In [12]:
from sklearn.metrics import precision_recall_curve
y_prob_test = best_model.predict_proba(X_test)[:, 1]
prec, rec, thresh = precision_recall_curve(y_test, y_prob_test)

target_precision = 0.90
idx       = np.where(prec >= target_precision)[0][0]
cutoff    = thresh[idx]

print(f"Cutoff chosen = {cutoff:.3f}  "
      f"(precision={prec[idx]:.3f}, recall={rec[idx]:.3f})")

Cutoff chosen = 1.000  (precision=0.900, recall=0.276)


In [13]:
y_pred_adj = (y_prob_test >= cutoff).astype(int)

print(classification_report(y_test, y_pred_adj, digits=4))
print(confusion_matrix(y_test, y_pred_adj))

              precision    recall  f1-score   support

           0     0.9988    0.9999    0.9993     56864
           1     0.9000    0.2755    0.4219        98

    accuracy                         0.9987     56962
   macro avg     0.9494    0.6377    0.7106     56962
weighted avg     0.9986    0.9987    0.9984     56962

[[56861     3]
 [   71    27]]


In [14]:
import joblib
from pathlib import Path

model_dir = Path("/content/drive/MyDrive/Model")


to_save = {
    "model" : best_model,   # full Pipeline returned by BayesSearchCV
    "cutoff": cutoff
    }


joblib.dump(to_save, model_dir / "fraud_logreg.pkl", compress=3)

print("✔ Model & threshold saved to", model_dir / "fraud_logreg.pkl")

✔ Model & threshold saved to /content/drive/MyDrive/Model/fraud_logreg.pkl
