In [36]:
import os
import sys
import pandas as pd

# 1️⃣ Add the project root (one level above /notebooks) to the very beginning of sys.path
# abspath() resolves the relative parent-directory marker against the current
# working directory, so this is the directory directly above the one the
# notebook is run from.
project_root = os.path.abspath(os.pardir)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# 2️⃣ The project-local package can only be imported once the path tweak above has run
from ccfd_utils.preprocessing import preprocess_data

# 3️⃣ Read the raw CSV and hand it to preprocess_data, unpacking its four return values
#    (presumably train/test features and labels, going by the names — see ccfd_utils)
df = pd.read_csv('../data/creditcard.csv')
X_train, X_test, y_train, y_test = preprocess_data(df)


LOGISTIC REGRESSION

To establish a baseline, a Logistic Regression classifier was trained on the preprocessed dataset using class weighting to address the imbalance between fraudulent and non-fraudulent transactions. The model achieved a high overall accuracy of ~98%; however, this number is misleading due to the severe class imbalance. More importantly, it achieved a recall of 0.92 for the fraud class, meaning it correctly detected most fraudulent transactions.
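On the other hand, the precision for the fraud class was only 0.06, indicating a large number of false positives: 1,386 legitimate transactions were flagged as fraud, compared with 90 correctly detected fraud cases.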
This makes Logistic Regression a good baseline for sensitivity (recall), but further models are necessary to improve precision without significantly reducing recall.

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Logistic Regression Model
# class_weight='balanced' makes scikit-learn weight each class inversely to its
# frequency in y_train; fit() returns the estimator itself, so it can be chained.
lr_model = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

# Per-class precision / recall / F1 on the test split, followed by the raw
# confusion matrix (rows: true class, columns: predicted class).
print("Logistic Regression:\n", classification_report(y_true=y_test, y_pred=lr_pred))
print(confusion_matrix(y_true=y_test, y_pred=lr_pred))



Logistic Regression:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99     56864
           1       0.06      0.92      0.11        98

    accuracy                           0.98     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.98      0.99     56962

[[55478  1386]
 [    8    90]]


DECISION TREES


In [41]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree Model
# class_weight='balanced' weights each class inversely to its frequency in y_train.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split: without a seed, ties between equally good splits are
# broken differently on each run and the metrics printed below would not be
# reproducible after a kernel restart.
dt_model = DecisionTreeClassifier(class_weight='balanced', random_state=42)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

# Per-class precision / recall / F1 on the test split, followed by the raw
# confusion matrix (rows: true class, columns: predicted class).
print("Decision Tree:\n", classification_report(y_test, dt_pred))
print(confusion_matrix(y_test, dt_pred))



Decision Tree:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.72      0.71      0.72        98

    accuracy                           1.00     56962
   macro avg       0.86      0.86      0.86     56962
weighted avg       1.00      1.00      1.00     56962

[[56837    27]
 [   28    70]]
