In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, average_precision_score
import shap
import joblib

In [None]:
# Load the raw transactions and take a first look at the frame.
df = pd.read_csv('../data/creditcard.csv')
print(df.head())

# Tabulate how many rows fall into each value of the 'Class' label.
class_counts = df['Class'].value_counts()
print("\nClass Distribution:")
print(class_counts)

# Bar chart of the same label counts.
ax = sns.countplot(data=df, x='Class')
ax.set_title('Class Distribution (0: No Fraud, 1: Fraud)')
plt.show()

In [None]:
# Derive two engineered features and swap them in for the raw columns:
#   * scaled_amount - 'Amount' passed through a RobustScaler
#   * hour_of_day   - 'Time' (treated as seconds) folded into a 0-23 hour bucket
#
# NOTE(review): the scaler is fit on the whole frame here, i.e. before the
# train/test split done in a later cell, so test rows contribute to the
# fitted scaling statistics. Consider fitting on the training rows only.
scaler = RobustScaler()
amount_column = df['Amount'].to_numpy().reshape(-1, 1)
scaled_amount = pd.Series(
    scaler.fit_transform(amount_column).ravel(),
    index=df.index,
    name='scaled_amount',
)
hour_of_day = ((df['Time'] // 3600) % 24).rename('hour_of_day')

# Remove the raw columns, then place the engineered ones at the front so the
# final order is: scaled_amount, hour_of_day, <remaining columns>.
df.drop(columns=['Time', 'Amount'], inplace=True)
df.insert(0, 'hour_of_day', hour_of_day)
df.insert(0, 'scaled_amount', scaled_amount)

In [None]:
# The label is the 'Class' column; every other column is a feature.
y = df['Class']
X = df.drop(columns='Class')

# Hold out 20% of the rows as a test set, stratified on the label so both
# splits keep the same class ratio. The split happens BEFORE SMOTE so that
# no synthetic samples derived from test rows can leak into training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Report the class balance of the training labels before resampling.
# Series.sum() on the boolean mask counts matches in vectorised code; the
# builtin sum() would iterate over every element in Python instead.
print("Before OverSampling, counts of label '1': {}".format((y_train == 1).sum()))
print("Before OverSampling, counts of label '0': {} \n".format((y_train == 0).sum()))

# Oversample with SMOTE on the training split only; the test split is left
# untouched so evaluation reflects the original class distribution.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Report the shape and class balance after resampling.
print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print("After OverSampling, counts of label '1': {}".format((y_train_res == 1).sum()))
print("After OverSampling, counts of label '0': {}".format((y_train_res == 0).sum()))

In [None]:
# Fit a LightGBM binary classifier on the SMOTE-resampled training data.
lgbm = LGBMClassifier(
    objective='binary',
    metric='auc',
    n_estimators=1000,
    random_state=42,
)
lgbm.fit(X_train_res, y_train_res)

# Score the held-out test split: probability of class 1 and predicted labels.
lgbm_proba = lgbm.predict_proba(X_test)[:, 1]
lgbm_preds = lgbm.predict(X_test)

# Per-class precision/recall/F1 from the labels, and average precision
# (area under the precision-recall curve) from the probabilities.
lgbm_report = classification_report(y_test, lgbm_preds)
lgbm_auc_pr = average_precision_score(y_test, lgbm_proba)

print("--- LightGBM Evaluation ---")
print(lgbm_report)
print(f"AUC-PR: {lgbm_auc_pr:.4f}")

In [None]:
# Confusion matrix of the LightGBM label predictions on the test split,
# drawn as an annotated heatmap with integer cell counts.
class_names = ['No Fraud', 'Fraud']
cm = confusion_matrix(y_test, lgbm_preds)

ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_title('LightGBM Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
# Explain the fitted LightGBM model on the test split with SHAP's tree explainer.
explainer = shap.TreeExplainer(lgbm)
shap_values = explainer.shap_values(X_test)

# The layout returned by shap_values() depends on the installed SHAP release:
#   * a list with one (n_samples, n_features) array per class,
#   * a single (n_samples, n_features, n_classes) array, or
#   * a single (n_samples, n_features) array already for the positive class.
# Blindly indexing [1] on an ndarray would pick one sample row instead of the
# fraud class, so select the class-1 attributions explicitly for each layout.
if isinstance(shap_values, list):
    fraud_shap_values = shap_values[1] if len(shap_values) > 1 else shap_values[0]
elif np.ndim(shap_values) == 3:
    fraud_shap_values = shap_values[:, :, 1]
else:
    fraud_shap_values = shap_values

# Bar chart of mean |SHAP value| per feature for the fraud class.
shap.summary_plot(fraud_shap_values, X_test, plot_type="bar", show=True, plot_size=(10,8))