In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
from sklearn.metrics import classification_report, average_precision_score
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# 1. Load the Dataset
# The CSV is read from DATA_PATH, which is resolved relative to the notebook's
# working directory. Update DATA_PATH if your copy of 'creditcard.csv' lives elsewhere.
DATA_PATH = 'Desktop/creditcard.csv'
try:
    data = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Re-raise instead of calling exit(): in a notebook kernel exit() requests a
    # kernel shutdown rather than cleanly stopping the cell, so the lines below
    # could still run and fail with a confusing NameError on `data`.
    raise FileNotFoundError(
        f"Error: '{DATA_PATH}' not found. Please place the dataset there or update DATA_PATH."
    ) from err

# Only the read itself is guarded above; the success messages run afterwards.
print("Dataset loaded successfully.")
print(f"Original data shape: {data.shape}")

# Separate features and target variable
X = data.drop('Class', axis=1)
y = data['Class']

# Check the original class distribution
print(f"Original class distribution: {Counter(y)}")
print("---")

# 2. Preprocessing
# Split the data into training and testing sets BEFORE scaling, so that the
# scaler's mean/std are learned from the training rows only. Fitting the scaler
# on the full dataset would leak test-set statistics into the training features.
# We use 'stratify=y' to ensure the train and test sets have a similar proportion of fraudulent transactions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale the 'Time' and 'Amount' features as they are not pre-scaled like V1-V28.
# A single StandardScaler standardizes each column independently, so fitting it
# on both columns keeps the parameters for both (refitting one scaler per column
# would discard the 'Time' statistics).
SCALED_COLUMNS = ['Time', 'Amount']

# Work on explicit copies so the assignments below never write through to X
# (X itself is left unscaled).
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
X_train[SCALED_COLUMNS] = scaler.fit_transform(X_train[SCALED_COLUMNS])
# The test set is only transformed, never fitted on.
X_test[SCALED_COLUMNS] = scaler.transform(X_test[SCALED_COLUMNS])

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")
print("---")

# 3. Handle Imbalanced Data using SMOTE
# Only the training split is resampled: SMOTE synthesizes extra minority-class
# (fraud) rows until both classes are equally represented. The test split is
# left untouched.
print("Applying SMOTE to balance the training data...")
smote = SMOTE(random_state=42)
resampled_split = smote.fit_resample(X_train, y_train)
X_train_smote = resampled_split[0]
y_train_smote = resampled_split[1]

# Report the size and class balance of the resampled training set.
print(f"SMOTE-balanced training data shape: {X_train_smote.shape}")
print(f"SMOTE-balanced class distribution: {Counter(y_train_smote)}")
print("---")

# 4. Train the Model (LightGBM)
# Gradient-boosted trees fitted on the SMOTE-balanced training set.
# Hyperparameters are gathered in one place so they are easy to spot and tune.
print("Training LightGBM model...")
lgbm_params = {
    'random_state': 42,
    'n_estimators': 200,
    'learning_rate': 0.05,
}
lgbm_model = lgb.LGBMClassifier(**lgbm_params)
lgbm_model.fit(X_train_smote, y_train_smote)
print("Model training complete.")
print("---")

# 5. Model Evaluation
# Score the held-out test split in two ways:
#   - y_pred_proba: predicted probability of the positive (fraud) class, used
#     for the Average Precision Score
#   - y_pred: hard class labels, used for the classification report
y_pred_proba = lgbm_model.predict_proba(X_test)[:, 1]
y_pred = lgbm_model.predict(X_test)

# Classification report: recall on class '1' (fraud) shows how many of the
# fraudulent transactions were caught.
report_text = classification_report(y_test, y_pred)
print("Classification Report on Test Data:")
print(report_text)
print("---")

# Average Precision Score (AUPRC): a reliable metric for imbalanced data;
# values closer to 1 are better.
average_precision = average_precision_score(y_test, y_pred_proba)
print(f"Average Precision Score (AUPRC): {average_precision:.4f}")

Dataset loaded successfully.
Original data shape: (284807, 31)
Original class distribution: Counter({0: 284315, 1: 492})
---
Training data shape: (227845, 30)
Testing data shape: (56962, 30)
---
Applying SMOTE to balance the training data...
SMOTE-balanced training data shape: (454902, 30)
SMOTE-balanced class distribution: Counter({0: 227451, 1: 227451})
---
Training LightGBM model...
[LightGBM] [Info] Number of positive: 227451, number of negative: 227451
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.082554 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 454902, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Model training complete.
---
Classification Report on Test Data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     5