## Import Libraries

In [49]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

## Load and Explore Dataset

In [50]:
# Load the transactions table from a CSV file in the working directory.
df = pd.read_csv('creditcard.csv')

print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line after the report.
df.info()
print("\nMissing Values:")
# Grand total of missing cells across every column (one integer).
print(df.isnull().sum().sum())

Dataset Shape: (284807, 31)

First few rows:
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -

## Analyze Class Imbalance

In [51]:
# Label frequencies: raw counts first, then the same counts as percentages.
print("Class Distribution:")
print(df['Class'].value_counts())
print("\nClass Distribution (%):")
print(df['Class'].value_counts(normalize=True).mul(100))

# Summary statistics of 'Amount' over the whole table.
print("\nTransaction Amount Statistics:")
print(df['Amount'].describe())

# The same summary, restricted to rows with Class == 1 and then Class == 0.
print("\nFraud vs Legitimate Amount Statistics:")
print("\nFraudulent Transactions:")
print(df.loc[df['Class'] == 1, 'Amount'].describe())
print("\nLegitimate Transactions:")
print(df.loc[df['Class'] == 0, 'Amount'].describe())

Class Distribution:
Class
0    284315
1       492
Name: count, dtype: int64

Class Distribution (%):
Class
0    99.827251
1     0.172749
Name: proportion, dtype: float64

Transaction Amount Statistics:
count    284807.000000
mean         88.349619
std         250.120109
min           0.000000
25%           5.600000
50%          22.000000
75%          77.165000
max       25691.160000
Name: Amount, dtype: float64

Fraud vs Legitimate Amount Statistics:

Fraudulent Transactions:
count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

Legitimate Transactions:
count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64


## Data Preprocessing

In [52]:
# Keep only rows in which no column is missing; `df` itself is left untouched.
df_clean = df.dropna(axis=0, how='any')
print(f"Records after removing incomplete data: {len(df_clean)}")

# Feature matrix = every column except 'Class'; target vector = 'Class'.
X = df_clean.drop(columns='Class')
Y = df_clean['Class']

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {Y.shape}")
print(f"\nFeature columns: {list(X.columns)}")

Records after removing incomplete data: 284807

Features shape: (284807, 30)
Target shape: (284807,)

Feature columns: ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']


## Train/Validation/Test Split

In [53]:
# Stage 1: hold out 30% of the rows (stratified on the label) as a temporary
# pool; the remaining 70% becomes the training set.
X_train, X_temp, Y_train, Y_temp = train_test_split(
    X,
    Y,
    test_size=0.30,
    stratify=Y,
    random_state=42,
)

# Stage 2: cut the temporary pool in half (again stratified) to obtain the
# validation and test sets, i.e. about 15% of all rows each.
X_val, X_test, Y_val, Y_test = train_test_split(
    X_temp,
    Y_temp,
    test_size=0.50,
    stratify=Y_temp,
    random_state=42,
)

# Partition sizes, absolute and relative to the full feature matrix.
print(f"Training set size: {len(X_train)} ({len(X_train)/len(X)*100:.1f}%)")
print(f"Validation set size: {len(X_val)} ({len(X_val)/len(X)*100:.1f}%)")
print(f"Test set size: {len(X_test)} ({len(X_test)/len(X)*100:.1f}%)")

# Per-partition label counts, to confirm stratification kept both classes.
print(f"\nTraining set - Fraud cases: {Y_train.sum()}, Legitimate cases: {(Y_train==0).sum()}")
print(f"Validation set - Fraud cases: {Y_val.sum()}, Legitimate cases: {(Y_val==0).sum()}")
print(f"Test set - Fraud cases: {Y_test.sum()}, Legitimate cases: {(Y_test==0).sum()}")

Training set size: 199364 (70.0%)
Validation set size: 42721 (15.0%)
Test set size: 42722 (15.0%)

Training set - Fraud cases: 344, Legitimate cases: 199020
Validation set - Fraud cases: 74, Legitimate cases: 42647
Test set - Fraud cases: 74, Legitimate cases: 42648


## Helper Functions

In [54]:
def optimize_threshold(Y_true, Y_pred_proba, thresholds=np.arange(0.1, 0.9, 0.01)):
    """Return the candidate decision threshold that maximizes F1.

    Parameters
    ----------
    Y_true : array-like
        Ground-truth binary labels.
    Y_pred_proba : array-like
        Positive-class scores; a sample is predicted positive when its score
        is ``>=`` the candidate threshold.
    thresholds : iterable of float, optional
        Candidates, tried in the order given. The default grid runs from 0.10
        in steps of 0.01 and stops short of 0.90.

    Returns
    -------
    tuple
        ``(best_threshold, best_f1)``. Ties keep the earliest candidate. When
        no candidate reaches an F1 above 0 (or ``thresholds`` is empty) the
        initial values ``(0.5, 0)`` are returned.

    Notes
    -----
    If the returned threshold is the last candidate of the grid, the true
    optimum may lie beyond it; pass a wider ``thresholds`` in that case.
    """
    # Accept lists and Series alike: a plain Python list cannot be compared
    # element-wise against a float.
    scores = np.asarray(Y_pred_proba)

    best_threshold = 0.5
    best_f1 = 0

    for threshold in thresholds:
        Y_pred = (scores >= threshold).astype(int)
        # zero_division=0 makes "no positive predictions" an explicit F1 of 0
        # instead of relying on a warning that the notebook silences globally.
        f1 = f1_score(Y_true, Y_pred, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold

    return best_threshold, best_f1

def evaluate_model(model, X_test, Y_test, threshold=0.5, model_name="Model"):
    """Score a fitted classifier at a fixed threshold and print a report.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict_proba``; column 1 of its output is
        used as the positive-class score.
    X_test : array-like
        Feature rows to score.
    Y_test : array-like
        Ground-truth binary labels (0 / 1) for ``X_test``.
    threshold : float, optional
        A sample is predicted positive when its score is ``>= threshold``.
    model_name : str, optional
        Heading printed at the top of the report.

    Returns
    -------
    float
        F1-score of the thresholded predictions.
    """
    Y_pred_proba = model.predict_proba(X_test)[:, 1]
    Y_pred = (Y_pred_proba >= threshold).astype(int)

    acc = accuracy_score(Y_test, Y_pred)
    # zero_division=0 on all three ratio metrics: an undefined ratio is
    # reported as 0 consistently rather than only for precision.
    prec = precision_score(Y_test, Y_pred, zero_division=0)
    rec = recall_score(Y_test, Y_pred, zero_division=0)
    f1 = f1_score(Y_test, Y_pred, zero_division=0)
    # ROC-AUC is threshold-free, so it is computed from the raw scores.
    roc_auc = roc_auc_score(Y_test, Y_pred_proba)
    # Pin the label order so the matrix is always 2x2; without it a single
    # observed class yields a 1x1 matrix and indexing the cells fails.
    tn, fp, fn, tp = confusion_matrix(Y_test, Y_pred, labels=[0, 1]).ravel()

    print(f"\n{'='*60}")
    print(f"{model_name}")
    print(f"{'='*60}")
    print(f"Threshold: {threshold:.3f}")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")
    print(f"\nConfusion Matrix:")
    print(f"TN: {tn}, FP: {fp}")
    print(f"FN: {fn}, TP: {tp}")

    return f1

## Find Best Undersampling Ratio on Validation Set

In [55]:
print("Testing different undersampling ratios on validation set...")

# Candidate sampling_strategy values and the running best (seeded with 1.0 / 0).
undersampling_ratios = [0.5, 1.0]
best_ratio = 1.0
best_val_f1 = 0

for ratio in undersampling_ratios:
    # Resample the training split only; validation data is never resampled.
    rus = RandomUnderSampler(random_state=42, sampling_strategy=ratio)
    X_train_under, Y_train_under = rus.fit_resample(X_train, Y_train)

    # Throwaway forest with default hyper-parameters, fitted on the resampled data.
    rf_temp = RandomForestClassifier(n_jobs=-1, random_state=42).fit(
        X_train_under, Y_train_under
    )

    # Positive-class scores on validation, then the F1-maximizing threshold.
    Y_val_proba = rf_temp.predict_proba(X_val)[:, 1]
    threshold, val_f1 = optimize_threshold(Y_val, Y_val_proba)

    print(f"Ratio {ratio:.1f} - Validation F1: {val_f1:.4f} (Threshold: {threshold:.3f})")

    # Strict '>' keeps the earlier ratio on ties.
    if val_f1 > best_val_f1:
        best_val_f1, best_ratio = val_f1, ratio

print(f"\nBest undersampling ratio: {best_ratio} with validation F1: {best_val_f1:.4f}")

Testing different undersampling ratios on validation set...
Ratio 0.5 - Validation F1: 0.7778 (Threshold: 0.890)
Ratio 1.0 - Validation F1: 0.7134 (Threshold: 0.890)

Best undersampling ratio: 0.5 with validation F1: 0.7778


## Apply Random Undersampling with Best Ratio

In [56]:
# Rebuild the undersampled training set with the ratio selected on validation.
rus = RandomUnderSampler(random_state=42, sampling_strategy=best_ratio)
X_train_under, Y_train_under = rus.fit_resample(X_train, Y_train)

# Size before/after resampling and the resulting label counts.
print(f"Original training set size: {len(X_train)}")
print(f"Undersampled training set size: {len(X_train_under)}")
print(f"Undersampled - Fraud cases: {Y_train_under.sum()}, Legitimate cases: {(Y_train_under==0).sum()}")

Original training set size: 199364
Undersampled training set size: 1032
Undersampled - Fraud cases: 344, Legitimate cases: 688


## Apply SMOTE Oversampling

In [57]:
# Oversample the training split with SMOTE; validation and test stay untouched.
smote = SMOTE(random_state=42, sampling_strategy='auto')
X_train_smote, Y_train_smote = smote.fit_resample(X_train, Y_train)

# Size before/after resampling and the resulting label counts.
print(f"Original training set size: {len(X_train)}")
print(f"SMOTE training set size: {len(X_train_smote)}")
print(f"SMOTE - Fraud cases: {Y_train_smote.sum()}, Legitimate cases: {(Y_train_smote==0).sum()}")

Original training set size: 199364
SMOTE training set size: 398040
SMOTE - Fraud cases: 199020, Legitimate cases: 199020


## Model 1: Logistic Regression with Undersampling

In [58]:
print("\nTraining Logistic Regression with Undersampling...")
# fit() returns the estimator itself, so construction and fitting can be chained.
lr_under = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_under, Y_train_under
)

# Tune the decision threshold on validation scores only.
Y_val_proba_lr_under = lr_under.predict_proba(X_val)[:, 1]
threshold_lr_under, _ = optimize_threshold(Y_val, Y_val_proba_lr_under)

print(f"Optimal threshold found on validation set: {threshold_lr_under:.3f}")

# Report test-set metrics at the validation-tuned threshold.
f1_lr_under = evaluate_model(
    lr_under,
    X_test,
    Y_test,
    threshold=threshold_lr_under,
    model_name="Logistic Regression + Undersampling (Test Set)",
)


Training Logistic Regression with Undersampling...
Optimal threshold found on validation set: 0.890

Logistic Regression + Undersampling (Test Set)
Threshold: 0.890
Accuracy: 0.9962
Precision: 0.2897
Recall: 0.8378
F1-Score: 0.4306
ROC-AUC: 0.9683

Confusion Matrix:
TN: 42496, FP: 152
FN: 12, TP: 62


## Model 2: Logistic Regression with SMOTE

In [59]:
print("\nTraining Logistic Regression with SMOTE...")
lr_smote = LogisticRegression(random_state=42, max_iter=1000)
lr_smote.fit(X_train_smote, Y_train_smote)

# Optimize threshold on validation set.
# Named with an upper-case 'Y' to match the Y_val_proba_* variables of the
# other model cells (this one was previously spelled y_val_proba_lr_smote).
Y_val_proba_lr_smote = lr_smote.predict_proba(X_val)[:, 1]
threshold_lr_smote, _ = optimize_threshold(Y_val, Y_val_proba_lr_smote)

print(f"Optimal threshold found on validation set: {threshold_lr_smote:.3f}")

# Evaluate on test set with the validation-tuned threshold
f1_lr_smote = evaluate_model(lr_smote, X_test, Y_test, threshold_lr_smote,
                             "Logistic Regression + SMOTE (Test Set)")


Training Logistic Regression with SMOTE...
Optimal threshold found on validation set: 0.890

Logistic Regression + SMOTE (Test Set)
Threshold: 0.890
Accuracy: 0.9968
Precision: 0.3333
Recall: 0.8378
F1-Score: 0.4769
ROC-AUC: 0.9625

Confusion Matrix:
TN: 42524, FP: 124
FN: 12, TP: 62


## Model 3: Random Forest with Undersampling

In [60]:
print("\nTraining Random Forest with Undersampling...")
# fit() returns the estimator itself, so construction and fitting can be chained.
rf_under = RandomForestClassifier(n_jobs=-1, random_state=42).fit(
    X_train_under, Y_train_under
)

# Tune the decision threshold on validation scores only.
Y_val_proba_rf_under = rf_under.predict_proba(X_val)[:, 1]
threshold_rf_under, _ = optimize_threshold(Y_val, Y_val_proba_rf_under)

print(f"Optimal threshold found on validation set: {threshold_rf_under:.3f}")

# Report test-set metrics at the validation-tuned threshold.
f1_rf_under = evaluate_model(
    rf_under,
    X_test,
    Y_test,
    threshold=threshold_rf_under,
    model_name="Random Forest + Undersampling (Test Set)",
)


Training Random Forest with Undersampling...
Optimal threshold found on validation set: 0.890

Random Forest + Undersampling (Test Set)
Threshold: 0.890
Accuracy: 0.9994
Precision: 0.8871
Recall: 0.7432
F1-Score: 0.8088
ROC-AUC: 0.9724

Confusion Matrix:
TN: 42641, FP: 7
FN: 19, TP: 55


## Model 4: Random Forest with SMOTE

In [61]:
print("\nTraining Random Forest with SMOTE...")
# fit() returns the estimator itself, so construction and fitting can be chained.
rf_smote = RandomForestClassifier(n_jobs=-1, random_state=42).fit(
    X_train_smote, Y_train_smote
)

# Tune the decision threshold on validation scores only.
Y_val_proba_rf_smote = rf_smote.predict_proba(X_val)[:, 1]
threshold_rf_smote, _ = optimize_threshold(Y_val, Y_val_proba_rf_smote)

print(f"Optimal threshold found on validation set: {threshold_rf_smote:.3f}")

# Report test-set metrics at the validation-tuned threshold.
f1_rf_smote = evaluate_model(
    rf_smote,
    X_test,
    Y_test,
    threshold=threshold_rf_smote,
    model_name="Random Forest + SMOTE (Test Set)",
)


Training Random Forest with SMOTE...
Optimal threshold found on validation set: 0.670

Random Forest + SMOTE (Test Set)
Threshold: 0.670
Accuracy: 0.9996
Precision: 0.9516
Recall: 0.7973
F1-Score: 0.8676
ROC-AUC: 0.9455

Confusion Matrix:
TN: 42645, FP: 3
FN: 15, TP: 59


## Model Comparison and Best Model Selection

In [62]:
print("\n" + "="*60)
print("FINAL MODEL COMPARISON (Test Set F1-Scores)")
print("="*60)

# Test-set F1 of each model/resampling combination, keyed by display name.
results = {
    "Logistic Regression + Undersampling": f1_lr_under,
    "Logistic Regression + SMOTE": f1_lr_smote,
    "Random Forest + Undersampling": f1_rf_under,
    "Random Forest + SMOTE": f1_rf_smote
}

for model_name, f1 in results.items():
    print(f"{model_name}: {f1:.4f}")

# Highest score wins; on a tie the entry listed first in `results` is kept.
# NOTE(review): the winner is chosen on test-set scores, so the reported best
# F1 is an optimistic estimate - consider selecting on validation F1 instead.
best_model_name, best_f1 = max(results.items(), key=lambda item: item[1])

print(f"\n{'='*60}")
print(f"BEST MODEL: {best_model_name}")
print(f"Best F1-Score: {best_f1:.4f}")
print(f"{'='*60}")


FINAL MODEL COMPARISON (Test Set F1-Scores)
Logistic Regression + Undersampling: 0.4306
Logistic Regression + SMOTE: 0.4769
Random Forest + Undersampling: 0.8088
Random Forest + SMOTE: 0.8676

BEST MODEL: Random Forest + SMOTE
Best F1-Score: 0.8676

✓ Target F1-Score (>0.85) ACHIEVED!


## Feature Importance Analysis (Random Forest Models)

In [63]:
print("\n" + "="*60)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*60)

# Importances are read from the SMOTE forest only when that model was selected
# as best; in every other case (including a logistic-regression winner) the
# undersampled forest is used.
feature_importances = (
    rf_smote if best_model_name == "Random Forest + SMOTE" else rf_under
).feature_importances_

# Pair each feature column with its importance, largest first.
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance_df.head(10).to_string(index=False))


FEATURE IMPORTANCE ANALYSIS

Top 10 Most Important Features:
Feature  Importance
    V14    0.209241
    V10    0.128348
     V4    0.121234
    V12    0.098778
    V17    0.088056
     V3    0.077706
    V11    0.049551
    V16    0.044210
     V2    0.038249
     V9    0.027830
