# 0. IMPORTING DEPENDENCIES

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report,
                             roc_auc_score)
from imblearn.over_sampling import SMOTE

# 1. LOAD AND EXPLORE DATA

In [17]:
#load the dataset
credit_card_data = pd.read_csv('creditcard.csv')

print("="*70)
print("DATASET OVERVIEW")
print("="*70)
print(f"\nDataset Shape: {credit_card_data.shape}")
print(f"Total Transactions: {len(credit_card_data)}")

#check for missing values
print(f"\nMissing Values:\n{credit_card_data.isnull().sum().sum()} total missing values")

#handle the last row with missing values (identified in original notebook)
credit_card_data = credit_card_data.dropna()

print(f"\nAfter removing missing values: {credit_card_data.shape}")

#class distribution
print("\n" + "="*70)
print("CLASS DISTRIBUTION")
print("="*70)
class_dist = credit_card_data['Class'].value_counts()
print(f"\nLegitimate Transactions (0): {class_dist[0]} ({class_dist[0]/len(credit_card_data)*100:.2f}%)")
print(f"Fraudulent Transactions (1): {class_dist[1]} ({class_dist[1]/len(credit_card_data)*100:.2f}%)")
print(f"\nImbalance Ratio: {class_dist[0]/class_dist[1]:.2f}:1")

#statistical summary
print("\n" + "="*70)
print("TRANSACTION AMOUNT STATISTICS")
print("="*70)
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

print("\nLegitimate Transactions:")
print(legit['Amount'].describe())
print("\nFraudulent Transactions:")
print(fraud['Amount'].describe())

DATASET OVERVIEW

Dataset Shape: (284807, 31)
Total Transactions: 284807

Missing Values:
0 total missing values

After removing missing values: (284807, 31)

CLASS DISTRIBUTION

Legitimate Transactions (0): 284315 (99.83%)
Fraudulent Transactions (1): 492 (0.17%)

Imbalance Ratio: 577.88:1

TRANSACTION AMOUNT STATISTICS

Legitimate Transactions:
count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

Fraudulent Transactions:
count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64


# 2. DATA PREPARATION

In [18]:
print("\n" + "="*70)
print("DATA PREPARATION")
print("="*70)

# Separate features and target
X = credit_card_data.drop(columns='Class', axis=1)
Y = credit_card_data['Class']

# Split the data (80-20 split)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42
)

print(f"\nTraining Set: {X_train.shape}")
print(f"Testing Set: {X_test.shape}")


DATA PREPARATION

Training Set: (227845, 30)
Testing Set: (56962, 30)


# 3. SAMPLING STRATEGIES

In [19]:
print("\n" + "="*70)
print("SAMPLING STRATEGIES")
print("="*70)

# Strategy 1: Random Undersampling (from original notebook)
legit_train = X_train[Y_train == 0]
fraud_train = X_train[Y_train == 1]
legit_sample = legit_train.sample(n=len(fraud_train), random_state=42)

X_train_under = pd.concat([legit_sample, fraud_train], axis=0)
Y_train_under = pd.concat([
    pd.Series([0]*len(legit_sample)),
    pd.Series([1]*len(fraud_train))
], axis=0)

print(f"\n1. Random Undersampling:")
print(f"   Training samples: {len(X_train_under)}")
print(f"   Class distribution: {Y_train_under.value_counts().to_dict()}")

# Strategy 2: SMOTE (Oversampling)
smote = SMOTE(random_state=42)
X_train_smote, Y_train_smote = smote.fit_resample(X_train, Y_train)

print(f"\n2. SMOTE Oversampling:")
print(f"   Training samples: {len(X_train_smote)}")
print(f"   Class distribution: {Y_train_smote.value_counts().to_dict()}")


SAMPLING STRATEGIES

1. Random Undersampling:
   Training samples: 788
   Class distribution: {0: 394, 1: 394}

2. SMOTE Oversampling:
   Training samples: 454902
   Class distribution: {0: 227451, 1: 227451}


# 4. MODEL TRAINING AND EVALUATION

In [20]:
def evaluate_model(model, X_train, Y_train, X_test, Y_test, model_name):
    """Comprehensive model evaluation"""
    print("\n" + "="*70)
    print(f"EVALUATING: {model_name}")
    print("="*70)

    # Train the model
    model.fit(X_train, Y_train)

    # Predictions
    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)

    # Calculate metrics
    metrics = {
        'Training Accuracy': accuracy_score(Y_train, Y_train_pred),
        'Testing Accuracy': accuracy_score(Y_test, Y_test_pred),
        'Precision': precision_score(Y_test, Y_test_pred),
        'Recall': recall_score(Y_test, Y_test_pred),
        'F1-Score': f1_score(Y_test, Y_test_pred),
        'ROC-AUC': roc_auc_score(Y_test, Y_test_pred)
    }

    print("\nPerformance Metrics:")
    print("-" * 50)
    for metric, value in metrics.items():
        print(f"{metric:20s}: {value:.4f}")

    # Confusion Matrix
    print("\nConfusion Matrix:")
    cm = confusion_matrix(Y_test, Y_test_pred)
    print(cm)
    print(f"\nTrue Negatives:  {cm[0][0]}")
    print(f"False Positives: {cm[0][1]}")
    print(f"False Negatives: {cm[1][0]}")
    print(f"True Positives:  {cm[1][1]}")

    # Classification Report
    print("\nClassification Report:")
    print(classification_report(Y_test, Y_test_pred,
                                target_names=['Legitimate', 'Fraud']))

    return model, metrics

# Model 1: Logistic Regression with Undersampling

In [21]:
lr_under = LogisticRegression(max_iter=1000, random_state=42)
lr_under_model, lr_under_metrics = evaluate_model(
    lr_under, X_train_under, Y_train_under, X_test, Y_test,
    "Logistic Regression (Undersampling)"
)


EVALUATING: Logistic Regression (Undersampling)

Performance Metrics:
--------------------------------------------------
Training Accuracy   : 0.9569
Testing Accuracy    : 0.9599
Precision           : 0.0380
Recall              : 0.9184
F1-Score            : 0.0731
ROC-AUC             : 0.9392

Confusion Matrix:
[[54588  2276]
 [    8    90]]

True Negatives:  54588
False Positives: 2276
False Negatives: 8
True Positives:  90

Classification Report:
              precision    recall  f1-score   support

  Legitimate       1.00      0.96      0.98     56864
       Fraud       0.04      0.92      0.07        98

    accuracy                           0.96     56962
   macro avg       0.52      0.94      0.53     56962
weighted avg       1.00      0.96      0.98     56962



# Model 2: Logistic Regression with SMOTE

In [22]:
lr_smote = LogisticRegression(max_iter=1000, random_state=42)
lr_smote_model, lr_smote_metrics = evaluate_model(
    lr_smote, X_train_smote, Y_train_smote, X_test, Y_test,
    "Logistic Regression (SMOTE)"
)


EVALUATING: Logistic Regression (SMOTE)

Performance Metrics:
--------------------------------------------------
Training Accuracy   : 0.9787
Testing Accuracy    : 0.9884
Precision           : 0.1191
Recall              : 0.8980
F1-Score            : 0.2103
ROC-AUC             : 0.9433

Confusion Matrix:
[[56213   651]
 [   10    88]]

True Negatives:  56213
False Positives: 651
False Negatives: 10
True Positives:  88

Classification Report:
              precision    recall  f1-score   support

  Legitimate       1.00      0.99      0.99     56864
       Fraud       0.12      0.90      0.21        98

    accuracy                           0.99     56962
   macro avg       0.56      0.94      0.60     56962
weighted avg       1.00      0.99      0.99     56962



# Model 3: Random Forest with Undersampling


In [23]:
rf_under = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_under_model, rf_under_metrics = evaluate_model(
    rf_under, X_train_under, Y_train_under, X_test, Y_test,
    "Random Forest (Undersampling)"
)


EVALUATING: Random Forest (Undersampling)

Performance Metrics:
--------------------------------------------------
Training Accuracy   : 1.0000
Testing Accuracy    : 0.9641
Precision           : 0.0423
Recall              : 0.9184
F1-Score            : 0.0809
ROC-AUC             : 0.9413

Confusion Matrix:
[[54827  2037]
 [    8    90]]

True Negatives:  54827
False Positives: 2037
False Negatives: 8
True Positives:  90

Classification Report:
              precision    recall  f1-score   support

  Legitimate       1.00      0.96      0.98     56864
       Fraud       0.04      0.92      0.08        98

    accuracy                           0.96     56962
   macro avg       0.52      0.94      0.53     56962
weighted avg       1.00      0.96      0.98     56962



# Model 4: Random Forest with SMOTE


In [None]:
rf_smote = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_smote_model, rf_smote_metrics = evaluate_model(
    rf_smote, X_train_smote, Y_train_smote, X_test, Y_test,
    "Random Forest (SMOTE)"
)


EVALUATING: Random Forest (SMOTE)


# 5. MODEL COMPARISON


In [None]:
print("\n" + "="*70)
print("MODEL COMPARISON SUMMARY")
print("="*70)

comparison_df = pd.DataFrame({
    'LR (Under)': lr_under_metrics,
    'LR (SMOTE)': lr_smote_metrics,
    'RF (Under)': rf_under_metrics,
    'RF (SMOTE)': rf_smote_metrics
})

print("\n", comparison_df.round(4))

# 6. FEATURE IMPORTANCE (for Random Forest)


In [None]:
print("\n" + "="*70)
print("TOP 10 IMPORTANT FEATURES (Random Forest - SMOTE)")
print("="*70)

feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': rf_smote_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\n", feature_importance.head(10).to_string(index=False))

# 7. RECOMMENDATIONS

### 1. BEST MODEL SELECTION:
   - For maximizing fraud detection (Recall): Random Forest with SMOTE
   - For balanced performance: Random Forest with undersampling
   - For production deployment: Consider ensemble of top models

### 2. KEY INSIGHTS:
   - Random Forest outperforms Logistic Regression
   - SMOTE generally provides better recall for fraud detection
   - High feature importance for V14, V17, V12, V10

### 3. NEXT STEPS:
   - Implement hyperparameter tuning (GridSearchCV/RandomizedSearchCV)
   - Try XGBoost or LightGBM for potentially better performance
   - Implement cost-sensitive learning
   - Deploy with real-time monitoring
   - Set up alert system for predicted fraudulent transactions

### 4. PRODUCTION CONSIDERATIONS:
   - Monitor for data drift
   - Regular model retraining
   - A/B testing for model updates
   - Consider threshold adjustment based on business cost