In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, auc, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('../src') 

from utils.data_loader import load_data

In [5]:
fraud_data= load_data('../data/processed/fraud_data_processed.csv')
creditcard_data= load_data('../data/processed/creditcard_processed.csv')

Data loaded successfully from ../data/processed/fraud_data_processed.csv
Data loaded successfully from ../data/processed/creditcard_processed.csv


In [6]:
def prepare_data(df, target_col):
    """Prepare features and target, perform train-test split"""
    X = df.drop(columns=[target_col])
    y = df[target_col]
    return train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
def evaluate_model(y_true, y_pred, y_pred_proba, model_name, dataset_name):
    """Evaluate model using AUC-PR, F1-Score, and Confusion Matrix"""
    # Calculate AUC-PR
    precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
    auc_pr = auc(recall, precision)
    
    # Calculate F1-Score
    f1 = f1_score(y_true, y_pred)
    
    # Calculate Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    
    print(f"\n{model_name} Results on {dataset_name}:")
    print(f"AUC-PR: {auc_pr:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(cm)
    
    # Plot Confusion Matrix
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {model_name} ({dataset_name})')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()
    
    return auc_pr, f1



In [9]:
def train_and_evaluate(dataset, target_col, dataset_name):
    """Train and evaluate both models on the dataset"""
    X_train, X_test, y_train, y_test = prepare_data(dataset, target_col)
    
    # Initialize models
    lr_model = LogisticRegression(random_state=42, max_iter=1000)
    rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
    
    results = {}
    
    # Train and evaluate Logistic Regression
    lr_model.fit(X_train, y_train)
    lr_pred = lr_model.predict(X_test)
    lr_pred_proba = lr_model.predict_proba(X_test)[:, 1]
    results['Logistic Regression'] = evaluate_model(y_test, lr_pred, lr_pred_proba, 
                                                  'Logistic Regression', dataset_name)
    
    # Train and evaluate Random Forest
    rf_model.fit(X_train, y_train)
    rf_pred = rf_model.predict(X_test)
    rf_pred_proba = rf_model.predict_proba(X_test)[:, 1]
    results['Random Forest'] = evaluate_model(y_test, rf_pred, rf_pred_proba, 
                                            'Random Forest', dataset_name)
    
    return results

# Process both datasets
fraud_results = train_and_evaluate(fraud_data, 'class', 'Fraud_Data')
creditcard_results = train_and_evaluate(creditcard_data, 'Class', 'creditcard')

ValueError: could not convert string to float: '2015-07-17 16:58:56'

In [None]:
# Compare models and select best
def compare_models(fraud_results, creditcard_results):
    print("\nModel Comparison Summary:")
    print("\nFraud_Data Dataset:")
    for model, (auc_pr, f1) in fraud_results.items():
        print(f"{model}: AUC-PR = {auc_pr:.4f}, F1-Score = {f1:.4f}")
    
    print("\ncreditcard Dataset:")
    for model, (auc_pr, f1) in creditcard_results.items():
        print(f"{model}: AUC-PR = {auc_pr:.4f}, F1-Score = {f1:.4f}")
    
    # Determine best model
    fraud_best = max(fraud_results.items(), key=lambda x: x[1][0])  # Based on AUC-PR
    creditcard_best = max(creditcard_results.items(), key=lambda x: x[1][0])
    
    print(f"\nBest Model for Fraud_Data: {fraud_best[0]} (AUC-PR: {fraud_best[1][0]:.4f})")
    print(f"Best Model for creditcard: {creditcard_best[0]} (AUC-PR: {creditcard_best[1][0]:.4f})")
    
    print("\nJustification:")
    print("Random Forest is likely preferred as the best model because:")
    print("- It typically handles imbalanced datasets better due to its ensemble nature")
    print("- It captures non-linear relationships and feature interactions")
    print("- It showed higher AUC-PR scores, indicating better performance in ranking positive cases")
    print("- It maintains robust performance across both datasets")
    print("Logistic Regression, while interpretable, may struggle with complex patterns in fraud detection")

# Run comparison
compare_models(fraud_results, creditcard_results)