# Credit Card Fraud Detection - Exploratory Data Analysis

This notebook contains exploratory data analysis (EDA) for the Credit Card Fraud Detection project.


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence library warnings so they do not clutter the notebook output
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn "whitegrid" theme and a (12, 6) default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")


## 1. Load Dataset

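**Note:** Place your dataset in the project's `data/` directory (this notebook loads `../data/creditcard.csv` by default; adjust the path in the next cell if your file is named differently). The dataset should be a CSV file of credit card transactions with the columns `Time`, `V1`–`V28` (anonymized features), `Amount`, and `Class`.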


In [None]:
# Load dataset
# Update the path to your dataset file
data_path = '../data/creditcard.csv'  # Adjust filename as needed

# Only the read itself can raise FileNotFoundError, so it is the only statement
# inside the try; the success path lives in the else branch.
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    # NOTE: `df` stays undefined in this case, so the analysis cells below
    # cannot run until the file is in place.
    print(f"Dataset not found at {data_path}")
    print("Please place your dataset in the data/ directory")
    print("\nExpected columns: Time, V1-V28 (anonymized features), Amount, Class")
else:
    print("Dataset loaded successfully!")
    print(f"Shape: {df.shape}")
    print("\nFirst few rows:")
    # A bare `df.head()` here is not the final expression of the cell, so its
    # value would be discarded silently; print it explicitly instead.
    print(df.head())


## 2. Dataset Overview


In [None]:
# Structural and statistical summary of the loaded DataFrame
separator = "=" * 50

print("Dataset Info:", separator, sep="\n")
df.info()  # info() writes its report to stdout itself

print("\n\nDataset Statistics:", separator, sep="\n")
# Kept as the final expression so the notebook can render the summary table
df.describe()


## 3. Distribution of Fraud vs Non-Fraud


In [None]:
# Check class distribution: print the counts / percentages per class and draw
# a count plot and a pie chart side by side.
if 'Class' in df.columns:
    # Order by class label (not by frequency) so that counts, colours and
    # tick labels below always refer to the same class.
    fraud_counts = df['Class'].value_counts().sort_index()
    fraud_percentages = df['Class'].value_counts(normalize=True).sort_index() * 100

    # .get() keeps the summary working when one class is absent (e.g. a
    # filtered sample); plain fraud_counts[1] would raise KeyError.
    non_fraud_total = int(fraud_counts.get(0, 0))
    fraud_total = int(fraud_counts.get(1, 0))

    print("Class Distribution:")
    print("="*50)
    print(f"Non-Fraud (0): {non_fraud_total:,} ({fraud_percentages.get(0, 0.0):.2f}%)")
    print(f"Fraud (1):     {fraud_total:,} ({fraud_percentages.get(1, 0.0):.2f}%)")
    if fraud_total:
        print(f"\nImbalance Ratio: {non_fraud_total/fraud_total:.2f}:1")
    else:
        # Avoid dividing by zero when the data contains no fraud rows
        print("\nImbalance Ratio: n/a (no fraud cases present)")

    if fraud_counts.empty:
        print("\nNo class labels available to plot")
    else:
        # Derive labels and colours from the classes that are actually
        # present, so both plots stay consistent with each other.
        class_names = {0: 'Non-Fraud', 1: 'Fraud'}
        class_colors = {0: '#3498db', 1: '#e74c3c'}
        present_classes = list(fraud_counts.index)
        tick_labels = [class_names.get(value, str(value)) for value in present_classes]
        palette = [class_colors.get(value, '#95a5a6') for value in present_classes]

        # Visualize class distribution
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # Count plot
        sns.countplot(data=df, x='Class', ax=axes[0], palette=palette)
        axes[0].set_title('Class Distribution (Count)', fontsize=14, fontweight='bold')
        axes[0].set_xlabel('Class (0=Non-Fraud, 1=Fraud)')
        axes[0].set_ylabel('Count')
        # Pin the tick positions before relabelling so the number of labels
        # always matches the number of ticks.
        axes[0].set_xticks(range(len(present_classes)))
        axes[0].set_xticklabels(tick_labels)

        # Pie chart
        fraud_counts.plot(kind='pie', ax=axes[1], autopct='%1.2f%%',
                          labels=tick_labels, colors=palette, startangle=90)
        axes[1].set_title('Class Distribution (Percentage)', fontsize=14, fontweight='bold')
        axes[1].set_ylabel('')

        plt.tight_layout()
        plt.show()
else:
    print("'Class' column not found in dataset")


## 4. Correlation Heatmap


In [None]:
# Rank features by their correlation with the target and draw a correlation
# heatmap.
if 'Class' in df.columns:
    # Select numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # Correlate every numeric feature with the target. 'Class' itself is left
    # out: its self-correlation is always 1.0 and would otherwise take the
    # first slot of the "top 10".
    feature_cols = [col for col in numeric_cols if col != 'Class']
    correlations = df[feature_cols].corrwith(df['Class'])

    # Rank by magnitude so strongly *negative* relationships are not pushed to
    # the bottom of the list; the printed values keep their sign.
    ranking = correlations.abs().sort_values(ascending=False).index
    correlations = correlations.reindex(ranking)

    print("Top 10 Features Correlated with Fraud (Class):")
    print("="*50)
    print(correlations.head(10))

    # Full correlation matrix (for a subset of features if dataset is large)
    if len(numeric_cols) > 30:
        # For large datasets, show correlation with target and key features
        # (Time, Amount, Class and V1-V10, keeping only those that exist)
        key_features = ['Time', 'Amount', 'Class'] + [f'V{i}' for i in range(1, 11)]
        key_features = [f for f in key_features if f in df.columns]
        corr_matrix = df[key_features].corr()
    else:
        corr_matrix = df[numeric_cols].corr()

    # Plot heatmap
    plt.figure(figsize=(14, 12))
    sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0,
                square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
    plt.title('Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.show()
else:
    print("'Class' column not found in dataset")


## 5. Transaction Amount Analysis


In [None]:
# Analyze transaction amounts by class: per-class histograms, a box plot and a
# bar chart of summary statistics, followed by the statistics table.
if 'Amount' in df.columns and 'Class' in df.columns:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Distribution of amounts (single .loc lookup instead of chained indexing)
    df.loc[df['Class'] == 0, 'Amount'].hist(bins=50, ax=axes[0, 0], color='#3498db', alpha=0.7)
    axes[0, 0].set_title('Amount Distribution - Non-Fraud', fontweight='bold')
    axes[0, 0].set_xlabel('Amount')
    axes[0, 0].set_ylabel('Frequency')

    df.loc[df['Class'] == 1, 'Amount'].hist(bins=50, ax=axes[0, 1], color='#e74c3c', alpha=0.7)
    axes[0, 1].set_title('Amount Distribution - Fraud', fontweight='bold')
    axes[0, 1].set_xlabel('Amount')
    axes[0, 1].set_ylabel('Frequency')

    # Box plot
    df.boxplot(column='Amount', by='Class', ax=axes[1, 0])
    # With `by=`, pandas also sets a figure-wide "Boxplot grouped by ..."
    # suptitle, which would sit on top of the whole 2x2 grid; clear it.
    fig.suptitle('')
    axes[1, 0].set_title('Amount by Class', fontweight='bold')
    axes[1, 0].set_xlabel('Class')
    axes[1, 0].set_ylabel('Amount')

    # Statistics
    stats = df.groupby('Class')['Amount'].agg(['mean', 'median', 'std', 'min', 'max'])

    # Plot one bar group per statistic and one bar per class. Plotting `stats`
    # directly yields five series but only two colours, so the colours repeat
    # and the legend cannot tell the statistics apart.
    class_names = {0: 'Non-Fraud', 1: 'Fraud'}
    class_colors = {0: '#3498db', 1: '#e74c3c'}
    stats_by_measure = stats.T.rename(index=str.capitalize)
    stats_by_measure.plot(
        kind='bar',
        ax=axes[1, 1],
        color=[class_colors.get(value, '#95a5a6') for value in stats.index],
    )
    axes[1, 1].set_title('Amount Statistics by Class', fontweight='bold')
    axes[1, 1].set_xlabel('Statistic')
    axes[1, 1].set_ylabel('Amount')
    axes[1, 1].legend([class_names.get(value, str(value)) for value in stats.index])
    axes[1, 1].tick_params(axis='x', rotation=0)

    plt.tight_layout()
    plt.show()

    print("\nAmount Statistics by Class:")
    print("="*50)
    print(stats)
else:
    print("Required columns ('Amount', 'Class') not found")


## 6. Time-based Analysis


In [None]:
# Analyze transactions over time: transaction counts per class and fraud rate,
# both bucketed by hour.
if 'Time' in df.columns and 'Class' in df.columns:
    # Convert time to whole hours (assuming seconds since first transaction),
    # wrapped into a 24-hour cycle. The floor division matters: without it
    # 'Hour' stays a continuous float and the groupbys below create one group
    # per distinct timestamp instead of 24 hourly buckets.
    hour_bucket = (df['Time'] // 3600) % 24
    # Cast to int for clean axis labels; leave as float if 'Time' has missing
    # values, since NaN cannot be converted to int.
    df['Hour'] = hour_bucket.astype(int) if hour_bucket.notna().all() else hour_bucket

    fig, axes = plt.subplots(2, 1, figsize=(14, 10))

    # Transactions over time (one line per class)
    hourly_counts = df.groupby(['Hour', 'Class']).size().unstack(fill_value=0)
    hourly_counts.plot(kind='line', ax=axes[0], marker='o', linewidth=2)
    axes[0].set_title('Transaction Count by Hour of Day', fontsize=14, fontweight='bold')
    axes[0].set_xlabel('Hour of Day')
    axes[0].set_ylabel('Number of Transactions')
    axes[0].legend(['Non-Fraud', 'Fraud'])
    axes[0].grid(True, alpha=0.3)

    # Fraud rate by hour: mean of the 0/1 label, expressed as a percentage
    fraud_rate_by_hour = df.groupby('Hour')['Class'].mean() * 100
    fraud_rate_by_hour.plot(kind='bar', ax=axes[1], color='#e74c3c', alpha=0.7)
    axes[1].set_title('Fraud Rate by Hour of Day', fontsize=14, fontweight='bold')
    axes[1].set_xlabel('Hour of Day')
    axes[1].set_ylabel('Fraud Rate (%)')
    axes[1].grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()
else:
    print("Required columns ('Time', 'Class') not found")


## 7. Model Comparison (After Training)

This section will be populated after running the model training pipeline.


In [None]:
# Load any trained models that exist on disk (comparison itself comes later)
import sys
from pathlib import Path

# Make the project's src/ directory (next to this notebook's folder) importable
src_dir = Path().resolve().parent / 'src'
sys.path.append(str(src_dir))

try:
    # Imported up front: if the evaluation module or joblib is unavailable,
    # the except branch below reports it.
    from evaluation import evaluate_all_models
    import joblib

    models_dir = Path('../models')

    # Each model is stored as "<model name>.pkl"
    model_files = {
        model_name: f'{model_name}.pkl'
        for model_name in ('logistic_regression', 'random_forest', 'gradient_boosting')
    }

    # Load whichever model files are present; missing ones are skipped
    models = {}
    for label, pickle_name in model_files.items():
        candidate = models_dir / pickle_name
        if not candidate.exists():
            continue
        models[label] = joblib.load(candidate)
        print(f"Loaded {label}")

    if not models:
        print("No trained models found. Please run the training pipeline first.")
    else:
        print(f"\nLoaded {len(models)} model(s)")
        print("Note: Run model training and evaluation to see comparison results here")

except Exception as exc:
    # Any failure is reported instead of interrupting the notebook
    print(f"Model comparison not available: {exc}")
    print("Please train models first using the model_training.py script")
