# Financial Fraud Detection - Exploratory Data Analysis

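This notebook sets up the project scaffolding for, and performs an initial exploratory data analysis of, the Financial Fraud Detection dataset by Aman Ali Siddiqui from Kaggle. We'll examine transaction patterns, fraud distribution, and key insights to guide our modeling approach. Note that the exploration below runs on a synthetic sample with a similar structure; in a real project, load the actual Kaggle dataset instead.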

## Table of Contents
1. [Project Structure Setup](#project-structure)
2. [Requirements File Creation](#requirements)
3. [Data Loading and Initial Exploration](#data-loading)
4. [Preprocessing Script Development](#preprocessing)
5. [Feature Engineering Implementation](#feature-engineering)
6. [Model Development Framework](#modeling)
7. [Evaluation Metrics Setup](#evaluation)
8. [Streamlit App Foundation](#streamlit-app)
9. [Documentation Setup](#documentation)

## 1. Project Structure Setup <a id="project-structure"></a>

Let's start by creating the complete folder structure for our fraud detection project using Python's pathlib module.


In [None]:
# Import necessary libraries for project setup
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Set style for matplotlib.
# matplotlib 3.6 renamed its bundled seaborn styles from 'seaborn' to
# 'seaborn-v0_8'. Use whichever name this installation provides so the cell
# does not fail with an OSError on older releases; if neither is available
# the default matplotlib style is kept.
for style_name in ('seaborn-v0_8', 'seaborn'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_palette("husl")

print("Libraries imported successfully!")
print(f"Current working directory: {os.getcwd()}")
print(f"Python version: {sys.version}")




In [None]:
# Create project directory structure
project_root = Path("../")  # Parent directory of notebooks folder
directories = [
    "data/raw",
    "data/processed",
    "data/interim",
    "notebooks",
    "src",
    "streamlit_app",
    "models",
    "reports/figures",
    "reports/results",
    "config",
]


def tree_lines(paths):
    """Return indented tree lines for a list of slash-separated folder paths.

    Every path component is emitted exactly once, in first-seen order, so
    intermediate folders (e.g. ``data/`` for ``data/raw``) are listed above
    their children instead of being silently skipped.
    """
    seen = set()
    lines = []
    for path in paths:
        parts = path.split('/')
        for depth, name in enumerate(parts):
            prefix = '/'.join(parts[:depth + 1])
            if prefix in seen:
                continue
            seen.add(prefix)
            lines.append(f"{'  ' * depth}├── {name}/")
    return lines


# Create directories if they don't exist (parents=True also creates the
# intermediate folders; exist_ok=True makes the cell safe to re-run)
for directory in directories:
    dir_path = project_root / directory
    dir_path.mkdir(parents=True, exist_ok=True)
    print(f"✓ Created/verified directory: {directory}")

# Display project structure
print("\n Project Structure:")
print("financial_fraud_detection/")
for line in tree_lines(directories):
    print(line)

print("\n Project structure setup complete!")



## 2. Requirements File Creation <a id="requirements"></a>

Let's create a comprehensive requirements.txt file with all necessary libraries for our fraud detection project.



In [None]:
# Generate requirements.txt content.
# NOTE: matplotlib is pinned to >=3.6.0 because this notebook selects the
# 'seaborn-v0_8' style, which only exists from matplotlib 3.6 onwards.
requirements_content = """# Core Data Science Libraries
pandas>=1.5.0
numpy>=1.21.0
scikit-learn>=1.1.0
scipy>=1.9.0

# Machine Learning & Advanced Models
xgboost>=1.6.0
lightgbm>=3.3.0
catboost>=1.0.0
imbalanced-learn>=0.9.0

# Visualization Libraries
matplotlib>=3.6.0
seaborn>=0.11.0
plotly>=5.10.0

# Web Application
streamlit>=1.15.0

# Jupyter Notebook
jupyter>=1.0.0
notebook>=6.4.0
ipywidgets>=7.7.0

# Data Processing & Utils
joblib>=1.1.0
openpyxl>=3.0.0

# Model Interpretation
shap>=0.41.0

# Progress Bars & Utilities
tqdm>=4.64.0

# Development & Testing
pytest>=7.0.0

# Environment Management
python-dotenv>=0.19.0
"""

# Write requirements.txt file at the project root. The encoding is explicit
# so the result does not depend on the platform's default locale.
requirements_path = project_root / "requirements.txt"
with open(requirements_path, 'w', encoding='utf-8') as f:
    f.write(requirements_content)

print("✓ requirements.txt file created successfully!")
print(f" Location: {requirements_path}")
print("\n  Key libraries included:")
print("  • pandas, numpy - Data manipulation")
print("  • scikit-learn - Machine learning")
print("  • xgboost, lightgbm - Advanced ML models")
print("  • matplotlib, seaborn, plotly - Visualization")
print("  • streamlit - Web application")
print("  • jupyter - Interactive notebooks")

## 3. Data Loading and Initial Exploration <a id="data-loading"></a>

Now let's create sample data for demonstration and perform initial exploration. In a real project, you would load the actual Kaggle dataset here.


In [None]:
# Build a synthetic transaction table with a column layout similar to the
# Kaggle fraud dataset. The random draws below happen in a fixed order, so
# with the seed set the generated data is reproducible.
np.random.seed(42)
n_samples = 10000

transaction_types = ['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN']
type_probabilities = [0.4, 0.2, 0.2, 0.1, 0.1]

# Fill the columns one at a time (insertion order == column order)
sample_data = {}
sample_data['step'] = np.random.randint(1, 744, size=n_samples)  # 1 month in hours
sample_data['type'] = np.random.choice(transaction_types, size=n_samples,
                                       p=type_probabilities)
sample_data['amount'] = np.random.lognormal(mean=5, sigma=2, size=n_samples)
sample_data['nameOrig'] = [
    f'C{customer_id}' for customer_id in np.random.randint(1, 1000, size=n_samples)
]
sample_data['oldbalanceOrg'] = np.random.lognormal(mean=8, sigma=2, size=n_samples)
sample_data['newbalanceOrig'] = np.random.lognormal(mean=8, sigma=2, size=n_samples)
sample_data['nameDest'] = [
    f'M{merchant_id}' for merchant_id in np.random.randint(1, 500, size=n_samples)
]
sample_data['oldbalanceDest'] = np.random.lognormal(mean=8, sigma=2, size=n_samples)
sample_data['newbalanceDest'] = np.random.lognormal(mean=8, sigma=2, size=n_samples)

# Assemble the DataFrame
df = pd.DataFrame(sample_data)

# Label roughly 5% of the rows as fraud
fraud_mask = np.random.choice([0, 1], size=n_samples, p=[0.95, 0.05])
df['isFraud'] = fraud_mask

# Overwrite type and amount for the fraud rows: only TRANSFER / CASH_OUT,
# with amounts drawn from a separate log-normal distribution
fraud_indices = df.index[df['isFraud'] == 1]
n_fraud = len(fraud_indices)
df.loc[fraud_indices, 'type'] = np.random.choice(['TRANSFER', 'CASH_OUT'], size=n_fraud)
df.loc[fraud_indices, 'amount'] = np.random.lognormal(mean=7, sigma=1.5, size=n_fraud)

memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
print("  Sample Dataset Created!")
print(f"Shape: {df.shape}")
print(f"Memory usage: {memory_mb:.2f} MB")

# Preview the first rows
print("\n  First 5 rows:")
df.head()

In [None]:
# Headline numbers for the dataset: size, class balance, dtypes, summary stats
total_transactions = len(df)
fraud_total = df['isFraud'].sum()
legit_total = (df['isFraud'] == 0).sum()
fraud_rate_pct = (fraud_total / total_transactions) * 100

print("  Dataset Overview:")
print("=" * 50)
print(f"Total transactions: {total_transactions:,}")
print(f"Fraud cases: {fraud_total:,}")
print(f"Fraud rate: {fraud_rate_pct:.2f}%")
print(f"Legitimate transactions: {legit_total:,}")

# Column dtypes
print("\n  Data Types:")
print(df.dtypes)

# Last expression of the cell, so the notebook renders the summary table
print("\n  Statistical Summary:")
df.describe()


In [None]:
# Visualize fraud distribution in a 2x2 overview figure
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Financial Fraud Detection - Data Overview', fontsize=16, fontweight='bold')

# 1. Fraud vs Legitimate distribution
# value_counts() orders its result by frequency, not by label. Pin the order
# to [0, 1] so each wedge always matches its hard-coded label and colour, even
# if fraud becomes the majority class or one class is absent.
fraud_counts = df['isFraud'].value_counts().reindex([0, 1], fill_value=0)
axes[0,0].pie(fraud_counts.values, labels=['Legitimate', 'Fraud'], autopct='%1.1f%%',
              colors=['lightblue', 'red'], startangle=90)
axes[0,0].set_title('Transaction Distribution')

# 2. Transaction types
type_counts = df['type'].value_counts()
axes[0,1].bar(type_counts.index, type_counts.values, color='skyblue')
axes[0,1].set_title('Transaction Types')
axes[0,1].tick_params(axis='x', rotation=45)

# 3. Amount distribution by fraud status
df_fraud = df[df['isFraud'] == 1]['amount']
df_legit = df[df['isFraud'] == 0]['amount']

axes[1,0].hist([df_legit, df_fraud], bins=50, alpha=0.7,
               label=['Legitimate', 'Fraud'], color=['blue', 'red'])
axes[1,0].set_xlabel('Amount')
axes[1,0].set_ylabel('Frequency')
axes[1,0].set_title('Amount Distribution by Fraud Status')
# Log-scaled counts keep the much smaller fraud class visible
axes[1,0].set_yscale('log')
axes[1,0].legend()

# 4. Fraud by transaction type
# 'sum' counts fraud rows (isFraud is 0/1) and 'count' counts all rows per type
fraud_by_type = df.groupby('type')['isFraud'].agg(['sum', 'count'])
fraud_rate_by_type = (fraud_by_type['sum'] / fraud_by_type['count'] * 100).sort_values(ascending=False)

axes[1,1].bar(fraud_rate_by_type.index, fraud_rate_by_type.values, color='coral')
axes[1,1].set_title('Fraud Rate by Transaction Type (%)')
axes[1,1].tick_params(axis='x', rotation=45)
axes[1,1].set_ylabel('Fraud Rate (%)')

plt.tight_layout()
plt.show()

# fraud_rate_by_type is sorted descending, so position 0 is the riskiest type
print("  Key Insights:")
print(f"• Highest fraud rate transaction type: {fraud_rate_by_type.index[0]} ({fraud_rate_by_type.iloc[0]:.2f}%)")
print(f"• Average fraud transaction amount: ${df_fraud.mean():,.2f}")
print(f"• Average legitimate transaction amount: ${df_legit.mean():,.2f}")




## 4. Preprocessing Script Development <a id="preprocessing"></a>

Let's create a comprehensive preprocessing module for our fraud detection system.

In [None]:
# Create preprocessing.py script.
# The module source is kept in a string and written to src/preprocessing.py.
# Missing values are filled by assigning the result back to the column:
# calling fillna(inplace=True) on a column selection is deprecated chained
# assignment and has no effect under pandas copy-on-write.
preprocessing_code = '''"""
Data Preprocessing Module for Financial Fraud Detection

This module contains functions for loading, cleaning, and preprocessing
the financial transaction data from the Kaggle dataset.
"""

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DataPreprocessor:
    """
    A class for preprocessing financial transaction data for fraud detection.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        self.label_encoders = {}
        self.feature_columns = []

    def load_data(self, file_path: str) -> pd.DataFrame:
        """Load data from CSV file."""
        try:
            logger.info(f"Loading data from {file_path}")
            df = pd.read_csv(file_path)
            logger.info(f"Data loaded successfully. Shape: {df.shape}")
            return df
        except Exception as e:
            logger.error(f"Error loading data: {str(e)}")
            raise

    def handle_missing_values(self, df: pd.DataFrame) -> pd.DataFrame:
        """Handle missing values in the dataset.

        Numerical columns are filled with their median and categorical
        (object) columns with their most frequent value. The passed
        DataFrame is updated and returned.
        """
        logger.info("Handling missing values")

        missing_summary = df.isnull().sum()
        if missing_summary.sum() > 0:
            logger.info(f"Missing values found: {missing_summary[missing_summary > 0]}")

            # Handle numerical columns
            numerical_cols = df.select_dtypes(include=[np.number]).columns
            for col in numerical_cols:
                if df[col].isnull().any():
                    # Assign back: fillna(inplace=True) on a column selection
                    # is chained assignment and is a no-op under copy-on-write
                    df[col] = df[col].fillna(df[col].median())

            # Handle categorical columns
            categorical_cols = df.select_dtypes(include=['object']).columns
            for col in categorical_cols:
                if df[col].isnull().any():
                    modes = df[col].mode()
                    if modes.empty:
                        # Entirely missing column: there is no mode to use
                        logger.warning(f"Column '{col}' has no non-missing values; left unfilled")
                        continue
                    df[col] = df[col].fillna(modes.iloc[0])

        logger.info("Missing values handled successfully")
        return df

    def encode_categorical_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Encode categorical features using label encoding."""
        logger.info("Encoding categorical features")

        categorical_cols = df.select_dtypes(include=['object']).columns
        df_encoded = df.copy()

        for col in categorical_cols:
            if col not in ['isFraud', 'nameOrig', 'nameDest']:
                le = LabelEncoder()
                df_encoded[col] = le.fit_transform(df[col].astype(str))
                self.label_encoders[col] = le
                logger.info(f"Encoded column: {col}")

        return df_encoded

    def prepare_data(self, df: pd.DataFrame, target_column: str = 'isFraud',
                    test_size: float = 0.2, random_state: int = 42) -> tuple:
        """Complete data preprocessing pipeline."""
        logger.info("Starting complete data preprocessing pipeline")

        # Handle missing values
        df = self.handle_missing_values(df)

        # Encode categorical features
        df = self.encode_categorical_features(df)

        # Remove high cardinality columns
        columns_to_drop = ['nameOrig', 'nameDest']
        existing_columns = [col for col in columns_to_drop if col in df.columns]
        if existing_columns:
            df = df.drop(columns=existing_columns)
            logger.info(f"Dropped high cardinality columns: {existing_columns}")

        # Separate features and target
        if target_column in df.columns:
            X = df.drop(columns=[target_column])
            y = df[target_column]
        else:
            logger.warning(f"Target column '{target_column}' not found.")
            X = df
            y = None

        self.feature_columns = X.columns.tolist()

        # Split the data
        if y is not None:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=random_state, stratify=y
            )

            logger.info(f"Data preprocessing completed. Training set size: {X_train.shape}")
            return X_train, X_test, y_train, y_test
        else:
            return X, None, None, None
'''

# Write preprocessing.py file. The encoding is explicit so the written file
# does not depend on the platform's default locale.
preprocessing_path = project_root / "src" / "preprocessing.py"
with open(preprocessing_path, 'w', encoding='utf-8') as f:
    f.write(preprocessing_code)

print("✓ preprocessing.py created successfully!")
print(f"  Location: {preprocessing_path}")
print("\n  Key preprocessing features:")
print("  • Missing value handling")
print("  • Categorical encoding")
print("  • Data splitting with stratification")
print("  • Logging for monitoring")