# Fraud Detection - Exploratory Data Analysis

This notebook explores the synthetic fraud detection dataset and provides insights into transaction patterns.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Plot appearance. Keep this order: applying the 'default' style resets
# matplotlib's rcParams, so the seaborn palette has to be set afterwards.
plt.style.use('default')
sns.set_palette('husl')

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None
print('✅ Libraries imported successfully!')

✅ Libraries imported successfully!


In [2]:
# Generate data first by running the generator script inside this kernel.
# The script is read explicitly as UTF-8 and through a context manager:
# a bare open() decodes with the platform locale encoding (e.g. cp1252 on
# Windows), which garbles non-ASCII literals in the script (such as the check
# mark it logs), and it also leaves the file handle unclosed.
GENERATOR_SCRIPT = '../src/data_processing/generate_data.py'
with open(GENERATOR_SCRIPT, encoding='utf-8') as script_file:
    generator_source = script_file.read()

# Compiling with the real filename makes tracebacks point at the script.
# NOTE(review): exec() runs the script in the notebook's namespace, so any name
# it defines can shadow notebook variables - consider importing it as a module.
exec(compile(generator_source, GENERATOR_SCRIPT, 'exec'))

# Load the dataset
# NOTE(review): the generator logs "data/raw/fraud_data.csv" as its output
# path; confirm that resolves to the same file that is read here.
df = pd.read_csv('../data/raw/fraud_data.csv')
print(f'Dataset loaded: {df.shape}')
df.head()

2026-01-08 14:30:59,264 - INFO - Generating 100,000 synthetic transactions...
2026-01-08 14:30:59,833 - INFO - Dataset validation passed ✓



DATASET STATISTICS
Total transactions: 100,000
Fraud transactions: 1,485
Normal transactions: 98,515
Fraud rate: 0.0149 (1.49%)

Transaction Amounts:
  Average: $36.79
  Median: $16.89
  Range: $0.06 - $3849.04
  Normal avg: $35.09
  Fraud avg: $149.63

Dataset shape: (100000, 27)
Memory usage: 19.46 MB


2026-01-08 14:31:04,284 - INFO - Dataset saved to data/raw/fraud_data.csv
2026-01-08 14:31:04,307 - INFO - Metadata saved to data/raw/fraud_data_metadata.json



SAMPLE DATA (First 5 rows)
   Time        V1        V2        V3        V4        V5        V6        V7  \
0     4  4.428045  1.109924  3.345050  3.188642  4.256887  1.606718  0.763319   
1    15  3.081321  2.018920 -1.393076 -1.216091  0.838854 -0.759247 -0.182733   
2    16  4.056264 -2.412482 -0.150093 -0.973169  0.426026  1.599022  1.766411   
3    16 -1.999742 -0.379344 -1.237552 -1.808230  0.083780  0.340730  3.660885   
4    22  1.396166  0.346463  0.097833  0.168519  0.529258 -2.048030  0.760768   

         V8         V9       V10       V11       V12       V13       V14  \
0 -1.300184 -10.295302 -2.477898 -0.634426 -1.302676 -1.134220 -2.116211   
1 -0.707377   2.655656 -1.168760 -0.838028 -1.739355 -4.281521 -3.541347   
2  0.011623   0.102847 -0.337497  1.185675  3.121368 -6.900298 -0.007674   
3  3.810006   0.715764  0.521682  0.447361  2.025609  2.351586  2.008101   
4  2.076515  -0.213256 -2.080317 -0.172899 -1.544894 -0.533821 -2.041588   

        V15       V16       

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,Hour,DayOfWeek,MerchantCategory,CustomerRiskScore,Amount,Class
0,4,4.428045,1.109924,3.34505,3.188642,4.256887,1.606718,0.763319,-1.300184,-10.295302,-2.477898,-0.634426,-1.302676,-1.13422,-2.116211,3.248923,3.348843,0.447034,1.310908,2.106611,-2.600092,0,0,2,0.131,17.03,0
1,15,3.081321,2.01892,-1.393076,-1.216091,0.838854,-0.759247,-0.182733,-0.707377,2.655656,-1.16876,-0.838028,-1.739355,-4.281521,-3.541347,-2.627344,0.596624,0.239004,-3.941651,-1.279529,-4.456946,0,0,2,0.581,3.64,0
2,16,4.056264,-2.412482,-0.150093,-0.973169,0.426026,1.599022,1.766411,0.011623,0.102847,-0.337497,1.185675,3.121368,-6.900298,-0.007674,1.776175,-1.820601,1.076954,-1.387058,1.538689,-4.239564,0,0,4,0.649,13.07,0
3,16,-1.999742,-0.379344,-1.237552,-1.80823,0.08378,0.34073,3.660885,3.810006,0.715764,0.521682,0.447361,2.025609,2.351586,2.008101,4.572235,2.327698,-0.821197,0.422152,3.268561,-1.66596,0,0,0,0.237,37.11,0
4,22,1.396166,0.346463,0.097833,0.168519,0.529258,-2.04803,0.760768,2.076515,-0.213256,-2.080317,-0.172899,-1.544894,-0.533821,-2.041588,2.162696,1.566089,-0.068626,-0.845727,4.910148,0.124911,0,0,0,0.136,33.85,0


In [3]:
# Headline numbers for the loaded frame: size, memory footprint, class balance.
labels = df["Class"]
memory_mb = df.memory_usage(deep=True).sum() / 1024**2  # bytes -> MiB
fraud_rate = labels.mean()

# Build the report line by line, then emit it with a single print call.
overview_lines = [
    '=== DATASET OVERVIEW ===',
    f'Shape: {df.shape}',
    f'Memory usage: {memory_mb:.2f} MB',
    '\n=== FRAUD STATISTICS ===',
    f'Total transactions: {len(df):,}',
    f'Fraud cases: {labels.sum():,}',
    f'Normal cases: {(labels == 0).sum():,}',
    f'Fraud rate: {fraud_rate:.4f} ({fraud_rate*100:.2f}%)',
]
print('\n'.join(overview_lines))

=== DATASET OVERVIEW ===
Shape: (100000, 27)
Memory usage: 20.60 MB

=== FRAUD STATISTICS ===
Total transactions: 100,000
Fraud cases: 1,485
Normal cases: 98,515
Fraud rate: 0.0149 (1.49%)
