# EDA: Exploratory Data Analysis

### Importing required libraries and configuring paths

In [14]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# CSV file the transaction data is read from
INPUT_PATH = "../data/transactions.csv"

# Folder that receives the artefacts written by this notebook (PNGs / CSVs)
OUT_DIR = "../eda_outputs"


### Loading and standardising the data

In [13]:
# Read the transaction table from the configured CSV path.
transactions = pd.read_csv(INPUT_PATH)

# Normalise the headers (the original note says this mirrors preprocessing):
# trim surrounding whitespace, lower-case, then collapse every run of
# characters outside [a-z0-9] into a single underscore.
trimmed_lower = transactions.columns.str.strip().str.lower()
transactions.columns = trimmed_lower.str.replace(r'[^a-z0-9]+', '_', regex=True)

# Report the (rows, columns) shape, then show the first rows as the cell output.
loaded_shape = transactions.shape
print("Loaded transactions:", loaded_shape)
transactions.head()


Loaded transactions: (50000, 21)


Unnamed: 0,transaction_id,user_id,transaction_amount,transaction_type,timestamp,account_balance,device_type,location,merchant_category,ip_address_flag,...,daily_transaction_count,avg_transaction_amount_7d,failed_transaction_count_7d,card_type,card_age,transaction_distance,authentication_method,risk_score,is_weekend,fraud_label
0,TXN_33553,USER_1834,39.79,POS,2023-08-14 19:30:00,93213.17,Laptop,Sydney,Travel,0,...,7,437.63,3,Amex,65,883.17,Biometric,0.8494,0,0
1,TXN_9427,USER_7875,1.19,Bank Transfer,2023-06-07 04:01:00,75725.25,Mobile,New York,Clothing,0,...,13,478.76,4,Mastercard,186,2203.36,Password,0.0959,0,1
2,TXN_199,USER_2734,28.96,Online,2023-06-20 15:25:00,1588.96,Tablet,Mumbai,Restaurants,0,...,14,50.01,4,Visa,226,1909.29,Biometric,0.84,0,1
3,TXN_12447,USER_2617,254.32,ATM Withdrawal,2023-12-07 00:31:00,76807.2,Tablet,New York,Clothing,0,...,8,182.48,4,Visa,76,1311.86,OTP,0.7935,0,1
4,TXN_39489,USER_2014,31.28,POS,2023-11-11 23:44:00,92354.66,Mobile,Mumbai,Electronics,0,...,14,328.69,4,Mastercard,140,966.98,Password,0.3819,1,1


### EDA Plots

In [15]:
# Generate the EDA figures and write each one to OUT_DIR as a PNG.
# Every plot is skipped (with a printed notice) when a column it needs is absent,
# so a partially populated dataset still produces the remaining figures.

sns.set_theme(style="whitegrid")  # `sns.set` is only a legacy alias of `set_theme`
plt.rcParams["figure.figsize"] = (8, 5)

# savefig does not create missing directories; make sure the target folder exists
# before the first figure is written (safe to re-run thanks to exist_ok).
os.makedirs(OUT_DIR, exist_ok=True)

LABEL_COL = "fraud_label"


def _has_columns(filename, *columns):
    """Return True if every name in `columns` is a column of `transactions`.

    When something is missing, print which plot (`filename`) is being skipped
    and why, then return False.
    """
    missing = [col for col in columns if col not in transactions.columns]
    if missing:
        print(f"Skipping {filename}: missing column(s) {missing}")
    return not missing


def _save_figure(fig, filename):
    """Save `fig` under OUT_DIR as `filename`, then close it to free memory."""
    # bbox_inches="tight" keeps long tick labels (e.g. heatmap feature names)
    # inside the saved image instead of clipping them at the canvas edge.
    fig.savefig(os.path.join(OUT_DIR, filename), bbox_inches="tight")
    plt.close(fig)


# Fraud vs Legitimate
if _has_columns("fraud_vs_legit.png", LABEL_COL):
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=transactions, x=LABEL_COL, hue=LABEL_COL,
                  palette="Set2", legend=False, ax=ax)
    ax.set_title("Fraud vs Legitimate Transactions")
    _save_figure(fig, "fraud_vs_legit.png")

# Transaction Amount Distribution
if _has_columns("amount_distribution.png", "transaction_amount"):
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(data=transactions, x="transaction_amount", bins=50, kde=True, ax=ax)
    ax.set_title("Transaction Amount Distribution")
    _save_figure(fig, "amount_distribution.png")

# Fraud by Transaction Type / Device Type / Merchant Category:
# identical count plots split by the fraud label, differing only in the
# column, title, output file, palette and figure size.
CATEGORY_PLOTS = [
    ("transaction_type", "Fraud by Transaction Type", "fraud_by_type.png", "coolwarm", (8, 5)),
    ("device_type", "Fraud by Device Type", "fraud_by_device.png", "Set1", (8, 5)),
    ("merchant_category", "Fraud by Merchant Category", "fraud_by_merchant.png", "Dark2", (10, 5)),
]
for cat_col, cat_title, cat_file, cat_palette, cat_figsize in CATEGORY_PLOTS:
    if not _has_columns(cat_file, cat_col, LABEL_COL):
        continue
    fig, ax = plt.subplots(figsize=cat_figsize)
    sns.countplot(data=transactions, x=cat_col, hue=LABEL_COL, palette=cat_palette, ax=ax)
    ax.set_title(cat_title)
    _save_figure(fig, cat_file)

# Risk Score Distribution
if _has_columns("risk_score.png", "risk_score", LABEL_COL):
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(data=transactions, x="risk_score", bins=50, kde=True,
                 hue=LABEL_COL, palette="husl", ax=ax)
    ax.set_title("Risk Score Distribution by Fraud Label")
    _save_figure(fig, "risk_score.png")

# Correlation Heatmap (numeric columns only; non-numeric ones are ignored)
fig, ax = plt.subplots(figsize=(12, 8))
corr = transactions.corr(numeric_only=True)
sns.heatmap(corr, cmap="coolwarm", center=0, ax=ax)
ax.set_title("Feature Correlation Heatmap")
_save_figure(fig, "correlation_heatmap.png")

print("EDA plots saved in", OUT_DIR)


EDA plots saved in ../eda_outputs
