## EDA — Fraud_Data (e-commerce)

This notebook covers the Task 1 EDA: feature distributions, target class imbalance, and country-level fraud patterns after the IP→country merge.


In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from src.data.fraud_features import clean_fraud_df, engineer_fraud_features
from src.data.ip_utils import attach_country_by_ip_range

# Input locations. The directory is relative, so it is resolved against the
# kernel's current working directory.
RAW_DIR = Path("../data/raw")
fraud_path = RAW_DIR / "Fraud_Data.csv"
ip_path = RAW_DIR / "IpAddress_to_Country.csv"

# IP-range lookup table consumed by the country-attachment step below.
ip_map = pd.read_csv(ip_path)

# Preparation pipeline, applied in order via the project helpers:
# clean -> attach a "country" column from the IP ranges -> engineer features.
# NOTE(review): assumes each helper returns a DataFrame — confirm in src.data.
fraud = (
    pd.read_csv(fraud_path)
    .pipe(clean_fraud_df)
    .pipe(attach_country_by_ip_range, ip_map, out_col="country")
    .pipe(engineer_fraud_features)
)

# Quick look at the prepared frame.
fraud.head()


In [None]:
# Target imbalance: one row per value of "class" with its row count and its
# share of all counted rows.
class_counts = (
    fraud["class"]
    .value_counts()
    .rename_axis("class")
    .reset_index(name="count")
    .assign(rate=lambda counts: counts["count"] / counts["count"].sum())
)
class_counts


In [None]:
# Univariate example: distribution of purchase_value, one step outline per class.
# common_norm=False normalises each class on its own, so the outlines can be
# compared by shape regardless of how many rows each class has.
fig, ax = plt.subplots(figsize=(7, 4))
sns.histplot(
    data=fraud,
    x="purchase_value",
    hue="class",
    stat="density",
    common_norm=False,
    element="step",
    bins=50,
    ax=ax,
)
ax.set_title("purchase_value by class")
fig.tight_layout()
plt.show()


In [None]:
# Country-level fraud rate, restricted to the 20 countries with the most rows.
# n is the row count per country; fraud_rate is the mean of "class"
# (presumably a 0/1 label, which makes the mean a rate — confirm upstream).
# NOTE(review): groupby skips NaN keys by default, so rows without a matched
# country would be absent here — confirm how unmatched IPs are labelled.
country_stats = (
    fraud.groupby("country")
    .agg(
        n=("class", "size"),
        fraud_rate=("class", "mean"),
    )
    .reset_index()
    .sort_values("n", ascending=False)
    .head(20)
)

# Display only: the same 20 countries, highest fraud rate first.
country_stats.sort_values("fraud_rate", ascending=False)
