# Exploratory Data Analysis (EDA)

In-depth exploration of the credit card transactions dataset, focusing on data quality, class imbalance, temporal patterns, user demographics, merchant/category behavior, and distance-based signals.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display settings: never truncate columns when previewing
# frames, and apply the seaborn-flavoured matplotlib style sheet to all figures.
pd.options.display.max_columns = None
plt.style.use("seaborn-v0_8")


In [None]:
# Load raw data
from pathlib import Path

# When the working directory is itself named "notebook", treat its parent as
# the project root; otherwise the working directory is the root.
cwd = Path.cwd()
project_root = cwd.parent if cwd.name == "notebook" else cwd

data_path = project_root.joinpath("data", "fraudTest.csv")

# Drop the index-like column if present; errors="ignore" makes the drop a
# no-op when the column is absent.
raw = pd.read_csv(data_path).drop(columns=["Unnamed: 0"], errors="ignore")

raw.head()


In [None]:
# Basic overview: frame dimensions followed by per-column dtype / non-null summary
n_rows, n_cols = raw.shape
print("Rows:", n_rows)
print("Columns:", n_cols)

raw.info()


In [None]:
# Missing values: null count per column, largest first; display only the
# columns that actually contain nulls.
missing = raw.isnull().sum().sort_values(ascending=False)
missing.loc[missing.gt(0)]


In [None]:
# Duplicates: flag rows that exactly repeat an earlier row, then count them
dup_mask = raw.duplicated()
dup_count = dup_mask.sum()
print("Duplicate rows:", dup_count)


In [None]:
# Class balance: absolute counts per label and the overall share of fraud
fraud_label = raw["is_fraud"]
fraud_counts = fraud_label.value_counts()
fraud_rate = fraud_label.mean()
print(fraud_counts)
print("Fraud rate:", round(float(fraud_rate), 6))

# countplot returns the Axes it drew on; label it directly
ax = sns.countplot(data=raw, x="is_fraud")
ax.set_title("Fraud vs Non-Fraud Transactions")
ax.set_xlabel("Is Fraud")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Convert time columns: parse both timestamp-like columns to datetime
for ts_col in ("trans_date_trans_time", "dob"):
    raw[ts_col] = pd.to_datetime(raw[ts_col])

# Calendar components of the transaction timestamp
trans_ts = raw["trans_date_trans_time"]
raw["trans_hour"] = trans_ts.dt.hour
raw["trans_dayofweek"] = trans_ts.dt.dayofweek
raw["trans_month"] = trans_ts.dt.month

preview_cols = ["trans_date_trans_time", "trans_hour", "trans_dayofweek", "trans_month"]
raw.loc[:, preview_cols].head()


In [None]:
# Age feature: whole days between date of birth and the transaction,
# expressed in years using a 365.25-day year
elapsed = raw["trans_date_trans_time"] - raw["dob"]
raw["age"] = (elapsed.dt.days / 365.25).astype("float")
raw["age"].describe()


In [None]:
# Sampling for heavy plots: cap at 200,000 rows (or the full frame if smaller);
# the fixed random_state keeps the sample reproducible across runs
sample_size = min(200000, len(raw))
raw_sample = raw.sample(n=sample_size, random_state=42)
raw_sample.shape


In [None]:
# Transaction amount distribution: raw scale, log scale, then split by label.
# Each histogram panel is (values, title, x-axis label).
hist_panels = [
    (raw_sample["amt"], "Transaction Amount Distribution", "Amount"),
    (
        np.log1p(raw_sample["amt"]),
        "Log-Scaled Transaction Amount Distribution",
        "log(1 + Amount)",
    ),
]
for values, panel_title, panel_xlabel in hist_panels:
    plt.figure(figsize=(8, 4))
    sns.histplot(values, bins=60, kde=True)
    plt.title(panel_title)
    plt.xlabel(panel_xlabel)
    plt.ylabel("Frequency")
    plt.show()

# Amount spread for each class
plt.figure(figsize=(6, 4))
sns.boxplot(data=raw_sample, x="is_fraud", y="amt")
plt.title("Amount by Fraud Label")
plt.xlabel("Is Fraud")
plt.ylabel("Amount")
plt.show()


In [None]:
# Fraud rate by amount bins
# Quantile bins over the full frame; duplicates="drop" merges bins whose edges
# coincide, so there may be fewer than ten bars.
amt_bins = pd.qcut(raw["amt"], q=10, duplicates="drop")
# Grouping by a categorical: state `observed` explicitly. Relying on the
# implicit default emits a FutureWarning on recent pandas and its meaning is
# scheduled to change; observed=False keeps the current behaviour.
fraud_by_amt = raw.groupby(amt_bins, observed=False)["is_fraud"].mean()

plt.figure(figsize=(9, 4))
fraud_by_amt.plot(kind="bar")
plt.title("Fraud Rate by Amount Decile")
plt.xlabel("Amount Decile")
plt.ylabel("Fraud Rate")
plt.xticks(rotation=45, ha="right")
plt.show()


In [None]:
# Time-based patterns: transaction volume per hour (from the sample), then
# fraud rate per hour and per weekday (from the full frame)
plt.figure(figsize=(8, 4))
sns.countplot(data=raw_sample, x="trans_hour")
plt.title("Transactions by Hour")
plt.xlabel("Hour")
plt.ylabel("Count")
plt.show()

# Each fraud-rate panel is (grouping column, title, x-axis label)
rate_panels = [
    ("trans_hour", "Fraud Rate by Hour", "Hour"),
    ("trans_dayofweek", "Fraud Rate by Day of Week", "Day of Week (0=Mon)"),
]
for time_col, panel_title, panel_xlabel in rate_panels:
    plt.figure(figsize=(8, 4))
    rate_by_time = raw.groupby(time_col)["is_fraud"].mean()
    rate_by_time.plot(kind="bar")
    plt.title(panel_title)
    plt.xlabel(panel_xlabel)
    plt.ylabel("Fraud Rate")
    plt.show()


In [None]:
# Distance feature (Haversine) between customer and merchant

def haversine(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Great-circle distance between two points given in decimal degrees.

    Parameters
    ----------
    lat1, lon1 : scalar or array-like
        Latitude / longitude of the first point(s), in degrees.
    lat2, lon2 : scalar or array-like
        Latitude / longitude of the second point(s), in degrees.
    radius_km : float, default 6371.0
        Radius of the sphere. The default is the constant the calculation
        previously hard-coded (approximately Earth's mean radius in km).

    Returns
    -------
    Distance in the same unit as ``radius_km``; scalar inputs give a scalar,
    array-like inputs are processed element-wise.
    """
    lat1, lon1, lat2, lon2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    half_dlat = (lat2 - lat1) / 2
    half_dlon = (lon2 - lon1) / 2
    a = np.sin(half_dlat) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon) ** 2
    # Floating-point rounding can leave `a` a hair outside [0, 1] for
    # (near-)antipodal points, and arcsin of a value above 1 is NaN. Clamp with
    # ufuncs so pandas Series inputs keep their index.
    a = np.minimum(1.0, np.maximum(0.0, a))
    central_angle = 2 * np.arcsin(np.sqrt(a))
    return radius_km * central_angle

# Customer location vs. merchant location, passed to haversine as
# (lat, lon) pairs in that order
customer_coords = (raw["lat"], raw["long"])
merchant_coords = (raw["merch_lat"], raw["merch_long"])
raw["distance_km"] = haversine(*customer_coords, *merchant_coords)
raw["distance_km"].describe()


In [None]:
# Fraud rate by distance bins
# Keep only finite distances so the quantile edges are well defined
valid_dist = raw["distance_km"].replace([np.inf, -np.inf], np.nan).dropna()
dist_bins = pd.qcut(valid_dist, q=10, duplicates="drop")
# Grouping by a categorical: state `observed` explicitly. Relying on the
# implicit default emits a FutureWarning on recent pandas and its meaning is
# scheduled to change; observed=False keeps the current behaviour.
fraud_by_dist = (
    raw.loc[valid_dist.index]
    .groupby(dist_bins, observed=False)["is_fraud"]
    .mean()
)

plt.figure(figsize=(9, 4))
fraud_by_dist.plot(kind="bar")
plt.title("Fraud Rate by Distance Decile")
plt.xlabel("Distance Decile")
plt.ylabel("Fraud Rate")
plt.xticks(rotation=45, ha="right")
plt.show()


In [None]:
# Age-based analysis
# The outer bins are open-ended so that customers aged 18 or younger, or older
# than 90, are reported in their own bins instead of falling outside every
# interval (pd.cut assigns NaN to out-of-range values, and groupby then drops
# those rows without any warning).
age_edges = [0, 18, 25, 35, 45, 55, 65, 75, 90, np.inf]
age_bins = pd.cut(raw["age"], bins=age_edges)
# observed=False is stated explicitly: the implicit default for categorical
# groupers is deprecated. A bin with no customers appears as an empty slot.
fraud_by_age = raw.groupby(age_bins, observed=False)["is_fraud"].mean()

plt.figure(figsize=(8, 4))
fraud_by_age.plot(kind="bar")
plt.title("Fraud Rate by Age Bin")
plt.xlabel("Age Bin")
plt.ylabel("Fraud Rate")
plt.xticks(rotation=45, ha="right")
plt.show()


In [None]:
# Categorical feature cardinality: distinct-value count for every
# object-dtype column, highest first
categorical_cols = raw.select_dtypes(include=["object"]).columns
cardinality = pd.DataFrame(
    {
        "feature": categorical_cols,
        "unique_values": raw[categorical_cols].nunique().to_numpy(),
    }
)
cardinality.sort_values(by="unique_values", ascending=False)


In [None]:
# Fraud rate for categories with enough volume

def fraud_rate_by_category(df, col, min_count=1000):
    """Mean of ``is_fraud`` per level of ``col``, restricted to frequent levels.

    Parameters
    ----------
    df : DataFrame containing ``col`` and an ``is_fraud`` column.
    col : name of the column to group by.
    min_count : a level is kept only if it occurs in at least this many rows.

    Returns
    -------
    Series indexed by level, sorted from highest to lowest fraud rate.
    """
    level_sizes = df[col].value_counts()
    frequent_levels = level_sizes.loc[level_sizes.ge(min_count)].index
    in_scope = df.loc[df[col].isin(frequent_levels)]
    per_level_rate = in_scope.groupby(col)["is_fraud"].mean()
    return per_level_rate.sort_values(ascending=False)

# One threshold drives both the filter and the printed message, so the two
# cannot drift apart.
MIN_CATEGORY_ROWS = 500

for col in ["category", "merchant", "job", "state", "city"]:
    rates = fraud_rate_by_category(raw, col, min_count=MIN_CATEGORY_ROWS)
    # The leading "\n" must be an escape sequence: a literal line break inside
    # a single-quoted f-string is a syntax error.
    print(f"\nTop fraud rates for {col} (min {MIN_CATEGORY_ROWS} rows):")
    print(rates.head(10))


In [None]:
# Numeric correlations: pairwise correlation matrix across all numeric
# columns, drawn as a heatmap with a diverging palette centred on zero
numeric_cols = raw.select_dtypes(include="number").columns
corr = raw.loc[:, numeric_cols].corr()

plt.figure(figsize=(11, 9))
sns.heatmap(data=corr, center=0, cmap="coolwarm")
plt.title("Numeric Feature Correlations")
plt.show()


In [None]:
# Point-biserial correlation with target
from scipy.stats import pointbiserialr

pb_results = []
for col in numeric_cols:
    if col == "is_fraud":
        continue
    # Drop rows where either the feature or the label is missing
    clean = raw[[col, "is_fraud"]].dropna()
    # Skip columns with a single distinct value after dropping nulls
    if clean[col].nunique() > 1:
        r, p = pointbiserialr(clean["is_fraud"], clean[col])
        pb_results.append((col, r, p))

# Rank by |r|: sorting on the signed value would push strong negative
# associations to the bottom of the table, below weak positive ones.
pb_df = pd.DataFrame(pb_results, columns=["feature", "r", "p_value"]).sort_values(
    by="r", key=lambda s: s.abs(), ascending=False
)
pb_df.head(15)


## Summary

- The dataset is highly imbalanced, so precision/recall and PR-AUC will be more informative than accuracy.
- Time-of-day, amount, distance, and category/merchant/job patterns show distinct fraud-rate differences.
- High-cardinality categorical features may need encoding or grouping for modeling.
