In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the synthetic transaction table into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv(
    filepath_or_buffer="../data/synthetic_data.csv",
)

In [None]:
# Headline class-balance numbers; the reporting and export cells further down reuse them.
total = df.shape[0]             # one row per transaction
fraud = df['is_fraud'].sum()    # assumes is_fraud is a 0/1 (or boolean) flag, so the sum is a count
legit = total - fraud           # every row not counted as fraud
fraud_pct = fraud / total * 100

In [None]:
# Report the class balance as three lines; the legit share is the complement of fraud_pct.
print(
    f"Total transactions: {total}",
    f"Fraudulent: {fraud} ({fraud_pct:.2f}%)",
    f"Legit: {legit} ({100 - fraud_pct:.2f}%)",
    sep="\n",
)

In [None]:
# Bar chart of the class balance: one bar per is_fraud value, ordered by class (0 first, then 1).
class_counts = df["is_fraud"].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(6, 4))
class_counts.plot.bar(ax=ax, color=['green', 'red'])
ax.set_title("Fraud vs Legitimate Transactions")
# Swap the raw class values on the x axis for readable names, kept horizontal.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Legit', 'Fraud'], rotation=0)
ax.set_ylabel("Count")
ax.grid(True, axis='y')  # horizontal guide lines only
fig.tight_layout()
plt.show()

In [None]:
# Pie chart of the class balance.
# value_counts() orders its result by frequency, not by class value, so fixed positional
# label/colour lists would be attached to the wrong wedges whenever fraud is the majority
# class. Sort by class value and derive each wedge's label and colour from the class it
# actually represents (assumes is_fraud is a 0/1 or boolean flag).
class_counts = df["is_fraud"].value_counts().sort_index()
pie_labels = ['Fraud' if is_fraud_class else 'Legit' for is_fraud_class in class_counts.index]
pie_colors = ['red' if is_fraud_class else 'green' for is_fraud_class in class_counts.index]

plt.figure(figsize=(5, 5))
class_counts.plot.pie(labels=pie_labels, autopct='%1.1f%%', colors=pie_colors, startangle=90)
plt.title("Fraud Distribution")
plt.ylabel("")  # drop the axis label pandas would otherwise draw beside the pie
plt.tight_layout()
plt.show()

In [None]:
# Class-count bar chart with the exact count written above each bar.
counts = df["is_fraud"].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(6, 4))
bars = ax.bar(['Legit', 'Fraud'], counts, color=['green', 'red'])
for rect in bars:
    height = rect.get_height()
    x_center = rect.get_x() + rect.get_width() / 2.0
    # Anchor the text 2 data units above the bar top, centred horizontally on the bar.
    ax.text(x_center, height + 2, int(height), ha='center', va='bottom')
ax.set_title("Annotated Fraud Distribution")
ax.set_ylabel("Transaction Count")
fig.tight_layout()
plt.show()

In [None]:
# Step histograms of the transaction amount, split by is_fraud (0 -> green, 1 -> red),
# each with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(
    data=df,
    x='amount',
    hue='is_fraud',
    bins=30,
    kde=True,
    element='step',
    palette={0: 'green', 1: 'red'},
    ax=ax,
)
ax.set_title("Transaction Amount Distribution by Fraud Status")
ax.set_xlabel("Amount")
fig.tight_layout()
plt.show()

In [None]:
# Fraud rate per transaction type; skipped entirely when the dataset has no such column.
if 'transaction_type' in df.columns:
    # Mean of is_fraud within each type (the fraud share, assuming a 0/1 flag), sorted ascending.
    fraud_flag_by_type = df.groupby("transaction_type")["is_fraud"]
    rate_by_type = fraud_flag_by_type.mean().sort_values()
    ax = rate_by_type.plot.barh(color='purple')
    ax.set_title("Fraud Rate by Transaction Type")
    ax.set_xlabel("Fraud Rate")
    plt.tight_layout()
    plt.show()

In [None]:
# Optional: persist the headline fraud statistics as a one-row CSV next to the input data.
summary = dict(
    total=total,
    fraud=fraud,
    legit=legit,
    fraud_pct=fraud_pct,
)
# Wrapping the dict in a list gives a single row whose columns are the dict keys.
summary_df = pd.DataFrame(data=[summary])
summary_df.to_csv(path_or_buf="../data/fraud_summary.csv", index=False)
print("Saved summary to ../data/fraud_summary.csv")