In [None]:
# Fraud Detection Notebook - Enhanced Version

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import StandardScaler
import numpy as np

In [None]:
# Load dataset
# Read the synthetic transactions CSV (path is relative to the notebook's
# working directory) into the notebook-wide frame `df`.
df = pd.read_csv(filepath_or_buffer="../data/synthetic_data.csv")

In [None]:
# Display basic summary
# describe(include='all') covers every column, not only the numeric ones;
# the second table is the per-column count of missing cells.
print("=== Summary Statistics ===")
summary_stats = df.describe(include='all')
print(summary_stats)
print("\nMissing Values:")
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Fraud Distribution
# Bar chart of how many rows carry each is_fraud label.
# value_counts() orders its result by descending frequency, not by label, so
# the counts are re-sorted by label first. That guarantees bar 0 is class 0
# (Legit) and bar 1 is class 1 (Fraud); without it the hard-coded tick labels
# and colours below would be swapped whenever fraud rows outnumber legit rows.
plt.figure(figsize=(6, 4))
df['is_fraud'].value_counts().sort_index().plot(kind='bar', color=['blue', 'orange'])
plt.title("Fraud vs Legit Transactions")
plt.xticks([0, 1], ['Legit', 'Fraud'])
plt.ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
# Transaction Amount Distribution
# 30-bin histogram of the `amount` column with a KDE curve overlaid.
amount_fig, amount_ax = plt.subplots(figsize=(8, 4))
sns.histplot(df['amount'], bins=30, kde=True, ax=amount_ax)
amount_ax.set_title("Transaction Amount Distribution")
amount_ax.set_xlabel("Amount")
amount_ax.set_ylabel("Frequency")
amount_fig.tight_layout()
plt.show()

In [None]:
# Fraud Rate by Transaction Type
# The mean of the is_fraud flag inside each transaction_type group is that
# group's fraud rate (given the flag is 0/1, as the `== 1` filter used later
# in the notebook suggests).
fraud_rate_by_type = (
    df.groupby("transaction_type")["is_fraud"]
    .mean()
)
type_ax = fraud_rate_by_type.plot.bar(color='purple')
type_ax.set_title("Fraud Rate by Transaction Type")
type_ax.set_ylabel("Fraud Rate")
plt.show()

In [None]:
# High Risk Locations
# Per-location mean of is_fraud, ordered from the highest rate to the lowest
# so the riskiest locations appear first on the x-axis.
location_fraud = (
    df.groupby('location')['is_fraud']
    .mean()
    .sort_values(ascending=False)
)
location_ax = location_fraud.plot.bar(color='red')
location_ax.set_title("Fraud Rate by Location")
location_ax.set_ylabel("Fraud Rate")
plt.show()

In [None]:
# Feature Selection
# Column to predict.
target = 'is_fraud'
# Input columns handed to the scaler and the model.
# NOTE(review): merchant_id looks like an identifier rather than a quantity;
# confirm it is numeric and meaningful as a continuous feature.
features = [
    'amount',
    'is_international',
    'merchant_id',
]

In [None]:
# Feature matrix (columns listed in `features`) and label vector (`target`).
X, y = df[features], df[target]

In [None]:
# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [None]:
# Train Logistic Regression
# Default-configured estimator, fitted on the training fold only.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict and evaluate
# Hard class predictions for the held-out fold, followed by the per-class
# precision / recall / F1 text report.
y_pred = model.predict(X_test)
print("=== Classification Report ===")
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# Confusion Matrix
# Counts of (actual, predicted) label pairs on the test fold, drawn as an
# annotated heatmap: rows are labelled "Actual", columns "Predicted".
conf_matrix = confusion_matrix(y_test, y_pred)
class_names = ['Legit', 'Fraud']
cm_ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
cm_ax.set_title("Confusion Matrix")
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
plt.show()

In [None]:
# ROC Curve
# Column 1 of predict_proba is the score for model.classes_[1]; it is used
# here as the fraud score. The thresholds returned by roc_curve are not
# needed, hence the throwaway `_`.
y_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_true=y_test, y_score=y_prob)
# Area under the (fpr, tpr) curve.
roc_auc = auc(x=fpr, y=tpr)

In [None]:
# Draw the ROC curve computed above, with the AUC in the legend and the
# dashed diagonal (y = x) as a visual baseline.
roc_fig, roc_ax = plt.subplots(figsize=(6, 4))
roc_ax.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve")
roc_ax.legend()
plt.show()

In [None]:
# Daily Transaction Volume
# The parsed timestamps are written back onto df, so every later cell
# (including the final save) sees the datetime column.
df['timestamp'] = pd.to_datetime(df['timestamp'])
# Number of rows falling in each calendar-day bin.
daily_volume = (
    df.set_index('timestamp')
    .resample('D')
    .size()
)
volume_ax = daily_volume.plot(figsize=(10, 4), title="Daily Transaction Volume")
volume_ax.set_ylabel("Transactions")
volume_ax.set_xlabel("Date")
plt.tight_layout()
plt.show()

In [None]:
# Device Type vs Fraud
# Mean of the is_fraud flag for each device_type group, shown as a bar chart.
device_fraud = (
    df.groupby("device_type")["is_fraud"]
    .mean()
)
device_ax = device_fraud.plot.bar(color='orange')
device_ax.set_title("Fraud Rate by Device Type")
device_ax.set_ylabel("Fraud Rate")
plt.show()

In [None]:
# Transactions by Merchant
# value_counts() is already ordered by descending frequency, so the first ten
# entries are the ten merchants with the most transactions.
top_merchants = df['merchant_id'].value_counts().head(10)
merchant_ax = top_merchants.plot.bar(title='Top 10 Merchants by Transaction Count')
merchant_ax.set_xlabel("Merchant ID")
merchant_ax.set_ylabel("Transaction Count")
plt.tight_layout()
plt.show()

In [None]:
# Fraud Transactions Only
# Keep only the rows whose is_fraud flag equals 1, report how many there are
# relative to the full frame, and preview the first few.
is_fraud_row = df['is_fraud'].eq(1)
fraud_df = df.loc[is_fraud_row]
print(f"Fraud Transactions: {len(fraud_df)} out of {len(df)}")
print(fraud_df.head())

In [None]:
# Save processed data
# Write df as it stands at this point in the notebook (including the parsed
# timestamp column) without the row index, then confirm on stdout.
df.to_csv(path_or_buf="../data/processed_data.csv", index=False)
print("Processed data saved to ../data/processed_data.csv")