# Fraud Detection Pipeline
This notebook contains the consolidated fraud detection pipeline from Weeks 5-7.
It loads transaction data, engineers features, trains XGBoost, and logs to MLflow.

In [None]:
# =============================================================================
# Cell 1: Imports
# =============================================================================
# We import the core libraries needed for our fraud detection pipeline:
# - pandas/numpy for data manipulation
# - xgboost for our gradient boosting classifier
# - sklearn for evaluation metrics and data splitting
# - mlflow for experiment tracking and model logging

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.xgboost
import warnings

# Suppress warnings to keep notebook output clean during demos.
# NOTE(review): this filter is process-wide and is given no category, so it
# hides every warning raised by later cells as well (deprecations, metric
# warnings, etc.). Consider narrowing it if a warning is ever needed for debugging.
warnings.filterwarnings('ignore')

# Reaching this line means every import above succeeded.
print("All imports successful!")

In [None]:
# =============================================================================
# Cell 2: Load Data
# =============================================================================
# Read the sample transactions CSV (the path is relative to the notebook's
# working directory) and print a few quick sanity checks on what was loaded.

TRANSACTIONS_CSV = '../data/transactions_sample.csv'
df = pd.read_csv(TRANSACTIONS_CSV)

# Size and schema at a glance: (rows, columns), then the column names.
column_names = list(df.columns)
print(f"Dataset shape: {df.shape}")
print(f"Columns: {column_names}")

# The mean of the binary 'is_fraud' label is the share of rows marked as fraud.
# A small value here means the classes are heavily imbalanced, which matters
# later when choosing evaluation metrics and a training strategy.
fraud_rate = df['is_fraud'].mean()
fraud_pct = fraud_rate * 100
print(f"Fraud rate: {fraud_rate:.4f} ({fraud_pct:.2f}%)")

In [None]:
# =============================================================================
# Cell 3: Explore Data
# =============================================================================
# Read-only look at the raw frame before any features are built: a preview,
# the column dtypes, missing-value counts, the label balance, and how the
# transaction amount is distributed within each label.
# Each section ends with a blank line (end="\n\n") to separate it from the next.

# Preview of the first rows
print("=== First 5 Rows ===")
print(df.head(), end="\n\n")

# (rows, columns) once more for confirmation
print(f"Shape: {df.shape}", end="\n\n")

# Which columns are numeric and which are not
print("=== Data Types ===")
print(df.dtypes, end="\n\n")

# Per-column count of missing entries
print("=== Missing Values ===")
missing_per_column = df.isnull().sum()
print(missing_per_column, end="\n\n")

# Number of rows per label value
print("=== Fraud Distribution ===")
label_counts = df['is_fraud'].value_counts()
print(label_counts, end="\n\n")

# Summary statistics of 'amount', computed separately for each label value
print("=== Amount by Fraud Status ===")
amount_by_label = df.groupby('is_fraud')['amount']
print(amount_by_label.describe())

In [None]:
# =============================================================================
# Cell 4: Time-Based Feature Engineering
# =============================================================================
# Fraud patterns often depend on WHEN a transaction occurs.
# Transactions at 3am on a Sunday are more suspicious than at noon on Tuesday.
# We create several time-based features to capture these patterns.

# Our dataset already has 'hour' (0-23) and 'day_of_week' (0=Mon, 6=Sun).
# We use these to create binary and cyclical features.

# Night window: 10pm (inclusive) to 6am (exclusive). 'hour' is the hour in
# which the transaction started, so hour == 6 means 06:00-06:59 and must NOT
# count as night; the end bound is therefore compared with a strict '<'.
NIGHT_START_HOUR = 22
NIGHT_END_HOUR = 6

# Is this a weekend transaction? Fraud patterns differ on weekends.
# day_of_week: Monday=0, Sunday=6, so >= 5 means Saturday or Sunday
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

# Is this a nighttime transaction? The window wraps around midnight, so it is
# the union of "late evening" (>= 22) and "early morning" (< 6).
df['is_night'] = (
    (df['hour'] >= NIGHT_START_HOUR) | (df['hour'] < NIGHT_END_HOUR)
).astype(int)

# Cyclical encoding of hour using sin/cos.
# Why? Because hour 23 and hour 0 are close in time but far apart numerically.
# Sin/cos encoding preserves this cyclical relationship.
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

# Cyclical encoding of day of week (same reasoning as hour, period of 7 days)
df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

# Quick summary of what was just created: value counts for the binary flags
# and min/max for the cyclical encodings.
print("Time features created:")
print(f"  is_weekend: {df['is_weekend'].value_counts().to_dict()}")
print(f"  is_night:   {df['is_night'].value_counts().to_dict()}")
print(f"  hour_sin range: [{df['hour_sin'].min():.3f}, {df['hour_sin'].max():.3f}]")
print(f"  hour_cos range: [{df['hour_cos'].min():.3f}, {df['hour_cos'].max():.3f}]")
print(f"  day_sin range:  [{df['day_sin'].min():.3f}, {df['day_sin'].max():.3f}]")
print(f"  day_cos range:  [{df['day_cos'].min():.3f}, {df['day_cos'].max():.3f}]")

In [None]:
# =============================================================================
# Cell 5: Amount-Based Feature Engineering
# =============================================================================
# Derive three re-expressions of the raw transaction amount and add them to
# df as new columns: a log transform, a z-score, and a percentile rank.
# NOTE(review): the mean, standard deviation and ranks below are computed over
# every row currently in df, including rows a later split may hold out.

amounts = df['amount']

# log(1 + amount): compresses the long right tail and is defined at amount == 0.
df['amount_log'] = np.log1p(amounts)

# Z-score: distance from the mean amount, in units of standard deviations.
amount_mean = amounts.mean()
amount_std = amounts.std()
df['amount_zscore'] = (amounts - amount_mean) / amount_std

# Percentile rank in (0, 1]: e.g. 0.99 means the amount sits at the 99th
# percentile of all amounts in df.
df['amount_percentile'] = amounts.rank(pct=True)

# Report the min/max of each new column; labels are padded to a common width
# so the opening brackets line up.
print("Amount features created:")
for amount_col in ('amount_log', 'amount_zscore', 'amount_percentile'):
    range_label = f"{amount_col} range:"
    col_min = df[amount_col].min()
    col_max = df[amount_col].max()
    print(f"  {range_label:<25}[{col_min:.3f}, {col_max:.3f}]")

In [None]:
# =============================================================================
# Cell 6: Train/Test Split
# =============================================================================
# We split our data into training and test sets BEFORE training, and make sure
# that no statistic computed from test rows ends up inside a training feature.

# Define the feature columns we'll use for training.
# These are all the engineered features we created above.
feature_cols = [
    'amount', 'amount_log', 'amount_zscore', 'amount_percentile',
    'is_weekend', 'is_night',
    'hour_sin', 'hour_cos', 'day_sin', 'day_cos'
]

X = df[feature_cols]
y = df['is_fraud']

# Stratified split ensures both train and test sets have the same fraud rate.
# This is critical for imbalanced datasets - without stratification, the test
# set might have no fraud cases at all!
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Leakage guard: the df-level 'amount_zscore' and 'amount_percentile' columns
# were derived from the mean/std/ranks of ALL rows, test rows included. Here
# they are re-derived from training rows only, and the same training statistics
# are then applied to the test rows. Columns are overwritten in place, so the
# column order (and therefore the mapping to feature_cols) is unchanged.
# Explicit copies keep these overwrites from touching df.
X_train = X_train.copy()
X_test = X_test.copy()

train_amount = X_train['amount']
train_mean = train_amount.mean()
train_std = train_amount.std()
# Sorted, NaN-free training amounts act as the reference distribution.
train_sorted = np.sort(train_amount.dropna().to_numpy(dtype=float))
reference_size = max(len(train_sorted), 1)  # avoid 0/0 if every amount is NaN

for split_frame in (X_train, X_test):
    split_amount = split_frame['amount'].to_numpy(dtype=float)
    split_frame['amount_zscore'] = (split_amount - train_mean) / train_std
    # Empirical CDF: fraction of training amounts <= this amount.
    ecdf = np.searchsorted(train_sorted, split_amount, side='right') / reference_size
    # Missing amounts stay missing instead of being ranked at the top.
    split_frame['amount_percentile'] = np.where(np.isnan(split_amount), np.nan, ecdf)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size:     {X_test.shape[0]} samples")
print(f"Training fraud rate: {y_train.mean():.4f}")
print(f"Test fraud rate:     {y_test.mean():.4f}")
print(f"Number of features:  {len(feature_cols)}")

In [None]:
# =============================================================================
# Cell 7: Train XGBoost Model
# =============================================================================
# XGBoost is an excellent choice for tabular fraud detection because:
# - It handles imbalanced classes well (via scale_pos_weight)
# - It captures non-linear feature interactions automatically
# - It's fast to train and highly accurate on structured data

# Class-imbalance weight: ratio of negative to positive training examples,
# computed from the training labels only (assumes a 0/1 'is_fraud' label).
# Falls back to 1.0 (no re-weighting) if there are no positive examples.
n_positive = int((y_train == 1).sum())
n_negative = int((y_train == 0).sum())
scale_pos_weight = float(n_negative / n_positive) if n_positive else 1.0

# Define hyperparameters for our XGBoost classifier.
# 'use_label_encoder' is intentionally not passed: it is deprecated in recent
# XGBoost releases and is not needed for integer 0/1 labels.
params = {
    'max_depth': 6,                        # Maximum tree depth - controls model complexity
    'n_estimators': 100,                   # Number of boosting rounds (trees)
    'learning_rate': 0.1,                  # Step size shrinkage - prevents overfitting
    'eval_metric': 'logloss',              # Binary cross-entropy loss for classification
    'scale_pos_weight': scale_pos_weight,  # Up-weight the rare fraud class
    'random_state': 42,                    # Reproducibility
}

# Create and train the model
model = xgb.XGBClassifier(**params)
model.fit(X_train, y_train)

print("Model trained successfully!")
print(f"Parameters: {params}")

In [None]:
# =============================================================================
# Cell 8: Evaluate Model
# =============================================================================
# Accuracy alone is misleading on imbalanced data (predicting "no fraud" for
# every row already scores very high), so several metrics are reported:
# - Precision: of the transactions flagged as fraud, the share that really are
# - Recall:    of the real frauds, the share that were flagged
# - F1:        harmonic mean of precision and recall
# - ROC-AUC:   how well the predicted probabilities rank fraud above non-fraud

# Score the held-out test set: fraud-class probabilities and hard 0/1 labels.
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# The four label-based metrics share the (y_true, y_pred) signature;
# ROC-AUC is computed from the probabilities instead.
accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Labels are left-padded to a common width so the values line up.
print("=== Model Evaluation ===")
metric_rows = (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
    ("ROC-AUC:", roc_auc),
)
for metric_label, metric_value in metric_rows:
    print(f"{metric_label:<11}{metric_value:.4f}")
print()

# Confusion matrix layout: rows are true labels, columns are predicted labels,
# i.e. [[True Neg, False Pos], [False Neg, True Pos]].
cm = confusion_matrix(y_test, y_pred)
print("=== Confusion Matrix ===")
cell_rows = (
    ("True Negatives:", (0, 0)),
    ("False Positives:", (0, 1)),
    ("False Negatives:", (1, 0)),
    ("True Positives:", (1, 1)),
)
for cell_label, (true_idx, pred_idx) in cell_rows:
    print(f"  {cell_label:<17}{cm[true_idx, pred_idx]}")

In [None]:
# =============================================================================
# Cell 9: MLflow Logging
# =============================================================================
# Record this training run in MLflow: the hyperparameters used, the evaluation
# metrics obtained, and the fitted model itself so it can be reloaded later.

EXPERIMENT_NAME = "fraud-detection"
RUN_NAME = "xgboost-baseline"

# Metrics to record, keyed by the name they will have in MLflow.
run_metrics = {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1_score': f1,
    'roc_auc': roc_auc
}

# Runs are grouped under the experiment name.
mlflow.set_experiment(EXPERIMENT_NAME)

with mlflow.start_run(run_name=RUN_NAME):
    # What was configured
    mlflow.log_params(params)

    # How well it performed
    mlflow.log_metrics(run_metrics)

    # The trained model, stored as an artifact named "model"
    mlflow.xgboost.log_model(model, "model")

    # Summary of what was written to the run
    print("MLflow logging complete!")
    print(f"  Experiment: {EXPERIMENT_NAME}")
    print(f"  Run name:   {RUN_NAME}")
    print(f"  Params logged: {len(params)}")
    print(f"  Metrics logged: {len(run_metrics)}")

In [None]:
# =============================================================================
# Cell 10: Feature Importance
# =============================================================================
# Rank the input features by the importance scores the fitted model exposes.
# Useful for explaining the model, pruning features in later iterations, and
# checking that the model relies on sensible signals.

# Pair each feature name with its score (feature_importances_ is in the same
# order as the training columns), then order from most to least important.
importance_df = (
    pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_
    })
    .sort_values(by='importance', ascending=False)
    .reset_index(drop=True)
)

print("=== Feature Importance (sorted) ===")
for feat_name, feat_score in zip(importance_df['feature'], importance_df['importance']):
    # Text bar chart: one '#' per 0.02 of importance (score * 50, truncated)
    bar = '#' * int(feat_score * 50)
    print(f"  {feat_name:25s} {feat_score:.4f} {bar}")