In [None]:
# ============================================================
# NOTEBOOK 3: FEATURE ENGINEERING - ATO Risk Profiler
# ============================================================
# Goal: Create predictive features from raw transaction data.
# Key Features:
#   - Velocity (speed of transactions)
#   - Aggregations (user history)
#   - Time & Behavior patterns
# ============================================================

import os
import warnings
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# --- Notebook-wide settings ---------------------------------
# Silence all warnings, switch seaborn to the whitegrid theme, and never
# truncate columns when a DataFrame is displayed.
warnings.filterwarnings('ignore')
sns.set_theme(style="whitegrid")
pd.set_option('display.max_columns', None)

# --- Load and order the raw transactions --------------------
# Read the CSV, parse `timestamp` into datetimes, then order the rows by user
# and time with a fresh 0..n-1 index. The per-user diff / shift / rolling
# features computed in later cells rely on this ordering.
df = (
    pd.read_csv('../data/simulated_transactions.csv')
    .assign(timestamp=lambda raw: pd.to_datetime(raw['timestamp']))
    .sort_values(['user_id', 'timestamp'])
    .reset_index(drop=True)
)

print("Dataset loaded & sorted")
print(f"Shape: {df.shape}")
display(df.head(3))

In [None]:
# ============================================================
# 1. TIME-BASED FEATURES
# ============================================================
# Adds calendar features, a cyclical encoding of the hour, and the gap (in
# seconds) to the same user's previous transaction. The gap is computed with a
# per-user diff, so it is only meaningful when each user's rows are in
# chronological order.

# Basic extraction
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek  # pandas convention: Monday=0 ... Sunday=6
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# Cyclical encoding for Hour (Preserves 23:00 -> 00:00 proximity)
hour_angle = 2 * np.pi * df['hour'] / 24
df['hour_sin'] = np.sin(hour_angle)
df['hour_cos'] = np.cos(hour_angle)

# Time since last transaction (per user).
# NOTE: a user's first transaction has no predecessor; its gap is stored as 0,
# so 0 means either "first transaction" or "same timestamp as the previous one".
df['time_diff_seconds'] = (
    df.groupby('user_id')['timestamp']
    .diff()
    .dt.total_seconds()
    .fillna(0)
)

print("✅ Time features created:")
# Rich display, consistent with the previews in the other cells.
display(df[['timestamp', 'hour', 'is_weekend', 'hour_sin', 'time_diff_seconds']].head())

In [None]:
# ============================================================
# 2. VELOCITY FEATURES (Rolling Windows)
# ============================================================
# How many transactions did this user do in the last X hours?
# Windows are time-based and, with pandas' default for offset windows, end at
# (and include) the current transaction, so every count is at least 1.

# Time-based rolling needs a datetime index that is increasing within each
# user. Sort explicitly here instead of trusting the row order left behind by
# an earlier cell.
df_ordered = df.sort_values(['user_id', 'timestamp'])
df_rolling = df_ordered.set_index('timestamp').groupby('user_id')


def _rolling_amount(window, how):
    """Aggregate `amount` over a trailing per-user time window.

    Parameters
    ----------
    window : str
        Pandas offset string such as '1h', '24h' or '7d'.
    how : str
        Name of the rolling aggregation to call ('count' or 'sum').

    Returns
    -------
    pd.Series
        One value per row, labelled with the row labels of ``df_ordered``.

    Raises
    ------
    ValueError
        If the rolling result does not have one value per row (for example
        when rows with a missing ``user_id`` were dropped by the groupby).
    """
    rolled = getattr(df_rolling['amount'].rolling(window), how)()
    # groupby-rolling returns rows user by user (ascending user_id) in their
    # within-user order, i.e. the row order of `df_ordered`. Re-labelling with
    # that index makes the assignment to `df` align by label, not by position.
    return pd.Series(rolled.to_numpy(), index=df_ordered.index)


# Count transactions in last 1 hour, 24 hours, 7 days
count_windows = {'tx_count_1h': '1h', 'tx_count_24h': '24h', 'tx_count_7d': '7d'}
for feature_name, window in count_windows.items():
    df[feature_name] = _rolling_amount(window, 'count')

# Sum of amount in last 24h
df['amount_sum_24h'] = _rolling_amount('24h', 'sum')

print("✅ Velocity features created (tx counts per window)")
display(df[['user_id', 'timestamp', 'tx_count_1h', 'tx_count_24h']].head(10))

In [None]:
# ============================================================
# 3. BEHAVIORAL AGGREGATIONS (User History)
# ============================================================
# Comparing current transaction vs user's historical average

# Historical average amount = expanding mean of the user's PREVIOUS amounts.
# shift(1) keeps the current transaction out of its own baseline; without it
# the ratio below is capped at n (the number of transactions so far) and is
# always exactly 1 on a user's first transaction.
df['avg_amount_history'] = (
    df.groupby('user_id')['amount']
    .transform(lambda amounts: amounts.expanding().mean().shift(1))
)

# Ratio: Current Amount / Average Amount
# High ratio (>3 or >5) indicates potential anomaly
amount_ratio = df['amount'] / df['avg_amount_history']
# No history yet (first transaction) gives NaN and a zero historical mean
# gives +/-inf; both are mapped to the neutral value 1.
df['amount_ratio'] = amount_ratio.replace([np.inf, -np.inf], np.nan).fillna(1)

# Change flags: 1 if the value differs from the same user's previous
# transaction, 0 otherwise. A user's first transaction has nothing to compare
# against (shift yields NaN, and NaN != value is True), so it is explicitly
# forced to 0 instead of being reported as a change.
has_previous_tx = df.groupby('user_id').cumcount() > 0

change_flags = (
    ('device_type', 'device_changed'),        # Device change tracking
    ('merchant_country', 'country_changed'),  # Country change tracking
)
for source_col, flag_col in change_flags:
    previous_value = df.groupby('user_id')[source_col].shift(1)
    df[flag_col] = ((previous_value != df[source_col]) & has_previous_tx).astype(int)

print("✅ Behavioral features created")
display(df[['user_id', 'amount', 'avg_amount_history', 'amount_ratio', 'device_changed']].head())

In [None]:
# ============================================================
# 4. GEO-VELOCITY (Simplified Impossible Travel)
# ============================================================
# Did the user change country in a very short time?

# Create a risk flag if:
# Country changed AND time difference < 2 hours (7200 seconds)
# This is a strong indicator of VPN/Proxy or Simultaneous Login (ATO)

QUICK_CHANGE_WINDOW_SECONDS = 2 * 60 * 60  # 2 hours = 7200 seconds

quick_change_mask = (
    (df['country_changed'] == 1)
    & (df['time_diff_seconds'] < QUICK_CHANGE_WINDOW_SECONDS)
    # Ignore same timestamp. A gap of exactly 0 is also what first
    # transactions carry when time_diff_seconds was filled with 0.
    & (df['time_diff_seconds'] > 0)
)
df['quick_country_change'] = quick_change_mask.astype(int)

# Check how many flagged
print(f"🚩 Suspicious 'Impossible Travel' events found: {df['quick_country_change'].sum()}")

# Inspect some cases
display(
    df.loc[
        quick_change_mask,
        ['user_id', 'timestamp', 'merchant_country', 'time_diff_seconds', 'is_fraud'],
    ].head()
)

In [None]:
# ============================================================
# 5. CLEANUP & EXPORT
# ============================================================
# Builds the modelling table (metadata + features + target) and writes it to
# ../data/processed/features_engineered.csv.

# Select columns for Model Training
features = [
    'amount', 'hour_sin', 'hour_cos', 'day_of_week', 'is_weekend', # Time
    'tx_count_1h', 'tx_count_24h', 'tx_count_7d', 'amount_sum_24h', # Velocity
    'time_diff_seconds', 'amount_ratio', # Behavior
    'device_changed', 'country_changed', 'quick_country_change' # Risk Flags
]

target = 'is_fraud'
meta = ['transaction_id', 'user_id', 'timestamp', 'fraud_type'] # Metadata to keep but not train on

# Final DataFrame
df_final = df[meta + features + [target]].copy()

# Fill NaNs in the feature columns only (usually first transactions).
# A blanket fillna(0) would also write 0 into metadata such as `fraud_type`
# or `timestamp`, and would silently turn a missing label into "not fraud".
df_final[features] = df_final[features].fillna(0)

print(f"✅ Final Feature Set Ready: {df_final.shape}")
print(f"   Features: {len(features)}")
print(f"   Target: {target}")

# Save to processed data folder (`os` is imported with the other imports at
# the top of the notebook).
output_dir = '../data/processed'
os.makedirs(output_dir, exist_ok=True)
df_final.to_csv(f'{output_dir}/features_engineered.csv', index=False)
print("\n💾 Saved to: data/processed/features_engineered.csv")

In [None]:
# ============================================================
# VALIDATION: Feature Power (Clean Boxplots)
# ============================================================
# One boxplot per feature, split by the fraud label, followed by the per-class
# medians as a numeric cross-check.

# Features to analyze (Using 7-day velocity for better spread) and the
# human-readable title for each, in matching order.
features_to_check = ['amount_ratio', 'time_diff_seconds', 'tx_count_7d']
feature_names = ['Amount Ratio (vs Avg)', 'Time Since Last Tx (Sec)', 'Velocity (7-day count)']

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for ax, col, label in zip(axes, features_to_check, feature_names):
    # Outliers are hidden so the boxes themselves stay readable.
    sns.boxplot(
        data=df_final,
        x='is_fraud',
        y=col,
        hue='is_fraud',
        palette=["#3498db", "#e74c3c"],
        showfliers=False,
        width=0.5,
        legend=False,
        ax=ax,
    )

    ax.set_title(f'Discriminative Power:\n{label}', fontsize=12, fontweight='bold')
    ax.set_xlabel('Class (0=Legit, 1=Fraud)', fontsize=10)
    ax.set_ylabel('Feature Value', fontsize=10)
    ax.grid(axis='y', linestyle='--', alpha=0.7)

fig.tight_layout()
plt.show()

# Per-class medians of the same features, to confirm the separation numerically.
print("Median Values Comparison (Fraud vs Legit):")
per_class = df_final.groupby('is_fraud')
medians = per_class[features_to_check].median()
display(medians)