In [None]:
# ============================================================
# NOTEBOOK 5: ATO PATTERNS ANALYSIS
# ============================================================
# Goal: Deep dive into specific ATO attack patterns:
#   - Credential Stuffing: Multiple failed logins -> high value tx
#   - SIM Swap: Impossible geographic jumps
#   - Keylogger: Gradual behavioral divergence
#   - Brute Force: Extreme volume patterns
# ============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import networkx as nx
from datetime import timedelta
import warnings

# Configuration
# NOTE: this silences every warning for the rest of the kernel session,
# including pandas/seaborn deprecation notices.
warnings.filterwarnings('ignore')
sns.set_theme(style="whitegrid")

# Load ORIGINAL dataset (with fraud_type)
df = pd.read_csv('../data/simulated_transactions.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Load FEATURES dataset
df_features = pd.read_csv('../data/processed/features_engineered.csv')

# Engineered columns that the pattern cells below read from df_full.
feature_cols = ['amount_ratio', 'tx_count_24h', 'tx_count_7d', 'time_diff_seconds',
                'device_changed', 'country_changed', 'quick_country_change']

# The brute-force cell also reads `tx_count_1h`. Bring it along when the
# feature file provides it and the raw file does not already have it
# (a name clash would otherwise produce _x/_y suffixed columns).
optional_cols = [col for col in ['tx_count_1h']
                 if col in df_features.columns and col not in df.columns]
merged_cols = feature_cols + optional_cols

# Left-join the engineered features onto the raw transactions by
# transaction_id; raw rows without a match keep NaN in the feature columns.
df_full = df.merge(df_features[['transaction_id'] + merged_cols],
                   on='transaction_id', how='left')

print("Data loaded for pattern analysis")
print(f"Shape: {df_full.shape}")
print(f"Fraud types available: {df_full[df_full['is_fraud']==1]['fraud_type'].unique()}")
# Report the number of feature columns actually merged, not the width of the
# whole feature file.
print(f"Features merged: {len(merged_cols)}")

In [None]:
# ============================================================
# PATTERN 1: CREDENTIAL STUFFING
# ============================================================
# Rows labelled 'credential_stuffing' are compared against legitimate
# transactions whose amount exceeds 50.

is_credential_stuffing = df_full['fraud_type'].eq('credential_stuffing')
credential_df = df_full.loc[is_credential_stuffing]

mean_amount = credential_df['amount'].mean()
mean_amount_ratio = credential_df['amount_ratio'].mean()

print("Credential Stuffing Analysis:")
print(f"Total attacks: {len(credential_df)}")
print(f"Avg Amount: {mean_amount:.2f} EUR")
print(f"Amount Ratio (vs user avg): {mean_amount_ratio:.2f}")

# Comparison group: non-fraud transactions above the amount threshold
is_legit = df_full['is_fraud'].eq(0)
is_high_value = df_full['amount'].gt(50)
legit_high_amount = df_full.loc[is_legit & is_high_value]
print(f"Legitimate high-value tx: {len(legit_high_amount)}")

# Two panels side by side: amount histograms (left), ratio by device (right)
fig, (ax_amount, ax_ratio) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(data=credential_df, x='amount', bins=30, alpha=0.7,
             label='Credential Stuffing', color='red', ax=ax_amount)
sns.histplot(data=legit_high_amount, x='amount', bins=30, alpha=0.7,
             label='Legitimate High Value', color='blue', ax=ax_amount)
ax_amount.set_title('Amount Distribution Comparison')
ax_amount.legend()

sns.boxplot(data=credential_df, x='device_type', y='amount_ratio', ax=ax_ratio)
ax_ratio.set_title('Amount Ratio by Device (Credential Stuffing)')
plt.show()

In [None]:
# ============================================================
# PATTERN 2: SIM SWAP (Impossible Travel)
# ============================================================

simswap_df = df_full[df_full['fraud_type'] == 'sim_swap']

print("SIM Swap Analysis:")
print(f"Total attacks: {len(simswap_df)}")
print(f"Country changes: {simswap_df['country_changed'].sum()}")
print(f"Quick country changes (<2h): {simswap_df['quick_country_change'].sum()}")

# Show the 10 flagged cases with the shortest gap since the previous
# transaction (smallest time_diff_seconds first; NaN gaps sort last).
simswap_risky = (
    simswap_df[simswap_df['quick_country_change'] == 1]
    .sort_values('time_diff_seconds')
    .head(10)
)
display(simswap_risky[['user_id', 'timestamp', 'merchant_country', 'time_diff_seconds', 'amount']])

# Timeline visualization for a specific SIM swap case
if simswap_df.empty:
    # Nothing to pick an example user from; .iloc[0] would raise IndexError.
    print("No SIM swap transactions found - skipping timeline")
else:
    user_example = simswap_df['user_id'].iloc[0]
    user_timeline = (
        df_full[df_full['user_id'] == user_example]
        .sort_values('timestamp')
        # String labels make plotly treat the colour as categorical, so the
        # discrete colour map below is actually applied.
        .assign(tx_class=lambda d: d['is_fraud'].map({0: 'Legitimate', 1: 'Fraud'}))
    )

    # Each transaction is a point in time, so a scatter is used: px.timeline
    # draws bars from x_start to x_end, which have zero width when both are
    # the same column.
    fig = px.scatter(
        user_timeline,
        x="timestamp",
        y="merchant_country",
        color="tx_class",
        title=f"SIM Swap Timeline - User {user_example}",
        color_discrete_map={'Legitimate': 'blue', 'Fraud': 'red'},
    )
    fig.show()

In [None]:
# ============================================================
# PATTERN 3: KEYLOGGER (Behavioral Divergence)
# ============================================================

keylogger_df = df_full[df_full['fraud_type'] == 'keylogger']

# NOTE(review): assumes df_full carries an 'hour' column from the raw
# transactions file (it is not one of the merged feature columns) - confirm.
print("Keylogger Analysis:")
print(f"Total attacks: {len(keylogger_df)}")
# This is the mean hour of day of the keylogger transactions, not a deviation
# from a per-user baseline, so it is labelled as such.
print(f"Avg hour of day (keylogger tx): {keylogger_df['hour'].mean():.1f}")
print(f"Amount ratio: {keylogger_df['amount_ratio'].mean():.2f}")

# Compare user behavior before/after compromise
compromised_users = df_full[df_full['user_id'].isin(keylogger_df['user_id'].unique())]

# Each user's own first fraudulent timestamp marks their compromise. Using a
# single global minimum would discard almost all history of users who were
# compromised later than the earliest victim.
first_fraud_ts = (
    compromised_users.loc[compromised_users['is_fraud'] == 1]
    .groupby('user_id')['timestamp']
    .min()
)
compromise_ts = compromised_users['user_id'].map(first_fraud_ts)
pre_compromise = compromised_users[compromised_users['timestamp'] < compromise_ts]

# NOTE(review): this keeps every fraud row of the affected users, not only the
# rows whose fraud_type is 'keylogger' - confirm that is intended.
post_compromise = compromised_users[compromised_users['is_fraud']==1]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Hour of day
sns.histplot(pre_compromise['hour'], bins=24, alpha=0.7, label='Pre-Compromise', ax=axes[0])
sns.histplot(post_compromise['hour'], bins=24, alpha=0.7, label='Keylogger', ax=axes[0])
axes[0].set_title('Hour of Day Shift')
axes[0].legend()

# Amount (log1p compresses the long right tail)
sns.histplot(np.log1p(pre_compromise['amount']), bins=30, alpha=0.7, label='Pre-Compromise', ax=axes[1])
sns.histplot(np.log1p(post_compromise['amount']), bins=30, alpha=0.7, label='Keylogger', ax=axes[1])
axes[1].set_title('Amount Distribution Shift (Log)')
axes[1].legend()

# Device
sns.countplot(data=compromised_users, x='device_type', hue='is_fraud', ax=axes[2])
axes[2].set_title('Device Usage Shift')
axes[2].legend(title='Fraud')

plt.tight_layout()
plt.show()

In [None]:
# ============================================================
# PATTERN 4: BRUTE FORCE (Extreme Volume)
# ============================================================

bruteforce_df = df_full[df_full['fraud_type'] == 'brute_force']

print("Brute Force Analysis:")
print(f"Total attacks: {len(bruteforce_df)}")
print(f"Avg tx per user: {bruteforce_df.groupby('user_id').size().mean():.1f}")

# The whole comparison-and-sum must sit inside the f-string braces. The column
# is only reported when df_full actually carries it.
if 'tx_count_1h' in bruteforce_df.columns:
    high_velocity_count = int((bruteforce_df['tx_count_1h'] > 5).sum())
    print(f"High velocity (tx_count_1h > 5): {high_velocity_count}")
else:
    print("High velocity (tx_count_1h > 5): n/a (tx_count_1h not in df_full)")

# Velocity analysis
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.histplot(bruteforce_df['tx_count_24h'], bins=20, color='purple', alpha=0.7)
plt.title('Brute Force: 24h Velocity Distribution')
plt.xlabel('Transactions in last 24h')

plt.subplot(1, 2, 2)
# Reset the indices so the two groups sit side by side as plain columns; the
# original row labels are disjoint and would otherwise be aligned into a
# mostly-NaN frame as long as both groups combined.
normal_velocity = df_full[df_full['is_fraud']==0]['tx_count_24h'].reset_index(drop=True)
brute_velocity = bruteforce_df['tx_count_24h'].reset_index(drop=True)
sns.boxplot(data=pd.DataFrame({'Normal': normal_velocity, 'Brute Force': brute_velocity}))
plt.title('24h Velocity: Normal vs Brute Force')
plt.ylabel('Tx Count')
plt.show()

In [None]:
# ============================================================
# NETWORK ANALYSIS: Attack Relationships
# ============================================================

# Fraud transactions ordered per user by time, so that each user's rows are
# contiguous and chronologically sorted.
fraud_network = (
    df_full[df_full['is_fraud']==1]
    .sort_values(['user_id', 'timestamp'])
    .reset_index(drop=True)
)

# .tolist() yields native Python scalars, keeping node ids in their original
# type (no float upcasting as a shift()-based approach would cause).
tx_ids = fraud_network['transaction_id'].tolist()
user_ids = fraud_network['user_id'].tolist()

G = nx.Graph()

# One node per fraud transaction, annotated with its fraud type and country.
for tx_id, fraud_type, country in zip(tx_ids,
                                      fraud_network['fraud_type'].tolist(),
                                      fraud_network['merchant_country'].tolist()):
    G.add_node(tx_id, type=fraud_type, country=country)

# Link each fraud transaction to the same user's previous fraud transaction.
# After the sort, that is simply the preceding row whenever the user matches,
# which replaces a full-frame filter per row with a single pass.
# (Only same-user edges are built; no country-based edges.)
G.add_edges_from(
    (prev_tx, curr_tx)
    for prev_tx, curr_tx, prev_user, curr_user
    in zip(tx_ids, tx_ids[1:], user_ids, user_ids[1:])
    if prev_user == curr_user
)

# Visualization (top 50 nodes by degree, for readability)
top_nodes = sorted(G.nodes, key=G.degree, reverse=True)[:50]
subG = G.subgraph(top_nodes)

# Matplotlib cannot interpret fraud-type names as colours, so map each type to
# an integer code that the colormap can translate.
fraud_types = sorted({str(subG.nodes[n]['type']) for n in subG.nodes})
type_code = {name: code for code, name in enumerate(fraud_types)}
node_colors = [type_code[str(subG.nodes[n]['type'])] for n in subG.nodes]
max_code = max(len(fraud_types) - 1, 1)  # avoid a zero-width colour range

fig, ax = plt.subplots(figsize=(12, 8))
# Fixed seed so the layout is reproducible across runs.
pos = nx.spring_layout(subG, k=1, iterations=50, seed=42)
nx.draw(subG, pos, ax=ax, node_color=node_colors, with_labels=False,
        node_size=300, cmap=plt.cm.Set1, vmin=0, vmax=max_code,
        edge_color='gray', alpha=0.7)

# Empty scatter handles give the legend one entry per fraud type, using the
# same code -> colour mapping as the nodes.
for name, code in type_code.items():
    ax.scatter([], [], color=plt.cm.Set1(code / max_code), label=name)
if type_code:
    ax.legend(title='Fraud type')

ax.set_title('Fraud Transaction Network (Top 50 Nodes)')
ax.set_axis_off()
plt.show()

In [None]:
# ============================================================
# ATTACK PATTERN SUMMARY
# ============================================================

# One row per fraud type; each output column is declared together with the
# source column and the aggregation that produces it.
fraud_rows = df_full[df_full['is_fraud']==1]
summary_spec = {
    'Attack Count': ('amount', 'count'),
    'Avg Amount': ('amount', 'mean'),
    'Median Amount': ('amount', 'median'),
    'Avg Amount Ratio': ('amount_ratio', 'mean'),
    'Avg 24h Velocity': ('tx_count_24h', 'mean'),
    'Country Change Rate': ('country_changed', 'mean'),
    'Device Change Rate': ('device_changed', 'mean'),
    'Impossible Travel Rate': ('quick_country_change', 'mean'),
    'Unique Victims': ('user_id', 'nunique'),
}
pattern_summary = fraud_rows.groupby('fraud_type').agg(**summary_spec).round(3)

print("ATO Attack Pattern Summary:")
display(pattern_summary)

# Persist the table for the executive summary
pattern_summary.to_csv('../data/processed/pattern_summary.csv')
print("Pattern summary saved")