In [None]:
# ============================================================
# NOTEBOOK 1: DATA SIMULATION - ATO Risk Profiler
# ============================================================
# Goal: Generate 100k+ realistic transactions containing:
#   - Legitimate users with normal behavior patterns
#   - Compromised accounts (ATO) with anomalous behavior
# ============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
import os

# --- Plot styling -------------------------------------------------
sns.set_theme(style="whitegrid")

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# --- Reproducibility ----------------------------------------------
# All simulation cells draw from NumPy's global RNG, so seeding it here
# fixes the generated dataset for a top-to-bottom run.
np.random.seed(42)

# --- DataFrame rendering ------------------------------------------
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)  # never truncate columns

print("Environment set up correctly")
print(f"Generation Timestamp: {datetime.now():%Y-%m-%d %H:%M:%S}")

In [None]:
# ============================================================
# SIMULATION PARAMETERS
# ============================================================
# Every tunable of the simulation lives in this cell; later cells only
# read these names.

# --- Dataset size and time window ---
N_USERS = 10_000           # number of user profiles to create
N_TRANSACTIONS = 100_000   # total rows (fraudulent + legitimate)
FRAUD_RATE = 0.03          # share of N_TRANSACTIONS generated as ATO fraud
DATE_RANGE_DAYS = 90       # width of the timestamp window, in days

# --- ATO attack mix: attack name -> sampling probability ---
FRAUD_TYPES = dict(
    credential_stuffing=0.40,
    sim_swap=0.25,
    keylogger=0.20,
    brute_force=0.15,
)

# --- Simulated geographic and temporal data ---
COUNTRIES = [
    'DE', 'US', 'MX', 'AR', 'BR',
    'ES', 'IT', 'UK', 'FR', 'CA',
]
# NOTE(review): TIMEZONES is not referenced by any cell visible here.
TIMEZONES = [
    -8, -7, -6, -5, -4, -3,
    0, 1, 2,
]

summary_lines = [
    "Simulation Parameters:",
    f"   - Users: {N_USERS:,}",
    f"   - Transactions: {N_TRANSACTIONS:,}",
    f"   - Target Fraud Rate: {FRAUD_RATE*100}%",
    f"   - ATO Types: {list(FRAUD_TYPES)}",
]
print("\n".join(summary_lines))

In [None]:
def generate_legitimate_user(user_id):
    """
    Build the baseline behaviour profile for one user.

    Args:
        user_id: Identifier stored verbatim in the profile.

    Returns:
        dict with keys 'user_id', 'country', 'preferred_device',
        'preferred_hour', 'avg_amount', 'tx_frequency' and
        'is_compromised' (always False here).

    All values are drawn from NumPy's global RNG. The draws below are kept
    in a fixed order (country, device, hour, amount, frequency) so that a
    seeded run always produces the same profiles.
    """
    device_options = ['mobile', 'desktop', 'tablet']
    device_weights = [0.6, 0.35, 0.05]

    home_country = np.random.choice(COUNTRIES)
    usual_device = np.random.choice(device_options, p=device_weights)
    # Normal draw centred on 14 (std 4), truncated to an int and wrapped
    # into the 0-23 range.
    usual_hour = int(np.random.normal(14, 4)) % 24
    typical_amount = np.random.lognormal(3.5, 1.0)
    frequency = np.random.poisson(3)

    profile = {}
    profile['user_id'] = user_id
    profile['country'] = home_country
    profile['preferred_device'] = usual_device
    profile['preferred_hour'] = usual_hour
    profile['avg_amount'] = typical_amount
    profile['tx_frequency'] = frequency
    profile['is_compromised'] = False
    return profile

# Build the user base: one profile per id in 0..N_USERS-1, created in
# ascending id order and keyed by the profile's own user_id.
users_db = {
    profile['user_id']: profile
    for profile in map(generate_legitimate_user, range(N_USERS))
}

# Tabular copy of the profiles, used only for the preview below
users_df = pd.DataFrame(list(users_db.values()))
print(f"{len(users_df)} users generated.")
display(users_df.head(3))

In [None]:
def generate_transaction(tx_id, user, is_fraud=False, fraud_type=None):
    """
    Build one transaction record from a user's profile.

    Args:
        tx_id: Integer used to format the 'TX-xxxxxxxx' transaction id.
        user: Profile dict; reads 'user_id', 'avg_amount',
            'preferred_device', 'country' and (keylogger only)
            'preferred_hour'.
        is_fraud: When truthy, the attack named by `fraud_type` overrides
            parts of the baseline behaviour.
        fraud_type: One of 'credential_stuffing', 'sim_swap',
            'brute_force', 'keylogger'. Ignored when `is_fraud` is falsy.
            Any other value with `is_fraud` truthy leaves the baseline
            values untouched but the row is still labelled as fraud.

    Returns:
        dict with keys 'transaction_id', 'user_id', 'timestamp', 'amount',
        'currency', 'merchant_country', 'device_type', 'is_fraud' (1/0)
        and 'fraud_type' (the string 'None' for legitimate rows).

    Random values come from NumPy's global RNG. The order of the draws is
    part of the behaviour: the baseline amount factor is always drawn,
    even when an attack branch later replaces the amount.
    """
    typical_amount = user['avg_amount']
    attack = fraud_type if is_fraud else None

    # Uniformly random instant inside the DATE_RANGE_DAYS window that
    # starts on 2024-01-01.
    offset_seconds = np.random.randint(0, DATE_RANGE_DAYS * 86400)
    timestamp = datetime(2024, 1, 1) + timedelta(seconds=offset_seconds)

    # Baseline (legitimate) behaviour: +/-20% around the user's average,
    # from the usual device and home country.
    amount = typical_amount * np.random.uniform(0.8, 1.2)
    device = user['preferred_device']
    country = user['country']

    # Attack-specific overrides
    if attack == 'credential_stuffing':
        # Inflated amount from a randomly picked desktop/mobile device
        amount = typical_amount * np.random.uniform(2, 5)
        device = np.random.choice(['desktop', 'mobile'])
    elif attack == 'sim_swap':
        # Mobile transaction from any country other than the user's own
        foreign_countries = [code for code in COUNTRIES if code != user['country']]
        country = np.random.choice(foreign_countries)
        device = 'mobile'
        amount = typical_amount * np.random.uniform(1.5, 3)
    elif attack == 'brute_force':
        # Small amount, independent of the user's average
        amount = np.random.uniform(1, 10)
    elif attack == 'keylogger':
        # Move the transaction 12 hours away from the user's preferred hour
        timestamp = timestamp.replace(hour=(user['preferred_hour'] + 12) % 24)

    # Floor at 0.01 and keep two decimals
    amount = round(max(0.01, amount), 2)

    record = {
        'transaction_id': f"TX-{tx_id:08d}",
        'user_id': user['user_id'],
        'timestamp': timestamp,
        'amount': amount,
        'currency': 'EUR',
        'merchant_country': country,
        'device_type': device,
        'is_fraud': 1 if is_fraud else 0,
        'fraud_type': attack if is_fraud else 'None',
    }
    return record

print("Transaction generation function compiled.")

In [None]:
# Accumulates one dict per transaction; converted to a DataFrame once at the end
transactions = []

# Loop-invariant lookups, built once instead of on every iteration.
# np.random.choice converts a list argument to an array on each call, so
# passing the prepared array/lists yields exactly the same draws.
all_user_ids = np.array(list(users_db.keys()))
fraud_type_names = list(FRAUD_TYPES.keys())
fraud_type_probs = list(FRAUD_TYPES.values())

# 1. Select Victims (Compromised Users)
COMPROMISED_USER_RATE = 0.05  # share of users whose account gets taken over
# round() rather than int(): a float product such as 0.29 * 100 evaluates to
# 28.999999999999996 and int() would silently drop one unit.
n_compromised = round(N_USERS * COMPROMISED_USER_RATE)
compromised_users = np.random.choice(all_user_ids, size=n_compromised, replace=False)
# NOTE(review): the 'is_compromised' flag of the selected profiles in
# users_db is left untouched here - confirm whether downstream code expects
# it to be set to True for these users.

# 2. Generate Fraudulent Transactions (ATO)
# The per-iteration draws are deliberately not vectorised: they interleave
# with the draws made inside generate_transaction, and batching them would
# change the seeded output.
n_frauds = round(N_TRANSACTIONS * FRAUD_RATE)
for i in range(n_frauds):
    user_id = np.random.choice(compromised_users)
    f_type = np.random.choice(fraud_type_names, p=fraud_type_probs)

    tx = generate_transaction(i, users_db[user_id], is_fraud=True, fraud_type=f_type)
    transactions.append(tx)

# 3. Generate Legitimate Transactions
# Any user may be drawn here, including the compromised ones.
n_legit = N_TRANSACTIONS - n_frauds
for i in range(n_frauds, n_frauds + n_legit):
    user_id = np.random.choice(all_user_ids)
    tx = generate_transaction(i, users_db[user_id], is_fraud=False)
    transactions.append(tx)

# Create DataFrame and sort by date
df = pd.DataFrame(transactions)
df = df.sort_values('timestamp').reset_index(drop=True)

# Invariants of the generation above
assert len(df) == N_TRANSACTIONS, "unexpected number of transactions"
assert int(df['is_fraud'].sum()) == n_frauds, "fraud count does not match target"
assert df['transaction_id'].is_unique, "duplicate transaction ids"

print("Dataset Generated Successfully!")
print(f"   Dimensions: {df.shape}")
print(f"   Date Range: {df['timestamp'].min()} to {df['timestamp'].max()}")

In [None]:
# Quick validation: class balance and attack-type mix of the generated data
print("Fraud Distribution:")
print(df['is_fraud'].value_counts(normalize=True))

print("\nATO Attack Types:")
# .loc selects rows and column in one step instead of chained indexing
print(df.loc[df['is_fraud'] == 1, 'fraud_type'].value_counts())

# Save data. The path is relative to the notebook's working directory and is
# defined once so the write and the message below cannot drift apart.
output_dir = '../data'
output_path = os.path.join(output_dir, 'simulated_transactions.csv')
os.makedirs(output_dir, exist_ok=True)
# NOTE(review): legitimate rows carry the literal string 'None' in
# 'fraud_type'. pandas.read_csv lists "None" among its default NA markers in
# recent versions, so a reader may get NaN back unless it passes
# keep_default_na=False - confirm against the consuming notebook.
df.to_csv(output_path, index=False)

print(f"\nFile saved: '{output_path}'")