# Enhanced Realistic Transaction Data Generator
This notebook generates realistic synthetic transaction data for ML model development. Features:
- Card-type-dependent fraud logic
- ISO codes, customer info, fake merchant names
- Customizable fraud logic and periodic CSV appending

In [None]:
import pandas as pd
import numpy as np
import random
import string
import time
from datetime import datetime
import os

# CSV file that generated batches are written to.
output_file = 'enhanced_transactions.csv'

# Generator settings: batch size/timing, fraud tuning, and the value pools
# that transaction attributes are sampled from.
params = dict(
    num_rows=100,           # rows generated per batch
    interval_seconds=10,    # pause between batches, in seconds
    fraud_rate=0.03,        # baseline fraud probability before adjustments
    # NOTE(review): only 101 lies inside merchant_ids (100-119); 202 and 303
    # are never drawn, so they never trigger the high-risk adjustment.
    high_risk_merchants=[101, 202, 303],
    merchant_ids=list(range(100, 120)),
    merchant_names=[
        'Acme Electronics',
        'FastMart',
        'Skyline Fashion',
        'Globex Grocery',
        'Voyage Travel',
        'NextGen Gaming',
        'QuickFuel',
        'ClicknBuy',
        'UrbanStyle',
        'ZoomMobility',
    ],
    customer_segments=['standard', 'premium', 'vip'],
    channels=['online', 'instore'],
    currencies=['USD', 'EUR', 'GBP'],
    countries=['US', 'GB', 'DE', 'FR'],
    device_types=['mobile', 'desktop', 'tablet'],
    card_types=['debit', 'credit'],
    # (code, name) pairs expanded into {'code': ..., 'name': ...} records.
    merchant_categories=[
        {'code': code, 'name': name}
        for code, name in (
            (5411, 'Grocery Stores'),
            (5812, 'Restaurants'),
            (5732, 'Electronics Stores'),
            (5999, 'Retail – Misc'),
            (4111, 'Transportation'),
            (4812, 'Telecom Services'),
            (7011, 'Hotels'),
            (6011, 'ATM/Cash'),
        )
    ],
)

def random_string(length=8):
    """Return ``length`` random characters drawn from A-Z and 0-9.

    Characters are sampled with replacement via the ``random`` module.
    """
    alphabet = string.ascii_uppercase + string.digits
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)

def generate_transaction(i):
    """Build one synthetic transaction record as a flat dict.

    Each attribute is drawn independently from the pools in the module-level
    ``params`` dict. The ``fraud`` label is then sampled from a probability
    that starts at ``params['fraud_rate']`` and is raised by risk signals.

    Args:
        i: Index of the row within the current batch; used as the suffix of
            ``transaction_id``.

    Returns:
        dict: One row keyed by column name, including the 0/1 ``fraud`` label.
    """
    merchant_id = random.choice(params['merchant_ids'])
    merchant_name = random.choice(params['merchant_names'])
    amount = np.random.exponential(scale=200)
    channel = random.choice(params['channels'])
    country = random.choice(params['countries'])
    currency = random.choice(params['currencies'])
    device = random.choice(params['device_types'])
    card = random.choice(params['card_types'])
    segment = random.choice(params['customer_segments'])
    # Pick one {'code', 'name'} category record; the original body had the
    # category list literal pasted here, so ``mcc`` was never assigned.
    mcc = random.choice(params['merchant_categories'])
    customer_id = f"C{random.randint(10000, 99999)}"
    customer_name = f"{random.choice(['Alice', 'Bob', 'Charlie', 'Diana', 'Evan'])} {random.choice(['Smith', 'Brown', 'Taylor', 'Lee'])}"
    # Email is "<lower-cased first name>.<1-999>@example.com".
    customer_email = f"{customer_name.split()[0].lower()}.{random.randint(1,999)}@example.com"
    attempts = random.randint(1, 6)

    # Fraud logic: start from the baseline rate and add a fixed increment for
    # each risk signal that applies.
    fraud_prob = params['fraud_rate']
    if merchant_id in params['high_risk_merchants']:
        fraud_prob += 0.04
    if amount > 1000:
        fraud_prob += 0.02
    if card == 'debit':
        fraud_prob += 0.03
    if attempts > 3:
        fraud_prob += 0.03

    return {
        # Epoch seconds plus the batch index.
        'transaction_id': f"T{int(time.time())}_{i}",
        'timestamp': datetime.now().isoformat(),
        'customer_id': customer_id,
        'customer_name': customer_name,
        'customer_email': customer_email,
        'merchant_id': merchant_id,
        'merchant_name': merchant_name,
        'merchant_category_code': mcc['code'],
        'merchant_category_name': mcc['name'],
        'amount': round(amount, 2),
        'currency': currency,
        'channel': channel,
        'country_code': country,
        'device_type': device,
        'card_type': card,
        'customer_segment': segment,
        'attempts': attempts,
        # Probability is capped at 0.99 before the Bernoulli draw.
        'fraud': int(random.random() < min(fraud_prob, 0.99))
    }


In [None]:
# Data generation loop: runs until interrupted, emitting one batch per cycle.
while True:
    batch = list(map(generate_transaction, range(params['num_rows'])))
    df = pd.DataFrame(batch)
    # The first write creates the file with a header row; later writes append
    # data rows only.
    file_exists = os.path.exists(output_file)
    df.to_csv(
        output_file,
        mode='a' if file_exists else 'w',
        header=not file_exists,
        index=False,
    )
    print(f"Appended {params['num_rows']} rows to {output_file}")
    time.sleep(params['interval_seconds'])
