In [23]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from collections import defaultdict
import os

# Fix both random sources: the simulation draws from the stdlib `random`
# module as well as NumPy's global generator, so each needs its own seed
# for the generated datasets to be reproducible.
random.seed(42)
np.random.seed(42)

# --- Dataset size ---
NUM_CARDS = 100       # number of synthetic cards
TXNS_PER_CARD = 500   # training transactions simulated per card

# --- Categorical values ---
CARD_BRANDS = ['VISA', 'MASTERCARD', 'AMEX', 'DISCOVER']
CURRENCIES = ['USD', 'EUR', 'GBP']

# --- First day of the training / test simulation windows ---
START_DATE = datetime(year=2024, month=1, day=1)
TEST_START_DATE = datetime(year=2025, month=1, day=1)

# --- Output locations ---
TRAIN_CSV = './SML/training_recovery_data.csv'
TEST_CSV = './SML/test_recovery_data.csv'

# Map every card id ('card_0', 'card_1', ...) to a brand. Cycling through
# CARD_BRANDS by index spreads the brands evenly over the cards.
cards = {
    f'card_{idx}': CARD_BRANDS[idx % len(CARD_BRANDS)]
    for idx in range(NUM_CARDS)
}

# Per-card sampling profile, created lazily on first access. A fresh profile
# starts with flat (all-ones) weights for the 7 weekdays and the 12 months.
card_profiles = defaultdict(
    lambda: dict(weekday_probs=np.ones(7), month_probs=np.ones(12))
)

# Helper to simulate transaction based on card profile
def simulate_transaction(card_number, card_brand, start_date, profile, success_prob=0.85):
    """Simulate a single transaction for one card.

    The transaction date is picked by rejection sampling: a day offset of
    0-364 from ``start_date`` is proposed uniformly, and the proposal is kept
    with probability proportional to the product of the profile's weight for
    that month and that weekday (scaled by the largest month and weekday
    weights so the most likely combination is always accepted).

    NOTE: because the offset never exceeds 364, a leap-year start such as
    2024-01-01 yields dates up to 2024-12-30 only.

    Args:
        card_number: Identifier stored in the record.
        card_brand: Brand stored in the record.
        start_date: ``datetime`` marking day 0 of the sampling window.
        profile: Mapping with ``'month_probs'`` (12 weights, January first)
            and ``'weekday_probs'`` (7 weights, Monday first).
        success_prob: Probability that the status is ``'success'``.

    Returns:
        A dict with the keys ``card_number``, ``card_brand``,
        ``currency_code``, ``transaction_amount``, ``txn_date`` and
        ``transaction_status``.
    """
    month_weights = profile['month_probs']
    weekday_weights = profile['weekday_probs']
    # The profile is not modified while sampling, so the peaks are constant.
    month_peak = np.max(month_weights)
    weekday_peak = np.max(weekday_weights)

    accepted = False
    while not accepted:
        txn_date = start_date + timedelta(days=random.randint(0, 364))
        weight = month_weights[txn_date.month - 1] * weekday_weights[txn_date.weekday()]
        acceptance = weight / month_peak / weekday_peak
        accepted = random.random() < min(acceptance, 1.0)

    # The draws below must stay in this order to keep seeded runs reproducible.
    currency = random.choice(CURRENCIES)
    amount = round(random.uniform(5.0, 1000.0), 2)
    outcomes = ['success', 'failure']
    status = np.random.choice(outcomes, p=[success_prob, 1 - success_prob])

    return dict(
        card_number=card_number,
        card_brand=card_brand,
        currency_code=currency,
        transaction_amount=amount,
        txn_date=txn_date.date(),
        transaction_status=status,
    )

# Generate training and test datasets, one card at a time.
train_data = []
test_data = []

for card_number, card_brand in cards.items():
    profile = card_profiles[card_number]

    # Start every card from the default success rate. Assigning it here
    # (instead of probing `locals()` for a leftover variable) guarantees that
    # a value set for a previous card, or left behind by an interrupted
    # earlier run of this cell, can never leak into this card.
    success_prob = 0.85

    # Inject a behaviour pattern chosen by the last digit of the card number.
    if card_number.endswith(('0', '1', '2')):
        # Weekend-heavy spending (index 0 = Monday ... 6 = Sunday).
        profile['weekday_probs'] = np.array([0.05, 0.05, 0.1, 0.1, 0.2, 0.25, 0.25])
    elif card_number.endswith(('3', '4', '5')):
        profile['month_probs'][10] *= 2  # Nov
        profile['month_probs'][11] *= 2  # Dec
    elif card_number.endswith(('6', '7')):
        # Half sine wave over the year: weights peak mid-year.
        profile['month_probs'] += np.sin(np.linspace(0, 3.14, 12))
    elif card_number.endswith(('8', '9')):
        # These cards fail more often than the rest.
        success_prob = 0.6

    # Generate training transactions from the injected profile.
    train_txns = [
        simulate_transaction(card_number, card_brand, START_DATE, profile, success_prob=success_prob)
        for _ in range(TXNS_PER_CARD)
    ]
    df_train_card = pd.DataFrame(train_txns)

    # Replace the injected profile with the empirical weekday / month
    # frequencies observed in this card's training transactions.
    train_dates = pd.to_datetime(df_train_card['txn_date'])
    weekdays = train_dates.dt.weekday.value_counts(normalize=True).sort_index()
    months = train_dates.dt.month.value_counts(normalize=True).sort_index()

    # Weekdays / months that never occurred fall back to a uniform share.
    profile['weekday_probs'] = weekdays.reindex(range(7), fill_value=1/7).values
    profile['month_probs'] = months.reindex(range(1, 13), fill_value=1/12).values

    train_data.extend(train_txns)

    # Generate test transactions (20% of the training volume) using the
    # learned profile and the same success rate.
    test_txns = [
        simulate_transaction(card_number, card_brand, TEST_START_DATE, profile, success_prob=success_prob)
        for _ in range(int(TXNS_PER_CARD * 0.2))
    ]
    test_data.extend(test_txns)

def _save_dataset(records, csv_path, label):
    """Sort transaction records, write them to ``csv_path`` and report the shape.

    Args:
        records: List of transaction dicts containing at least the
            ``card_number`` and ``txn_date`` keys.
        csv_path: Destination CSV file; an existing file is overwritten.
        label: Dataset name used in the printed confirmation message.

    Returns:
        The sorted DataFrame that was written.
    """
    frame = pd.DataFrame(records)
    # Group rows by card, chronologically ordered within each card.
    frame.sort_values(by=['card_number', 'txn_date'], inplace=True)

    # Create the output directory if needed; otherwise to_csv raises when the
    # directory is missing. No explicit delete of an old file is required
    # because to_csv opens the target in write mode and replaces its content.
    out_dir = os.path.dirname(csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    frame.to_csv(csv_path, index=False)

    print(f"{label} dataset saved as '{csv_path}' with shape:", frame.shape)
    return frame


# Final sort to ensure card-wise grouping with chronological order within
train_df = _save_dataset(train_data, TRAIN_CSV, 'Training')
test_df = _save_dataset(test_data, TEST_CSV, 'Test')

print(train_df.head())
print(test_df.head())

Training dataset saved as './SML/training_recovery_data.csv' with shape: (50000, 6)
Test dataset saved as './SML/test_recovery_data.csv' with shape: (10000, 6)
    card_number card_brand currency_code  transaction_amount    txn_date  \
188      card_0       VISA           GBP              779.42  2024-01-05   
394      card_0       VISA           GBP              111.76  2024-01-05   
21       card_0       VISA           USD              683.30  2024-01-06   
118      card_0       VISA           EUR              110.01  2024-01-07   
217      card_0       VISA           USD              131.12  2024-01-09   

    transaction_status  
188            success  
394            success  
21             success  
118            failure  
217            success  
   card_number card_brand currency_code  transaction_amount    txn_date  \
73      card_0       VISA           USD              248.33  2025-01-02   
47      card_0       VISA           EUR              818.02  2025-01-05   
75      