In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta
import uuid

# Initialize Faker and set seeds for reproducibility
fake = Faker()
Faker.seed(42)
np.random.seed(42)
random.seed(42)

print("Libraries loaded successfully!")

Libraries loaded successfully!


In [2]:
NUM_USERS = 1000
NUM_MERCHANTS = 100

# 1. Generate Users
users = []
for _ in range(NUM_USERS):
    users.append({
        'user_id': str(uuid.uuid4())[:8],
        'home_location': fake.city(),
        'device_id': str(uuid.uuid4())[:12]
    })
users_df = pd.DataFrame(users)

# 2. Generate Merchants
merchants = []
for _ in range(NUM_MERCHANTS):
    merchants.append({
        'merchant_id': str(uuid.uuid4())[:8],
        'merchant_location': fake.city()
    })
merchants_df = pd.DataFrame(merchants)

payment_methods = ['Credit Card', 'Debit Card', 'Digital Wallet', 'Bank Transfer']

print(f"Generated {len(users_df)} users and {len(merchants_df)} merchants.")

Generated 1000 users and 100 merchants.


In [3]:
NUM_NORMAL_TXNS = 10000

normal_transactions = []
current_time = datetime(2024, 1, 1, 8, 0, 0)

for _ in range(NUM_NORMAL_TXNS):
    user = random.choice(users)
    merchant = random.choice(merchants)
    
    # Normal behavior: Amounts mostly under $150, using home location and primary device
    amount = round(np.random.lognormal(mean=3.0, sigma=0.8), 2)
    
    # Increment time to keep it time-ordered
    current_time += timedelta(minutes=random.randint(1, 15))
    
    normal_transactions.append({
        'transaction_id': str(uuid.uuid4()),
        'user_id': user['user_id'],
        'merchant_id': merchant['merchant_id'],
        'amount': amount,
        'timestamp': current_time,
        'location': user['home_location'],
        'payment_method': random.choice(payment_methods),
        'device_id': user['device_id'],
        'is_fraud': 0
    })

df_normal = pd.DataFrame(normal_transactions)
print(f"Generated {len(df_normal)} normal transactions.")

Generated 10000 normal transactions.


In [4]:
fraud_transactions = []

# Pattern 1: Sudden Amount Spikes (150 transactions)
# Fraudsters making massive purchases on a compromised account
for _ in range(150):
    user = random.choice(users)
    merchant = random.choice(merchants)
    
    amount = round(random.uniform(1000.0, 5000.0), 2) # Abnormally high amount
    current_time += timedelta(minutes=random.randint(1, 5))
    
    fraud_transactions.append({
        'transaction_id': str(uuid.uuid4()),
        'user_id': user['user_id'],
        'merchant_id': merchant['merchant_id'],
        'amount': amount,
        'timestamp': current_time,
        'location': user['home_location'],
        'payment_method': random.choice(payment_methods),
        'device_id': user['device_id'],
        'is_fraud': 1
    })

# Pattern 2: Location Inconsistencies & Unrecognized Devices (150 transactions)
# Fraudsters using a different device in a completely different city
for _ in range(150):
    user = random.choice(users)
    merchant = random.choice(merchants)
    
    amount = round(random.uniform(50.0, 500.0), 2)
    current_time += timedelta(minutes=random.randint(1, 5))
    
    fraud_transactions.append({
        'transaction_id': str(uuid.uuid4()),
        'user_id': user['user_id'],
        'merchant_id': merchant['merchant_id'],
        'amount': amount,
        'timestamp': current_time,
        'location': fake.city(), # Random city, not their home_location
        'payment_method': random.choice(payment_methods),
        'device_id': str(uuid.uuid4())[:12], # Completely new device
        'is_fraud': 1
    })

df_fraud = pd.DataFrame(fraud_transactions)
print(f"Generated {len(df_fraud)} fraudulent transactions.")

Generated 300 fraudulent transactions.


In [5]:
# Combine datasets
df_final = pd.concat([df_normal, df_fraud], ignore_index=True)

# Sort strictly by timestamp to ensure realistic time-series data
df_final = df_final.sort_values(by='timestamp').reset_index(drop=True)

# Save to CSV
df_final.to_csv('synthetic_transactions.csv', index=False)

# Display a quick summary
print("Dataset finalized and saved as 'synthetic_transactions.csv'.")
print("-" * 30)
print(f"Total Transactions: {len(df_final)}")
print(f"Fraud Rate: {(df_final['is_fraud'].sum() / len(df_final)) * 100:.2f}%")
display(df_final.head())

Dataset finalized and saved as 'synthetic_transactions.csv'.
------------------------------
Total Transactions: 10300
Fraud Rate: 2.91%


Unnamed: 0,transaction_id,user_id,merchant_id,amount,timestamp,location,payment_method,device_id,is_fraud
0,bd63fb68-c73b-4c31-9538-92ce0b35bc3a,56b7f72e,f0e3c528,29.89,2024-01-01 08:01:00,New Danielside,Digital Wallet,532d1689-1f4,0
1,a1657ffe-d192-4af1-9106-6160845f2f50,c708809c,1b55d9d5,17.98,2024-01-01 08:04:00,Robertfort,Credit Card,3103e940-c64,0
2,ba1c58ba-3ff7-4b3a-bbdc-23b4eb9e416e,34f193a4,bf28de97,33.72,2024-01-01 08:19:00,West Nicholas,Credit Card,92c43622-c34,0
3,76d8bd13-ccaf-4ec5-8152-9ce83d584029,02f630d9,320e31c7,67.93,2024-01-01 08:20:00,Barrmouth,Credit Card,200d0f18-7a6,0
4,4c3c406e-b998-44bc-964e-574604e75e0b,3d330dd9,df9d39e9,16.65,2024-01-01 08:24:00,North Cynthiaview,Credit Card,77aa44a8-04c,0
