In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random, os, zipfile

# ============================================================
# CONFIG
# ============================================================
# Reproducibility: the stdlib and NumPy global generators get the same seed.
random.seed(42)
np.random.seed(42)

# Sizes of the master tables and of the booking fact table.
N_AGENCIES = 200
N_USERS = 800
N_DEVICES = 500
N_IPS = 600
N_BOOKINGS = 8_000

# Fraud volume: number of users flagged as fraud, and number of bookings
# that receive the fraud label.
N_FRAUD_USERS = 40
TARGET_FRAUD_BOOKINGS = 720

# Timestamps in the generated tables are built as offsets from this instant.
START_DATE = datetime(year=2025, month=1, day=1)

# Output folder (created right away if missing) and the archive built from it.
OUT_DIR = "b2b_travel_userfraud_synth_v6_balanced_sticky"
ZIP_NAME = OUT_DIR + ".zip"
os.makedirs(OUT_DIR, exist_ok=True)

# ============================================================
# 1) AGENCY MASTER
# ============================================================
agency_ids = [f"A{n:04d}" for n in range(1, N_AGENCIES + 1)]

# Each column is drawn on its own line, in the same order as the output
# columns, so the seeded random stream is consumed in a fixed sequence.
agency_country = np.random.choice(
    ["IN", "AE", "SG", "UK", "US"],
    size=N_AGENCIES,
    p=[0.45, 0.15, 0.10, 0.15, 0.15],
)
agency_age_days = np.random.randint(30, 3000, size=N_AGENCIES)
agency_kyc_status = np.random.choice(
    ["verified", "pending", "failed"],
    size=N_AGENCIES,
    p=[0.85, 0.12, 0.03],
)
agency_credit_limit = np.random.choice(
    [5000, 10000, 25000, 50000, 100000, 200000, 500000],
    size=N_AGENCIES,
    p=[0.20, 0.20, 0.20, 0.15, 0.15, 0.07, 0.03],
)
agency_status = np.random.choice(
    ["active", "suspended"], size=N_AGENCIES, p=[0.97, 0.03]
)

agency_master = pd.DataFrame({
    "agency_id": agency_ids,
    "country": agency_country,
    "agency_age_days": agency_age_days,
    "kyc_status": agency_kyc_status,
    "credit_limit": agency_credit_limit,
    "status": agency_status,
})

# ============================================================
# 2) USER MASTER (balanced fraud user types)
# ============================================================
user_ids = [f"U{n:05d}" for n in range(1, N_USERS + 1)]
# Every user is attached to one agency, drawn uniformly with replacement.
user_agencies = np.random.choice(agency_ids, size=N_USERS)

# Distinct users that will carry the fraud label.
fraud_user_ids = np.random.choice(user_ids, size=N_FRAUD_USERS, replace=False)
fraud_user_set = set(fraud_user_ids)

# Fixed quota of fraud users per fraud type.
fraud_type_counts = dict(
    account_takeover=7,
    ring_operator=7,
    bot_booking=7,
    cancellation_abuser=6,
    credit_bustout_user=7,
    new_synthetic_user=6,
)
# The quotas must add up to the number of fraud users, with at least
# three users of every type.
assert sum(fraud_type_counts.values()) == N_FRAUD_USERS
assert min(fraud_type_counts.values()) >= 3

# Expand the quotas into one label per fraud user, shuffle (stdlib RNG),
# then pair the labels with the sampled fraud user ids.
balanced_fraud_types = [
    type_name
    for type_name, quota in fraud_type_counts.items()
    for _ in range(quota)
]
random.shuffle(balanced_fraud_types)
fraud_type_map = {
    fraud_uid: type_name
    for fraud_uid, type_name in zip(fraud_user_ids, balanced_fraud_types)
}

roles = np.random.choice(["agent", "admin", "finance"], size=N_USERS, p=[0.78, 0.12, 0.10])

# Base values (you said user table was perfect, keeping same style)
user_age_days = np.random.randint(30, 2500, size=N_USERS)
avg_logins = np.random.normal(6, 3.5, size=N_USERS).clip(0, 60)
failed_login_ratio = np.random.beta(1.5, 40, size=N_USERS).clip(0, 1)
account_status = np.random.choice(["active", "locked"], size=N_USERS, p=[0.985, 0.015])

# Fraud signals in user table
# Per fraud type: (failed_login_ratio range, avg_logins range). Both values
# are redrawn uniformly from these ranges, failed-login ratio first.
fraud_signal_ranges = {
    "account_takeover": ((0.18, 0.55), (10, 25)),
    "bot_booking": ((0.05, 0.20), (25, 60)),
    "ring_operator": ((0.03, 0.14), (12, 28)),
    "cancellation_abuser": ((0.01, 0.10), (8, 18)),
    "credit_bustout_user": ((0.01, 0.08), (10, 22)),
    "new_synthetic_user": ((0.02, 0.18), (12, 40)),
}

for pos, uid in enumerate(user_ids):
    if uid not in fraud_user_set:
        continue

    ftype = fraud_type_map[uid]

    # Synthetic users additionally get a very young account; this draw
    # happens before the two shared signal draws.
    if ftype == "new_synthetic_user":
        user_age_days[pos] = np.random.randint(3, 40)

    signal_ranges = fraud_signal_ranges.get(ftype)
    if signal_ranges is not None:
        (ratio_lo, ratio_hi), (logins_lo, logins_hi) = signal_ranges
        failed_login_ratio[pos] = np.random.uniform(ratio_lo, ratio_hi)
        avg_logins[pos] = np.random.uniform(logins_lo, logins_hi)

    # Any fraud user has a 5% chance of being force-locked.
    if np.random.rand() < 0.05:
        account_status[pos] = "locked"

# Ground-truth columns: 1/0 fraud flag and the fraud type ("legit" for
# every user outside the fraud set).
user_fraud_label = [int(u in fraud_user_set) for u in user_ids]
user_fraud_type = [fraud_type_map.get(u, "legit") if u in fraud_user_set else "legit"
                   for u in user_ids]

user_master = pd.DataFrame({
    "user_id": user_ids,
    "agency_id": user_agencies,
    "role": roles,
    "user_age_days": user_age_days,
    "avg_logins_per_day": avg_logins.round(2),
    "failed_login_ratio": failed_login_ratio.round(4),
    "account_status": account_status,
    "user_fraud_label": user_fraud_label,
    "user_fraud_type": user_fraud_type,
})

# Lookup used later to stamp each booking with its user's agency.
user_to_agency = dict(
    zip(user_master["user_id"].tolist(), user_master["agency_id"].tolist())
)

# ============================================================
# 3) DEVICE MASTER
# ============================================================
device_ids = [f"D{str(i).zfill(5)}" for i in range(1, N_DEVICES + 1)]
# Columns are drawn in dict order, so the seeded stream is consumed as:
# device_type, os, browser, first_seen_ts.
device_master = pd.DataFrame({
    "device_id": device_ids,
    "device_type": np.random.choice(["desktop", "mobile"], size=N_DEVICES, p=[0.7, 0.3]),
    "os": np.random.choice(["Windows", "macOS", "Android", "iOS"], size=N_DEVICES,
                           p=[0.45, 0.20, 0.25, 0.10]),
    "browser": np.random.choice(["Chrome", "Safari", "Edge", "Firefox"], size=N_DEVICES,
                                p=[0.65, 0.12, 0.15, 0.08]),
    # First-seen date: START_DATE plus a whole number of days in [0, 365).
    "first_seen_ts": [START_DATE + timedelta(days=int(x)) for x in np.random.randint(0, 365, size=N_DEVICES)]
})

# 20 distinct devices form the shared fraud pool; the rest are "normal".
fraud_device_pool = np.random.choice(device_ids, size=20, replace=False)
# Build the membership set once instead of once per element of the
# comprehension below.
fraud_device_set = set(fraud_device_pool)
normal_device_pool = [d for d in device_ids if d not in fraud_device_set]

# ============================================================
# 4) IP MASTER
# ============================================================
# Surrogate keys "IP00001" .. "IP<N_IPS>", zero-padded to five digits.
ip_ids = [f"IP{n:05d}" for n in range(1, N_IPS + 1)]

def random_ip():
    """Return a dotted-quad string of four random octets.

    Each octet is drawn from NumPy's global generator in the range
    1..254 (the upper bound of ``randint`` is exclusive).
    """
    octets = np.random.randint(1, 255, size=4)
    return ".".join(str(octet) for octet in octets)

# Columns are drawn in dict order, so the seeded stream is consumed as:
# ip_address, country, asn, is_proxy, first_seen_ts.
ip_master = pd.DataFrame({
    "ip_id": ip_ids,
    "ip_address": [random_ip() for _ in range(N_IPS)],
    "country": np.random.choice(["IN", "AE", "SG", "UK", "US", "RO", "RU"], size=N_IPS,
                                p=[0.40, 0.10, 0.10, 0.15, 0.15, 0.05, 0.05]),
    "asn": np.random.randint(1000, 99999, size=N_IPS),
    "is_proxy": np.random.choice([0, 1], size=N_IPS, p=[0.93, 0.07]),
    # First-seen date: START_DATE plus a whole number of days in [0, 365).
    "first_seen_ts": [START_DATE + timedelta(days=int(x)) for x in np.random.randint(0, 365, size=N_IPS)]
})

# 24 distinct IPs form the shared fraud pool; the rest are "normal".
fraud_ip_pool = np.random.choice(ip_ids, size=24, replace=False)
# Build the membership set once instead of once per element of the
# comprehension below.
fraud_ip_set = set(fraud_ip_pool)
normal_ip_pool = [i for i in ip_ids if i not in fraud_ip_set]
# Fraud-pool IPs get their proxy flag redrawn with a 55% proxy rate
# (versus 7% for the initial draw above).
ip_master.loc[ip_master["ip_id"].isin(fraud_ip_pool), "is_proxy"] = np.random.choice(
    [0, 1], size=len(fraud_ip_pool), p=[0.45, 0.55]
)

# ============================================================
# 5) BOOKINGS (choose fraud bookings + sticky IP/Device assignment)
# ============================================================
booking_ids = [f"B{n:06d}" for n in range(1, N_BOOKINGS + 1)]

# Booking timestamps: START_DATE plus a whole number of minutes within
# a 365-day window.
minute_offsets = np.random.randint(0, 365 * 24 * 60, size=N_BOOKINGS)
booking_ts = [START_DATE + timedelta(minutes=int(m)) for m in minute_offsets]

product_type = np.random.choice(["flight", "hotel", "package"], size=N_BOOKINGS, p=[0.6, 0.3, 0.1])
route_type = np.random.choice(["domestic", "international"], size=N_BOOKINGS, p=[0.65, 0.35])
origin_country = np.random.choice(
    ["IN", "AE", "SG", "UK", "US"],
    size=N_BOOKINGS,
    p=[0.45, 0.15, 0.10, 0.15, 0.15],
)

is_domestic = route_type == "domestic"

# A candidate destination is drawn for every booking; domestic rows then
# reuse their origin instead.
# NOTE(review): the candidate list contains every origin code and nothing
# excludes the origin, so an "international" row can end up with
# dest_country == origin_country.
international_dest = np.random.choice(
    ["IN", "AE", "SG", "UK", "US", "TH", "FR", "DE"], size=N_BOOKINGS
)
dest_country = np.where(is_domestic, origin_country, international_dest)

# Lead times: 1-19 days for domestic, 2-89 days for international. Both
# candidate arrays are drawn in full (domestic first) before selecting.
domestic_lead = np.random.randint(1, 20, size=N_BOOKINGS)
international_lead = np.random.randint(2, 90, size=N_BOOKINGS)
lead_time_days = np.where(is_domestic, domestic_lead, international_lead)

def gen_value(rt):
    """Draw one booking value for route type ``rt``.

    "domestic" uses a log-normal with mean 5.3 / sigma 0.55 clipped to
    [40, 12000]; any other value of ``rt`` uses mean 7.2 / sigma 0.65
    clipped to [200, 25000]. The result is a plain Python float.
    """
    if rt == "domestic":
        mu, spread, floor, ceiling = 5.3, 0.55, 40, 12000
    else:
        mu, spread, floor, ceiling = 7.2, 0.65, 200, 25000

    draw = np.random.lognormal(mean=mu, sigma=spread)
    return float(np.clip(draw, floor, ceiling))

# One value per booking, drawn row by row in booking order according to
# the row's route type.
booking_value = np.array(list(map(gen_value, route_type)))

# Party size 1-10, heavily skewed towards small groups.
passengers_count = np.random.choice(
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    size=N_BOOKINGS,
    p=[0.35, 0.25, 0.15, 0.08, 0.05, 0.04, 0.03, 0.025, 0.015, 0.01],
)
payment_method = np.random.choice(
    ["credit_line", "card", "bank"], size=N_BOOKINGS, p=[0.75, 0.18, 0.07]
)
booking_status = np.random.choice(
    ["confirmed", "pending", "failed"], size=N_BOOKINGS, p=[0.92, 0.06, 0.02]
)

# assign which bookings are fraud bookings
# (distinct row positions, sampled without replacement)
fraud_booking_indices = np.random.choice(np.arange(N_BOOKINGS), size=TARGET_FRAUD_BOOKINGS, replace=False)
fraud_booking_set = set(fraud_booking_indices)

# pick which fraud users actually get bookings (some fraud users can have 0 bookings)
fraud_users_list = list(fraud_user_ids)
# At most 32 fraud users are active. The cap prevents a ValueError from
# sampling without replacement when fewer than 32 fraud users exist.
n_active_fraud_users = min(32, len(fraud_users_list))
fraud_users_for_bookings = np.random.choice(fraud_users_list, size=n_active_fraud_users, replace=False)

legit_users_list = [u for u in user_ids if u not in fraud_user_set]

# Per-booking output columns, filled in by the booking loop further down.
user_id_col = np.empty(N_BOOKINGS, dtype=object)
agency_id_col = np.empty(N_BOOKINGS, dtype=object)
device_id_col = np.empty(N_BOOKINGS, dtype=object)
ip_id_col = np.empty(N_BOOKINGS, dtype=object)

# ----------------------------------------------------------
# Sticky infra pools per user (THIS IS THE FIX)
# ----------------------------------------------------------
# user id -> list of IP ids / device ids that the user keeps reusing.
user_sticky_ips = {}
user_sticky_devices = {}

# GOOD users: 1 stable device + 1-2 stable IPs
# (all drawn from the non-fraud pools)
for u in legit_users_list:
    user_sticky_devices[u] = [np.random.choice(normal_device_pool)]
    k_ips = np.random.choice([1, 2], p=[0.85, 0.15])
    user_sticky_ips[u] = list(np.random.choice(normal_ip_pool, size=k_ips, replace=False))

# FRAUD users: variety depends on fraud type
for u in fraud_users_list:
    # Direct dict lookup; every fraud user id is a key of fraud_type_map,
    # so there is no need to scan user_master for each user.
    utype = fraud_type_map[u]

    # Pool sizes: device count is always drawn before IP count.
    if utype in ("ring_operator", "bot_booking"):
        k_dev = np.random.choice([2, 3, 4], p=[0.30, 0.50, 0.20])
        k_ip  = np.random.choice([3, 4, 5], p=[0.30, 0.50, 0.20])
    elif utype in ("account_takeover", "new_synthetic_user"):
        k_dev = np.random.choice([2, 3], p=[0.60, 0.40])
        k_ip  = np.random.choice([2, 3, 4], p=[0.40, 0.40, 0.20])
    elif utype == "cancellation_abuser":
        k_dev = np.random.choice([1, 2], p=[0.70, 0.30])
        k_ip  = np.random.choice([1, 2], p=[0.70, 0.30])
    elif utype == "credit_bustout_user":
        k_dev = np.random.choice([1, 2], p=[0.60, 0.40])
        k_ip  = np.random.choice([2, 3], p=[0.60, 0.40])
    else:
        k_dev, k_ip = 2, 2

    # Fraud users sample from the full id lists, i.e. both the normal
    # and the shared fraud pools.
    user_sticky_devices[u] = list(np.random.choice(device_ids, size=k_dev, replace=False))
    user_sticky_ips[u] = list(np.random.choice(ip_ids, size=k_ip, replace=False))


# ----------------------------------------------------------
# Booking loop: assign user + sticky infra
# ----------------------------------------------------------
# Probability that a fraud booking of the given type uses the shared
# fraud device / IP pools instead of the user's own sticky pool. Fraud
# types not listed here always use their own sticky pool.
shared_infra_prob = {
    "ring_operator": 0.65,
    "bot_booking": 0.65,
    "account_takeover": 0.55,
    "new_synthetic_user": 0.55,
}

for i in range(N_BOOKINGS):

    # user assignment: fraud-flagged rows go to an active fraud user,
    # every other row to a legit user
    if i in fraud_booking_set:
        u = np.random.choice(fraud_users_for_bookings)
    else:
        u = np.random.choice(legit_users_list)

    user_id_col[i] = u
    agency_id_col[i] = user_to_agency[u]

    # Dict lookup replaces a full scan of user_master on every iteration;
    # users missing from fraud_type_map are "legit", as in user_master.
    utype = fraud_type_map.get(u, "legit")

    # FRAUD users: mix shared fraud infra and own pool
    if u in fraud_user_set:
        p_shared = shared_infra_prob.get(utype)

        if p_shared is not None:
            # Device and IP are decided independently. Each coin flip is
            # drawn before the pick it gates, device first, then IP.
            device_id_col[i] = np.random.choice(fraud_device_pool) if np.random.rand() < p_shared else np.random.choice(user_sticky_devices[u])
            ip_id_col[i]     = np.random.choice(fraud_ip_pool)     if np.random.rand() < p_shared else np.random.choice(user_sticky_ips[u])
        else:
            device_id_col[i] = np.random.choice(user_sticky_devices[u])
            ip_id_col[i]     = np.random.choice(user_sticky_ips[u])

    # GOOD users: stable
    else:
        device_id_col[i] = user_sticky_devices[u][0]
        ip_id_col[i] = np.random.choice(user_sticky_ips[u])
        # 1% of legit bookings land on a shared fraud IP (noise).
        if np.random.rand() < 0.01:
            ip_id_col[i] = np.random.choice(fraud_ip_pool)

    # HARD safety fallback (prevents NaN/None)
    if (ip_id_col[i] is None) or (pd.isna(ip_id_col[i])):
        ip_id_col[i] = np.random.choice(normal_ip_pool)
    if (device_id_col[i] is None) or (pd.isna(device_id_col[i])):
        device_id_col[i] = np.random.choice(normal_device_pool)

# Assemble the booking fact table; dict order defines the column order.
booking_columns = {
    "booking_id": booking_ids,
    "booking_ts": booking_ts,
    "agency_id": agency_id_col,
    "user_id": user_id_col,
    "device_id": device_id_col,
    "ip_id": ip_id_col,
    "product_type": product_type,
    "route_type": route_type,
    "origin_country": origin_country,
    "dest_country": dest_country,
    "lead_time_days": lead_time_days.astype(int),
    "booking_value": booking_value.round(2),
    "passengers_count": passengers_count.astype(int),
    "payment_method": payment_method,
    "booking_status": booking_status,
}
booking_fact = pd.DataFrame(booking_columns)

# ============================================================
# 6) POST BOOKING EVENTS (same logic as before)
# ============================================================
# Builds one cancellation / dispute outcome row per booking. For two fraud
# types it also rewrites fields of booking_fact in place.
post_events = []

# Snapshot the three columns read per row, which avoids iterrows() (one
# Series built per row). Later rows are never modified before they are
# read, so the snapshot matches what iterrows() would have yielded.
event_index = booking_fact.index.tolist()
event_booking_ids = booking_fact["booking_id"].tolist()
event_user_ids = booking_fact["user_id"].tolist()
event_values = booking_fact["booking_value"].tolist()

for idx, booking_id, uid, val in zip(event_index, event_booking_ids, event_user_ids, event_values):
    val = float(val)
    is_fraud_booking = idx in fraud_booking_set
    # Dict lookup replaces a full scan of user_master for every booking;
    # users missing from fraud_type_map are "legit", as in user_master.
    utype = fraud_type_map.get(uid, "legit")

    # Base legit
    cancel_p = 0.06
    dispute_p = 0.004

    # Fraud baseline
    if is_fraud_booking:
        cancel_p = 0.12
        dispute_p = 0.03

    # Cancellation abuser
    if utype == "cancellation_abuser":
        cancel_p = 0.70 if not is_fraud_booking else 0.88
        dispute_p = 0.002

    # Credit bustout: high value international + big disputes
    if utype == "credit_bustout_user" and is_fraud_booking:
        cancel_p = 0.07
        dispute_p = 0.18

        # NOTE(review): route_type is forced to "international" but
        # dest_country is left untouched, so a row that started out
        # domestic keeps dest_country == origin_country.
        booking_fact.loc[idx, "route_type"] = "international"
        booking_fact.loc[idx, "lead_time_days"] = np.random.randint(0, 2)
        booking_fact.loc[idx, "booking_value"] = round(float(np.clip(np.random.normal(18000, 4000), 5000, 25000)), 2)
        booking_fact.loc[idx, "payment_method"] = "credit_line"
        # Chargebacks below must be based on the rewritten value.
        val = float(booking_fact.loc[idx, "booking_value"])

    # Synthetic user: fast lead + moderate disputes
    if utype == "new_synthetic_user" and is_fraud_booking:
        cancel_p = 0.14
        dispute_p = 0.06
        booking_fact.loc[idx, "lead_time_days"] = np.random.randint(0, 4)

    is_cancelled = np.random.rand() < cancel_p
    # A cancelled booking is never disputed; the short-circuit also means
    # no dispute draw is consumed for cancelled bookings.
    is_disputed = (not is_cancelled) and (np.random.rand() < dispute_p)

    cancel_delay = int(np.random.randint(0, 3)) if is_cancelled else 0
    dispute_delay = int(np.random.randint(7, 45)) if is_disputed else 0

    chargeback_amount = 0.0
    final_loss_amount = 0.0

    if is_disputed:
        # cb_ratio: share of the booking value charged back.
        # loss_ratio: share of that chargeback that ends up as final loss.
        if utype == "credit_bustout_user":
            cb_ratio = np.random.uniform(0.85, 1.00)
            loss_ratio = np.random.uniform(0.65, 0.95)
        elif utype in ("account_takeover", "new_synthetic_user"):
            cb_ratio = np.random.uniform(0.70, 0.95)
            loss_ratio = np.random.uniform(0.45, 0.85)
        else:
            cb_ratio = np.random.uniform(0.50, 0.90)
            loss_ratio = np.random.uniform(0.25, 0.75)

        chargeback_amount = round(val * cb_ratio, 2)
        final_loss_amount = round(chargeback_amount * loss_ratio, 2)

    post_events.append({
        "event_id": f"E{str(idx+1).zfill(6)}",
        "booking_id": booking_id,
        "is_cancelled": int(is_cancelled),
        "cancel_delay_days": cancel_delay,
        "is_disputed": int(is_disputed),
        "dispute_delay_days": dispute_delay,
        "chargeback_amount": chargeback_amount,
        "final_loss_amount": final_loss_amount
    })

post_booking_events = pd.DataFrame(post_events)

# ============================================================
# 7) BOOKING LABEL TABLE (same mapping)
# ============================================================
# Every booking starts as legit; the fraud rows are then overwritten.
booking_label_table = pd.DataFrame({
    "booking_id": booking_fact["booking_id"],
    "fraud_label": 0,
    "fraud_reason": "legit"
})
booking_label_table.loc[list(fraud_booking_set), "fraud_label"] = 1

# Reason used when no type-specific reason applies.
default_fraud_reason = "fraud user suspicious booking pattern"
# Type-specific reasons. cancellation_abuser is handled separately below
# because its reason also depends on whether the booking was cancelled.
fraud_reason_by_type = {
    "credit_bustout_user": "credit bustout: high value international + high loss",
    "new_synthetic_user": "new user with risky infra + abnormal velocity",
    "account_takeover": "new device/ip + short lead time",
    "ring_operator": "shared infra across multiple fraud users",
    "bot_booking": "burst/automation pattern in activity",
}

for idx in fraud_booking_set:
    uid = booking_fact.loc[idx, "user_id"]
    # Dict lookup replaces a full scan of user_master for every fraud row;
    # users missing from fraud_type_map are "legit", as in user_master.
    utype = fraud_type_map.get(uid, "legit")

    if utype == "cancellation_abuser":
        # Only cancelled bookings get the cancellation reason; the
        # others fall back to the default reason.
        cancelled = post_booking_events.loc[idx, "is_cancelled"] == 1
        reason = "user has abnormally high cancellation rate" if cancelled else default_fraud_reason
    else:
        reason = fraud_reason_by_type.get(utype, default_fraud_reason)

    booking_label_table.loc[idx, "fraud_reason"] = reason

# ============================================================
# SAVE FILES
# ============================================================
# File name -> table, in the order the files are written and archived.
output_tables = {
    "agency_master.csv": agency_master,
    "user_master.csv": user_master,
    "device_master.csv": device_master,
    "ip_master.csv": ip_master,
    "booking_fact.csv": booking_fact,
    "post_booking_events.csv": post_booking_events,
    "booking_label_table.csv": booking_label_table,
}

for csv_name, table in output_tables.items():
    table.to_csv(os.path.join(OUT_DIR, csv_name), index=False)

# Archive exactly the files written above, in a fixed order. OUT_DIR is
# reused across runs (exist_ok=True), so listing the directory could pick
# up stale or unrelated entries, and os.listdir() returns names in
# arbitrary order.
with zipfile.ZipFile(ZIP_NAME, "w", zipfile.ZIP_DEFLATED) as z:
    for csv_name in output_tables:
        z.write(os.path.join(OUT_DIR, csv_name), arcname=csv_name)

# Run summary: output locations first.
print("✅ Dataset generated!")
print("Folder:", OUT_DIR)
print("Zip:", ZIP_NAME)

# Count of fraud users per fraud type.
fraud_user_rows = user_master["user_fraud_label"] == 1
print("\nFraud user type distribution:")
print(user_master.loc[fraud_user_rows, "user_fraud_type"].value_counts())

# Fraud-labelled bookings, and whether every one belongs to a fraud user.
fraud_booking_rows = booking_label_table["fraud_label"] == 1
fraud_booking_users = booking_fact.loc[fraud_booking_rows, "user_id"]
print("\nFraud bookings:", booking_label_table["fraud_label"].sum())
print("Fraud bookings belong to fraud users only:",
      fraud_booking_users.isin(fraud_user_ids).all())

# Null and distinct counts for the two infra columns (nulls first).
infra_columns = ("ip_id", "device_id")
print("\nInfra sanity check:")
for infra_col in infra_columns:
    print(f"Null {infra_col}:", booking_fact[infra_col].isna().sum())
for infra_col in infra_columns:
    print(f"Unique {infra_col}:", booking_fact[infra_col].nunique())

✅ Dataset generated!
Folder: b2b_travel_userfraud_synth_v6_balanced_sticky
Zip: b2b_travel_userfraud_synth_v6_balanced_sticky.zip

Fraud user type distribution:
user_fraud_type
account_takeover       7
bot_booking            7
credit_bustout_user    7
ring_operator          7
new_synthetic_user     6
cancellation_abuser    6
Name: count, dtype: int64

Fraud bookings: 720
Fraud bookings belong to fraud users only: True

Infra sanity check:
Null ip_id: 0
Null device_id: 0
Unique ip_id: 481
Unique device_id: 432
