In [None]:
# ==========================================================
# Fraud Dataset Generator Engine (UPI Payments Simulation)
# ==========================================================

import numpy as np
import pandas as pd
import random
import uuid
from datetime import datetime, timedelta
import networkx as nx
from pathlib import Path

# -----------------------------
# Config
# -----------------------------
# Single seed shared by both global RNGs below; generate_from_seed also uses
# it as the default for its own RandomState.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Adjustable params (used as the default arguments of generate_from_seed)
TARGET_ROWS = 8000          # desired final dataset size
BASE_CSV = "/content/upi_features.csv"  # path in Colab
OUT_CSV = "upi_synthetic_enhanced.csv"  # where the generated dataset is written
BASE_LABEL_NOISE = 0.01     # fraction of labels to flip
FRAUD_BASE_RATE = 0.02      # baseline fraud prevalence
INJECT_ARC_INTENSITY = 1.2  # multiplier applied to raw fraud scores (the `intensity` argument of assign_fraud_scores)

# -----------------------------
# Utilities
# -----------------------------
def make_uuid():
    """Return a freshly generated random (version 4) UUID in string form."""
    fresh_id = uuid.uuid4()
    return str(fresh_id)

def rand_ip(rng=np.random):
    """Build a dotted-quad IPv4-looking string from four random octets.

    Each octet is one ``rng.randint(1, 255)`` draw (with NumPy's legacy
    ``randint`` the upper bound is exclusive, so octets fall in 1..254).
    """
    octets = []
    for _ in range(4):
        octets.append(str(rng.randint(1, 255)))
    return ".".join(octets)

def rand_timestamp(start, end, rng=np.random):
    """Return a random timestamp in ``[start, end)`` at whole-second resolution.

    Args:
        start: Lower bound (anything supporting ``end - start`` yielding an
            object with ``total_seconds()``, e.g. ``datetime``/``pd.Timestamp``).
        end: Upper bound; must not precede ``start``.
        rng: Object exposing a NumPy-style ``randint(low, high)``.

    Returns:
        ``start`` plus a whole number of seconds. When the span is shorter than
        one second, ``start`` is returned unchanged and ``rng`` is not consumed.

    Raises:
        ValueError: If ``end`` is earlier than ``start``.
    """
    if end < start:
        raise ValueError("rand_timestamp: 'end' must not be earlier than 'start'")
    span_seconds = int((end - start).total_seconds())
    if span_seconds <= 0:
        # randint(0, 0) raises "low >= high"; a degenerate range has only one
        # possible answer anyway.
        return start
    return start + timedelta(seconds=int(rng.randint(0, span_seconds)))

def make_upi_id(prefix: str, rng=np.random):
    """Compose a synthetic UPI handle: ``<prefix><random number>@upi``.

    The numeric part is a single ``rng.randint(10_000, 99_999_999)`` draw.
    """
    numeric_suffix = rng.randint(10_000, 99_999_999)
    return f"{prefix}{numeric_suffix}@upi"

def jitter_numeric(x, scale=0.01, rng=np.random):
    """Perturb ``x`` with multiplicative plus additive Gaussian noise.

    Two draws are taken from ``rng`` in this order: a relative factor
    ``N(0, scale)`` and an additive term ``N(0, scale * |x|)``. The result is
    rounded to 6 decimals and returned as a plain ``float``.
    """
    relative_noise = rng.normal(0, scale)
    additive_noise = rng.normal(0, scale * abs(x))
    perturbed = x * (1 + relative_noise) + additive_noise
    return float(np.round(perturbed, 6))

def sample_with_replacement(df, n, rng=np.random):
    """Bootstrap ``n`` rows from ``df`` (rows may repeat).

    Args:
        df: Source frame; sampling is done over its index labels.
        n: Number of rows to draw.
        rng: Object exposing a NumPy-style ``choice``. Defaults to the global
            ``np.random`` (the previous, only behaviour); pass a
            ``RandomState`` to make the draw reproducible independently of
            global state, like the other helpers in this file.

    Returns:
        A new frame of ``n`` rows with a fresh 0..n-1 index.
    """
    picked = rng.choice(df.index, size=n, replace=True)
    return df.loc[picked].reset_index(drop=True)

def mutate_upi(value, rng=np.random, prefix="user"):
    """Return a brand-new synthetic UPI handle built from ``prefix``.

    ``value`` is accepted for call-site compatibility but is not inspected:
    the previous implementation branched on whether it looked like a handle,
    yet both branches produced the same result, so the dead conditional has
    been removed.

    Args:
        value: The identifier being replaced (unused).
        rng: Random source forwarded to ``make_upi_id``.
        prefix: Leading text of the generated handle.
    """
    return make_upi_id(prefix, rng)

def safe_mutate_merchant(v, rng):
    """Replace a merchant id when it is unusable, or at random otherwise.

    Missing, non-string, or blank values always get a new "merchant" handle
    (without consuming ``rng.rand``). A valid string is replaced when a single
    ``rng.rand()`` draw is below 0.4, and returned untouched otherwise.
    """
    unusable = pd.isna(v) or not isinstance(v, str) or v.strip() == ""
    if unusable:
        return mutate_upi("", rng, prefix="merchant")

    keep_as_is = not (rng.rand() < 0.4)
    if keep_as_is:
        return v
    return mutate_upi(v.replace("@upi", ""), rng, prefix="merchant")

# -----------------------------
# Load seed dataset
# -----------------------------
def load_seed(path):
    """Read the seed CSV and make sure its ``timestamp`` column is datetime.

    Parsing is first attempted strictly. If that raises, the column is parsed
    leniently instead and every value that could not be parsed is replaced by
    the current time (``pd.Timestamp.now()``).
    """
    frame = pd.read_csv(path)
    try:
        parsed = pd.to_datetime(frame["timestamp"])
    except Exception:
        # Best-effort fallback: unparseable entries become NaT, then "now".
        lenient = pd.to_datetime(frame["timestamp"], errors="coerce")
        parsed = lenient.fillna(pd.Timestamp.now())
    frame["timestamp"] = parsed
    return frame

# -----------------------------
# Augment seed
# -----------------------------
def augment_seed(df_seed, target_n, rng=np.random):
    """Resample the seed frame to ``target_n`` rows and perturb its columns.

    Steps, in order (the order matters because each one consumes ``rng``):
      1. Row selection: a shuffle/subsample with the module ``SEED`` when the
         seed already has at least ``target_n`` rows, otherwise a bootstrap
         with replacement driven by ``rng``.
      2. Numeric columns: negatives clamped to 0, then ``jitter_numeric`` with
         scale 0.2.
      3. Identifiers: payer/payee/device/ip replaced with probability
         0.35/0.4/0.25/0.3; merchant ids go through ``safe_mutate_merchant``.
      4. Boolean flags: coerced to 0/1 and flipped with probability ~0.03.
      5. Count columns: Gaussian noise added, absolute value, truncated to int.
      6. Timestamps: about half are re-drawn uniformly between the seed's
         min and max; ``txn_hour`` is recomputed afterwards.

    Every column group is optional: columns that are absent are skipped.

    Args:
        df_seed: Seed frame; must contain a datetime ``timestamp`` column.
        target_n: Number of rows to produce.
        rng: NumPy-style random source (``rand``, ``normal``, ``randint``,
            ``choice``).

    Returns:
        A new frame with a fresh 0..target_n-1 index.
    """
    seed_n = len(df_seed)
    if seed_n >= target_n:
        df = df_seed.sample(n=target_n, random_state=SEED).reset_index(drop=True)
    else:
        # Bootstrap from the caller's rng rather than the global NumPy state so
        # the result is reproducible from the rng alone. With the default
        # rng=np.random this is exactly the previous behaviour.
        picked = rng.choice(df_seed.index, size=target_n, replace=True)
        df = df_seed.loc[picked].reset_index(drop=True)

    # jitter numeric cols
    for col in ["amount", "balance_before", "balance_after",
                "dest_balance_before", "dest_balance_after",
                "merchant_risk_score", "risk_score"]:
        if col in df.columns:
            df[col] = df[col].apply(
                lambda v: jitter_numeric(max(0.0, float(v)), scale=0.2, rng=rng))

    # mutate ids (guarded like every other column group; order is preserved so
    # the random stream is consumed exactly as before when all columns exist)
    id_mutators = (
        ("payer_id", lambda v: mutate_upi(v, rng) if rng.rand() < 0.35 else v),
        ("payee_id", lambda v: mutate_upi(v, rng) if rng.rand() < 0.4 else v),
        ("merchant_id", lambda v: safe_mutate_merchant(v, rng)),
        ("device_id", lambda v: make_uuid() if rng.rand() < 0.25 else v),
        ("ip_address", lambda v: rand_ip(rng) if rng.rand() < 0.3 else v),
    )
    for col, mutate in id_mutators:
        if col in df.columns:
            df[col] = df[col].apply(mutate)

    # jitter booleans
    for b in ["is_new_device", "kyc_verified", "is_weekend",
              "is_large_txn", "is_night_txn", "is_device_shared", "is_ip_shared"]:
        if b in df.columns:
            df[b] = df[b].apply(
                lambda v: int(bool(v)) if rng.rand() > 0.03 else int(not bool(v)))

    # jitter counts
    for cnt in ["num_prev_txns_24h", "num_prev_txns_7d", "num_users_per_device",
                "num_users_per_ip", "device_age_days", "customer_tenure_days",
                "settlement_time_seconds"]:
        if cnt in df.columns:
            df[cnt] = df[cnt].apply(
                lambda v: max(0, int(abs(v + rng.normal(0, max(1, 0.2 * abs(v) + 1))))))

    # timestamps
    min_ts, max_ts = df_seed["timestamp"].min(), df_seed["timestamp"].max()
    # Re-drawing needs at least a one-second window: rand_timestamp draws whole
    # seconds and a zero-width range (e.g. a seed with one distinct timestamp)
    # used to crash inside randint.
    if max_ts - min_ts >= pd.Timedelta(seconds=1):
        df["timestamp"] = df["timestamp"].apply(
            lambda t: rand_timestamp(min_ts, max_ts, rng) if rng.rand() < 0.5 else t)
    # NOTE(review): only txn_hour is refreshed here; is_weekend / is_night_txn
    # keep their (jittered) seed values even for re-drawn timestamps.
    df["txn_hour"] = pd.to_datetime(df["timestamp"]).dt.hour
    return df.reset_index(drop=True)

# -----------------------------
# Fraud scoring
# -----------------------------
def assign_fraud_scores(df, rng=np.random, intensity=1.0):
    """Attach a heuristic fraud score and its logistic transform to ``df``.

    Adds two columns in place and returns the same frame:
      * ``fraud_score_raw``  - rule-based score per row (see ``_row_score``),
        floored at 0 and multiplied by ``intensity``.
      * ``fraud_score_prob`` - ``sigmoid(fraud_score_raw - 2.0)``.

    One ``rng.normal(0, 0.5)`` draw is consumed per row, in row order.
    """
    # Optional flags and the score each one adds when truthy.
    optional_flag_bonus = (
        ("is_night_txn", 0.2),
        ("is_device_shared", 0.7),
        ("is_ip_shared", 0.5),
    )

    def _row_score(row):
        total = 0.0
        total += 2.0 * row["merchant_risk_score"] + 2.0 * row["risk_score"]

        # Spend relative to available balance.
        balance = row["balance_before"]
        if balance > 0:
            spend_ratio = row["amount"] / (balance + 1e-9)
            if spend_ratio > 0.25:
                total += 1.0 * (spend_ratio - 0.25)
            if spend_ratio > 1.0:
                total += 2.0

        # New device on a young account.
        if row["is_new_device"] and row["customer_tenure_days"] < 60:
            total += 1.5

        # Burst of recent activity.
        recent_txns = row["num_prev_txns_24h"]
        if recent_txns >= 5:
            total += 1.0 + 0.2 * (recent_txns - 5)

        # Unusually fast or unusually slow settlement.
        settle_secs = row["settlement_time_seconds"]
        if settle_secs < 3:
            total += 0.8
        if settle_secs > 3600:
            total += 0.5

        if row["txn_type"] == "P2P":
            total += 0.2
        if row["browser"] == "android_custom":
            total += 0.4

        for flag_name, bonus in optional_flag_bonus:
            if row.get(flag_name, False):
                total += bonus

        total += rng.normal(0, 0.5)
        return max(0.0, total) * float(intensity)

    df["fraud_score_raw"] = df.apply(_row_score, axis=1)
    shifted = df["fraud_score_raw"] - 2.0
    df["fraud_score_prob"] = 1.0 / (1.0 + np.exp(-shifted))
    return df

# -----------------------------
# Feature expansion
# -----------------------------
def expand_features(df, rng=np.random):
    """Add derived calendar, amount, aggregate, graph and decoy features.

    The frame is modified in place and also returned. Feature groups:
      * calendar: ``txn_day/week/month/minute`` and ``hour_bucket``
      * amount: ``amount_log``, ``amount_to_balance``, ``balance_change``
      * per-payer and (when ``merchant_id`` exists) per-merchant aggregates
      * z-scores of the amount against payer and merchant means
      * payer->payee graph features (PageRank, degree) on up to 4000 edges;
        if anything in that step fails the four columns are set to 0
      * decoys: two random columns drawn from ``rng`` and a hash bucket of the
        payer id
    Finally text/categorical columns are stringified with missing values
    replaced by ``"unknown"`` and numeric NaNs replaced by 0.

    Args:
        df: Frame containing at least ``timestamp``, ``txn_hour``, ``amount``,
            ``balance_before``, ``balance_after``, ``payer_id``, ``payee_id``,
            ``device_id``, ``ip_address`` and ``merchant_category``.
        rng: NumPy-style random source (``randn``, ``randint``) used for the
            decoy columns.

    Returns:
        The same ``df`` object with the new columns.
    """
    import zlib  # local import: only needed for the stable decoy hash below

    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df["txn_day"] = df["timestamp"].dt.day
    df["txn_week"] = df["timestamp"].dt.isocalendar().week.astype(int)
    df["txn_month"] = df["timestamp"].dt.month
    df["txn_minute"] = df["timestamp"].dt.minute
    df["hour_bucket"] = pd.cut(df["txn_hour"], bins=[-1, 6, 12, 18, 23],
                               labels=["night", "morning", "afternoon", "evening"])

    df["amount_log"] = np.log1p(df["amount"].clip(lower=0.01))
    df["amount_to_balance"] = (df["amount"] / (df["balance_before"] + 1e-9)).round(6)
    df["balance_change"] = (df["balance_before"] - df["balance_after"]).round(6)

    global_amount_mean = df["amount"].mean()

    payer_grp = df.groupby("payer_id")["amount"]
    df["payer_amount_mean"] = df["payer_id"].map(payer_grp.mean()).fillna(global_amount_mean)
    # std is NaN for payers with a single transaction -> treated as 0 spread
    df["payer_amount_std"] = df["payer_id"].map(payer_grp.std()).fillna(0.0)
    df["payer_txn_count"] = df.groupby("payer_id")["amount"].transform("count")

    if "merchant_id" in df.columns:
        mer_grp = df.groupby("merchant_id")["amount"]
        df["merchant_amount_mean"] = df["merchant_id"].map(mer_grp.mean()).fillna(global_amount_mean)
        df["merchant_txn_count"] = df.groupby("merchant_id")["amount"].transform("count")
        # Despite the name this is the raw occurrence count, not an ordinal rank.
        df["merchant_popularity_rank"] = df["merchant_id"].map(df["merchant_id"].value_counts()).fillna(0)

    df["distinct_devices_per_payer"] = df.groupby("payer_id")["device_id"].transform(lambda x: x.nunique())
    df["distinct_ips_per_payer"] = df.groupby("payer_id")["ip_address"].transform(lambda x: x.nunique())
    df["merchant_cat_variety"] = df.groupby("payer_id")["merchant_category"].transform(lambda x: x.nunique())

    df["amt_vs_payer_z"] = (df["amount"] - df["payer_amount_mean"]) / (df["payer_amount_std"].replace(0, 1) + 1e-9)
    amount_std = df["amount"].std()
    amt_std = amount_std if amount_std > 0 else 1.0
    # merchant_id is optional above, so the merchant mean may not exist; fall
    # back to the global mean instead of raising KeyError.
    if "merchant_amount_mean" in df.columns:
        merchant_mean = df["merchant_amount_mean"].fillna(global_amount_mean)
    else:
        merchant_mean = global_amount_mean
    df["amt_vs_merchant_z"] = (df["amount"] - merchant_mean) / (amt_std + 1e-9)

    # Graph features are best-effort: any failure (e.g. PageRank not
    # converging) degrades to zeros rather than aborting the whole pipeline.
    try:
        g = nx.DiGraph()
        edges = df[["payer_id", "payee_id", "amount"]].dropna()
        edges_sample = edges.sample(n=min(len(edges), 4000), random_state=SEED)
        # Repeated payer->payee pairs overwrite the edge weight; iteration
        # order is the sampled row order.
        for payer, payee, amount in zip(edges_sample["payer_id"],
                                        edges_sample["payee_id"],
                                        edges_sample["amount"]):
            g.add_edge(str(payer), str(payee), weight=float(amount))
        pr = nx.pagerank(g, alpha=0.85)
        df["payer_pagerank"] = df["payer_id"].map(lambda x: pr.get(str(x), 0.0))
        df["payee_pagerank"] = df["payee_id"].map(lambda x: pr.get(str(x), 0.0))
        df["payer_degree"] = df["payer_id"].map(lambda x: g.degree(str(x)) if g.has_node(str(x)) else 0)
        df["payee_degree"] = df["payee_id"].map(lambda x: g.degree(str(x)) if g.has_node(str(x)) else 0)
    except Exception:
        df["payer_pagerank"] = 0.0
        df["payee_pagerank"] = 0.0
        df["payer_degree"] = 0
        df["payee_degree"] = 0

    # Decoy (uninformative) features. They are drawn from the supplied rng so
    # a seeded run is reproducible; with the default rng=np.random this matches
    # the previous global-state behaviour.
    df["decoy_random_1"] = rng.randn(len(df))
    df["decoy_random_2"] = rng.randint(0, 50, size=len(df))
    # Built-in hash() of str is salted per interpreter process, which made this
    # column differ between runs; CRC32 is stable.
    df["decoy_hash_mod"] = df["payer_id"].apply(
        lambda x: zlib.crc32(str(x).encode("utf-8")) % 7)

    # Safe fill
    for c in df.select_dtypes(include=["category", "object"]).columns:
        df[c] = df[c].astype(str).replace("nan", "unknown").fillna("unknown")
    for c in df.select_dtypes(include=[np.number]).columns:
        df[c] = df[c].fillna(0)

    return df

# -----------------------------
# Label finalization
# -----------------------------
def finalize_labels(df, base_rate=FRAUD_BASE_RATE, label_noise=BASE_LABEL_NOISE, rng=np.random):
    """Sample fraud labels from the scores and add rule/label columns.

    Columns written (in place; the same frame is returned):
      * ``isFraud_gen``    - Bernoulli draw per row from the blended probability
      * ``isFraud``        - ``isFraud_gen`` after the optional top-up and
        label-noise flips
      * ``isFlaggedFraud`` - deterministic rule flag
      * ``label``          - ``"fraud"`` / ``"legit"`` mirror of ``isFraud``

    Args:
        df: Frame with ``fraud_score_prob``, ``amt_vs_payer_z``,
            ``payer_pagerank``, ``merchant_risk_score``, ``is_new_device``,
            ``amount`` and ``payer_amount_mean``. ``is_device_shared`` and
            ``is_ip_shared`` are optional and count as 0 when absent.
        base_rate: Target baseline prevalence; contributes ``0.15 * base_rate``
            to every row's probability and triggers a top-up when the sampled
            rate is below 60% of it.
        label_noise: Fraction of rows whose ``isFraud`` is flipped at the end.
        rng: NumPy-style random source (``rand``, ``choice``).
    """
    eps = 1e-9

    def _optional_flag(name):
        # A missing flag contributes nothing. The old `df.get(name, 0).fillna(0)`
        # raised AttributeError here because the default 0 has no .fillna.
        return df[name].fillna(0) if name in df.columns else 0

    combined = (0.6 * df["fraud_score_prob"].fillna(0) +
                0.2 * (df["amt_vs_payer_z"].fillna(0).clip(-5, 5) / 5.0) +
                0.1 * df["payer_pagerank"].fillna(0) +
                0.2 * _optional_flag("is_device_shared") +
                0.2 * _optional_flag("is_ip_shared"))
    # Min-max normalise so the most suspicious row has probability ~0.85+.
    comb_min, comb_max = combined.min(), combined.max()
    combined_norm = (combined - comb_min) / (comb_max - comb_min + eps)
    final_prob = 0.85 * combined_norm + 0.15 * base_rate
    # NOTE(review): nothing caps the sampled rate from above, so the resulting
    # prevalence can be far higher than base_rate - TODO confirm this is intended.
    df["isFraud_gen"] = final_prob.apply(lambda p: int(rng.rand() < p))
    df["isFraud"] = df["isFraud_gen"]

    # Top-up when far too few frauds were sampled. Rows are drawn from the
    # whole frame, so some of them may already be labelled fraud.
    cur = df["isFraud"].mean()
    if cur < base_rate * 0.6:
        need = int((base_rate - cur) * len(df))
        if need > 0:
            flip_idx = rng.choice(df.index, size=max(1, need), replace=False)
            df.loc[flip_idx, "isFraud"] = 1

    df["isFlaggedFraud"] = ((df["fraud_score_prob"] > 0.6) |
                            (df["merchant_risk_score"] > 0.85) |
                            ((df["is_new_device"] == 1) &
                             (df["amount"] > df["payer_amount_mean"] * 5))).astype(int)

    if label_noise > 0:
        # Clamp so sampling without replacement can never ask for more rows
        # than exist.
        flip_n = min(int(label_noise * len(df)), len(df))
        flip_idx = rng.choice(df.index, size=flip_n, replace=False)
        df.loc[flip_idx, "isFraud"] = 1 - df.loc[flip_idx, "isFraud"]

    # Vectorised; also works on an empty frame, unlike a row-wise apply.
    df["label"] = np.where(df["isFraud"] == 1, "fraud", "legit")
    return df

# -----------------------------
# Final cleanup
# -----------------------------
def finalize_dataframe(df):
    """Coerce the known flag columns to plain 0/1 integers.

    For each listed column that exists, boolean values and the usual
    "TRUE"/"True"/"true"/"FALSE"/"False"/"false" spellings are mapped to 1/0,
    missing values become 0, and the column is cast to ``int``. The frame is
    updated in place and returned.
    """
    truth_map = {
        True: 1, False: 0,
        "TRUE": 1, "True": 1, "true": 1,
        "FALSE": 0, "False": 0, "false": 0,
    }
    flag_columns = (
        "is_new_device", "is_weekend", "is_large_txn",
        "is_night_txn", "is_device_shared", "is_ip_shared",
        "kyc_verified", "isFraud", "isFlaggedFraud",
    )
    for name in flag_columns:
        if name not in df.columns:
            continue
        mapped = df[name].replace(truth_map)
        df[name] = mapped.fillna(0).astype(int)
    return df

# -----------------------------
# Main pipeline
# -----------------------------
def generate_from_seed(base_csv=BASE_CSV,target_rows=TARGET_ROWS,
                       label_noise=BASE_LABEL_NOISE,fraud_base_rate=FRAUD_BASE_RATE,
                       inject_intensity=INJECT_ARC_INTENSITY,out_path=OUT_CSV,seed=SEED):
    """Run the full generation pipeline and write the result to ``out_path``.

    Stages: load seed -> augment -> score -> expand features -> label ->
    final integer cleanup -> size adjustment -> CSV export. A dedicated
    ``RandomState(seed)`` is threaded through the stages that accept an rng.

    ``target_rows`` is raised to the seed size when it is smaller, so the
    output never has fewer rows than the seed file.

    Returns:
        The final DataFrame (also saved as CSV without the index).
    """
    rng = np.random.RandomState(seed)

    seed_frame = load_seed(base_csv)
    print("Seed rows:", len(seed_frame))

    # Never shrink below the size of the seed data.
    target_rows = max(target_rows, len(seed_frame))

    frame = augment_seed(seed_frame, target_rows, rng)
    print("After augmentation rows:", len(frame))

    frame = assign_fraud_scores(frame, rng, intensity=inject_intensity)
    frame = expand_features(frame, rng)
    frame = finalize_labels(frame, base_rate=fraud_base_rate,
                            label_noise=label_noise, rng=rng)
    result = finalize_dataframe(frame)

    # Trim or pad so the row count matches the target exactly.
    shortfall = target_rows - len(result)
    if shortfall < 0:
        result = result.sample(n=target_rows, random_state=seed).reset_index(drop=True)
    elif shortfall > 0:
        filler = sample_with_replacement(result, shortfall)
        result = pd.concat([result, filler], ignore_index=True).reset_index(drop=True)

    result.to_csv(out_path, index=False)
    print("Final rows:", len(result))
    print("Final fraud rate:", result["isFraud"].mean())
    return result

# -----------------------------
# Run in Colab
# -----------------------------
if __name__=="__main__":
    import io
    from google.colab import files

    uploaded = files.upload()  # Upload your upi_features.csv

    # Colab may store the upload under a de-duplicated name (a recorded run
    # printed "Saving upi_features.csv to upi_features (2).csv"), in which case
    # BASE_CSV would silently point at an older file. Read the bytes returned by
    # files.upload() instead; pd.read_csv accepts a file-like object.
    # NOTE(review): assumes files.upload() returns {filename: bytes} - confirm
    # against the Colab docs.
    if uploaded:
        base_source = io.BytesIO(next(iter(uploaded.values())))
    else:
        base_source = BASE_CSV  # nothing uploaded: fall back to the default path

    out = generate_from_seed(base_csv=base_source, target_rows=8000,
                             label_noise=0.01, fraud_base_rate=0.02,
                             inject_intensity=1.2, out_path=OUT_CSV, seed=SEED)
    print(out.columns.tolist())
    print(out[["isFraud", "label"]].value_counts().head(20))
    # A bare expression inside this block is discarded, so print the preview.
    print(out[out["isFraud"] == 1].head(10))


Saving upi_features.csv to upi_features (2).csv
Seed rows: 2000
After augmentation rows: 8000
Final rows: 8000
Final fraud rate: 0.328875
['timestamp', 'txn_type', 'type', 'channel', 'status', 'payer_id', 'payee_id', 'merchant_id', 'merchant_category', 'merchant_risk_score', 'amount', 'balance_before', 'balance_after', 'dest_balance_before', 'dest_balance_after', 'device_id', 'device_age_days', 'is_new_device', 'ip_address', 'num_prev_txns_24h', 'num_prev_txns_7d', 'settlement_time_seconds', 'risk_score', 'isFraud', 'isFlaggedFraud', 'label', 'app_version', 'browser', 'customer_tenure_days', 'kyc_verified', 'notes', 'txn_hour', 'is_weekend', 'balance_drop_ratio', 'is_large_txn', 'is_night_txn', 'num_users_per_device', 'is_device_shared', 'num_users_per_ip', 'is_ip_shared', 'device_age_bucket', 'fraud_score_raw', 'fraud_score_prob', 'txn_day', 'txn_week', 'txn_month', 'txn_minute', 'hour_bucket', 'amount_log', 'amount_to_balance', 'balance_change', 'payer_amount_mean', 'payer_amount_std