In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import networkx as nx


In [30]:
# Load the raw SAML-D table; the cells below read its Sender_account,
# Receiver_account and Is_laundering columns.
data = pd.read_csv("SAML-D.csv")

In [31]:
def split_by_component_two_phase(
    df,
    sender_col="Sender_account",
    receiver_col="Receiver_account",
    label_col="is_laundering",
    train_frac=0.8,
    random_state=None,
):
    """Split transactions into train/test sets that share no accounts.

    Accounts are the nodes and transactions the edges of an undirected graph.
    Whole connected components are assigned to one side, so no account ends
    up on both sides of the split. Components are allocated greedily:

    * Phase 1 - components with at least one positive row are placed so the
      number of positives on each side tracks ``train_frac``.
    * Phase 2 - components without positives fill the train side until it
      holds ``train_frac`` of all rows; the remainder goes to the test side.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per transaction. The account columns are assumed to contain
        no missing values (not validated here - confirm for new datasets).
    sender_col, receiver_col : str
        Columns holding the two account identifiers of a transaction.
    label_col : str
        Column holding the label; its per-component sum is the positive count.
    train_frac : float
        Target share of rows (and of positives) for the train side.
    random_state : int or None
        Seed passed to ``DataFrame.sample`` for the shuffle applied to the
        component table before it is sorted.

    Returns
    -------
    train_df, test_df : pandas.DataFrame
        Row subsets of ``df`` in their original order with the original index.
    info : dict
        ``n_train``, ``n_test``, ``train_pos_rate``, ``test_pos_rate``,
        ``target_pos_rate``, ``train_frac_actual``, ``test_pos``,
        ``train_pos``. Ratios are NaN when their denominator is zero
        (for example when ``df`` is empty).
    comp_df : pandas.DataFrame
        One row per component in allocation order, with columns ``comp_id``,
        ``n_tx``, ``n_pos``, ``accounts`` (a set) and ``pos_rate``.
    """
    # 1. Build the account graph and enumerate its connected components.
    G = nx.Graph()
    G.add_edges_from(df[[sender_col, receiver_col]].itertuples(index=False, name=None))
    components = list(nx.connected_components(G))
    n_components = len(components)

    # Every row is an edge, so its sender and receiver are in the same
    # component: labelling each row by its sender's component is enough.
    account_to_comp = {
        account: comp_id
        for comp_id, accounts in enumerate(components)
        for account in accounts
    }
    row_comp = df[sender_col].map(account_to_comp).to_numpy(dtype=np.int64)

    # 2. Per-component statistics in a single grouped pass (instead of
    #    rescanning the whole frame once per component).
    comp_ids = np.arange(n_components, dtype=np.int64)
    labels_by_comp = df[label_col].groupby(row_comp)
    comp_df = pd.DataFrame(
        {
            "comp_id": comp_ids,
            "n_tx": labels_by_comp.size().reindex(comp_ids, fill_value=0).to_numpy(),
            "n_pos": labels_by_comp.sum().reindex(comp_ids, fill_value=0).to_numpy(),
            "accounts": pd.Series(components, dtype=object),
        }
    )
    comp_df["pos_rate"] = comp_df["n_pos"] / comp_df["n_tx"].replace(0, np.nan)

    total_tx = comp_df["n_tx"].sum()
    total_pos = comp_df["n_pos"].sum()

    target_tx_train = train_frac * total_tx
    target_pos_train = train_frac * total_pos
    target_pos_test = total_pos - target_pos_train

    # 3. Shuffle, then order by descending positive rate and size, so the
    #    components carrying positives are allocated first.
    comp_df = (
        comp_df.sample(frac=1, random_state=random_state)
        .sort_values(["pos_rate", "n_tx"], ascending=[False, False])
        .reset_index(drop=True)
    )

    # 4. Greedy allocation of whole components.
    train_comps, test_comps = [], []
    train_tx = train_pos = 0
    test_tx = test_pos = 0

    for comp_id, n_tx, n_pos in zip(comp_df["comp_id"], comp_df["n_tx"], comp_df["n_pos"]):
        has_pos = n_pos > 0
        if has_pos:
            new_train_pos = train_pos + n_pos
            new_test_pos = test_pos + n_pos
            if new_train_pos < target_pos_train:
                to_train = True
            elif new_test_pos < target_pos_test:
                to_train = False
            else:
                # Both sides would reach or pass their target: choose the
                # side with the smaller overshoot (ties go to train).
                to_train = abs(new_test_pos - target_pos_test) >= abs(
                    new_train_pos - target_pos_train
                )
        else:
            # Components without positives only balance the row counts.
            to_train = train_tx < target_tx_train

        if to_train:
            train_comps.append(comp_id)
            train_tx += n_tx
            if has_pos:
                train_pos += n_pos
        else:
            test_comps.append(comp_id)
            test_tx += n_tx
            if has_pos:
                test_pos += n_pos

    # 5. Select rows through their component label.
    train_df = df[np.isin(row_comp, np.asarray(train_comps, dtype=np.int64))]
    test_df = df[np.isin(row_comp, np.asarray(test_comps, dtype=np.int64))]

    n_assigned = len(train_df) + len(test_df)
    info = {
        "n_train": len(train_df),
        "n_test": len(test_df),
        "train_pos_rate": train_df[label_col].mean(),
        "test_pos_rate": test_df[label_col].mean(),
        "target_pos_rate": total_pos / total_tx if total_tx else np.nan,
        "train_frac_actual": len(train_df) / n_assigned if n_assigned else np.nan,
        "test_pos":  test_df[label_col].sum(),
        "train_pos": train_df[label_col].sum()
    }

    return train_df, test_df, info, comp_df


In [32]:
# Stage 1: hold out whole account components, targeting 80% of the rows
# (and of the positive labels) for training. `test_df` is the hold-out set
# that the next cell divides again.
train_df, test_df, info, comp_df = split_by_component_two_phase(
    df=data,
    sender_col="Sender_account",
    receiver_col="Receiver_account",
    label_col="Is_laundering",
    train_frac=0.8,
    random_state=42,
)
print(info)

{'n_train': 7604163, 'n_test': 1900689, 'train_pos_rate': np.float64(0.001038641596714852), 'test_pos_rate': np.float64(0.001039096874870113), 'target_pos_rate': np.float64(0.0010387326388669703), 'train_frac_actual': 0.8000296059317915, 'test_pos': np.int64(1975), 'train_pos': np.int64(7898)}


In [33]:
# Stage 2: divide the stage-1 hold-out set 50/50 into validation and test.
# NOTE: this cell reads `test_df` and then rebinds it (together with `info`
# and `comp_df`), so it is not idempotent - re-run the stage-1 cell first,
# otherwise the already-halved test set is split a second time.
val_df, test_df, info, comp_df = split_by_component_two_phase(
    df=test_df,
    sender_col="Sender_account",
    receiver_col="Receiver_account",
    label_col="Is_laundering",
    train_frac=0.5,
    random_state=43,
)


In [35]:
# `info` now describes the validation/test split: the function reports its
# first returned frame under the "train" keys, and that frame is `val_df` here.
print(info)

{'n_train': 950572, 'n_test': 950117, 'train_pos_rate': np.float64(0.0010393741873314173), 'test_pos_rate': np.float64(0.0010388194296070904), 'target_pos_rate': np.float64(0.001039096874870113), 'train_frac_actual': 0.5001196934374851, 'test_pos': np.int64(987), 'train_pos': np.int64(988)}


In [34]:
# Persist the three splits. to_csv is called with its defaults, so each
# frame's index is written as the first column of the file.
for split_frame, csv_path in (
    (train_df, "train.csv"),
    (val_df, "val.csv"),
    (test_df, "test.csv"),
):
    split_frame.to_csv(csv_path)