In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler



In [None]:
# Load the complete transaction log from disk; every later cell derives its
# working frame from `df`.
df = pd.read_csv(filepath_or_buffer='paysim.csv')  # adjust path
print(f"Original dataset: {df.shape[0]} rows")


Original dataset: 6362620 rows


In [None]:
# Build a fixed-size working sample: keep every row labelled as fraud and top
# it up with randomly drawn non-fraud rows until SUBSET_SIZE rows are reached.
SUBSET_SIZE = 50000
SUBSET_SEED = 42

fraud_df = df[df['isFraud'] == 1]
n_nonfraud = SUBSET_SIZE - len(fraud_df)
nonfraud_df = df[df['isFraud'] == 0].sample(n=n_nonfraud, random_state=SUBSET_SEED)

# Stack both groups, then shuffle so the fraud rows are not all at the top.
df_subset = (
    pd.concat([fraud_df, nonfraud_df], ignore_index=True)
    .sample(frac=1, random_state=SUBSET_SEED)
    .reset_index(drop=True)
)
print(f"Subset dataset: {df_subset.shape[0]} rows, fraud proportion: {df_subset['isFraud'].mean():.4f}")

Subset dataset: 50000 rows, fraud proportion: 0.1643


In [None]:
# Derive calendar-style features from `step`, which is treated here as a
# running hour counter: hour of day, whole days elapsed, and a 1-based day
# within a 31-day cycle.
hours_elapsed = df_subset['step']
days_elapsed = hours_elapsed // 24
df_subset = df_subset.assign(
    hour=hours_elapsed % 24,
    day=days_elapsed,
    day_of_month=(days_elapsed % 31) + 1,
)


In [None]:
def add_fraud_rings_paysim(df: pd.DataFrame, n_rings: int = 5,
                            min_ring_len: int = 3, max_ring_len: int = 6,
                            amount_range=(1000, 10000),
                            random_state=None) -> pd.DataFrame:
    """Append synthetic circular "fraud ring" transactions to a PaySim-style frame.

    Each ring is a chain of freshly minted customer accounts A0 -> A1 -> ... ->
    A(k-1) -> A0. One starting amount is drawn per ring and shrinks by up to
    20% on every hop. Transactions are timestamped after the last ``step`` in
    ``df`` and rings follow each other chronologically.

    Parameters
    ----------
    df : pd.DataFrame
        Input transactions. Must contain ``nameOrig`` and ``step``; ``nameDest``
        is also scanned for existing customer IDs when present. ``df`` itself
        is not modified.
    n_rings : int
        Number of rings to generate.
    min_ring_len, max_ring_len : int
        Inclusive bounds on the number of accounts (and transactions) per ring.
    amount_range : tuple
        ``(low, high)`` bounds for the ring's starting amount.
    random_state : int or None
        Seed for a private ``np.random.RandomState``. When ``None`` (default)
        the global ``np.random`` stream is used, so ``np.random.seed(...)``
        set by the caller still controls the output.

    Returns
    -------
    pd.DataFrame
        ``df`` followed by the synthetic rows, with a fresh integer index. The
        synthetic rows carry ``hour``, ``day`` and ``day_of_month`` computed as
        ``step % 24``, ``step // 24`` and ``(day % 31) + 1``, and are labelled
        ``isFraud = 1`` and ``isFlaggedFraud = 1``.

    Raises
    ------
    ValueError
        If ``min_ring_len`` is below 1 or ``max_ring_len < min_ring_len``.
    """
    if min_ring_len < 1 or max_ring_len < min_ring_len:
        raise ValueError("ring length bounds must satisfy 1 <= min_ring_len <= max_ring_len")

    # Use the global NumPy stream unless a seed is supplied; the order of the
    # draws below is kept stable so seeded callers get repeatable output.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    fake_tx = []

    # Starting IDs: new accounts must not clash with ANY existing customer,
    # whether it appears as a sender or as a receiver.
    id_columns = ['nameOrig']
    if 'nameDest' in df.columns:
        id_columns.append('nameDest')
    max_existing_id = None
    for col in id_columns:
        # int64 explicitly: customer numbers can exceed the 32-bit range.
        ids = (df[col].astype(str)
                      .str.extract(r'C(\d+)', expand=False)
                      .dropna()
                      .astype('int64'))
        if not ids.empty:
            col_max = int(ids.max())
            if max_existing_id is None or col_max > max_existing_id:
                max_existing_id = col_max
    next_cust_id = 100000000 if max_existing_id is None else max_existing_id + 1

    # An empty frame has no last step (max() is NaN); start the clock at 1.
    last_step = df['step'].max()
    start_step = 1 if pd.isna(last_step) else last_step + 1

    for _ in range(n_rings):
        ring_len = rng.randint(min_ring_len, max_ring_len + 1)
        ring_accounts = [f"C{int(next_cust_id + i):09d}" for i in range(ring_len)]
        next_cust_id += ring_len

        # Strictly increasing steps: each hop happens 1-4 steps after the last.
        steps = start_step + np.cumsum(rng.randint(1, 5, size=ring_len))
        amount = rng.uniform(*amount_range)

        for i in range(ring_len):
            orig = ring_accounts[i]
            dest = ring_accounts[(i + 1) % ring_len]  # close the ring
            tx_type = rng.choice(['TRANSFER','CASH_OUT'])
            amount *= rng.uniform(0.8, 1.0)  # ≤20% deduction

            # Sender always holds enough to cover the transfer.
            oldbalanceOrg = amount + rng.uniform(1000, 5000)
            newbalanceOrig = oldbalanceOrg - amount
            oldbalanceDest = rng.uniform(0, 5000)
            newbalanceDest = oldbalanceDest + amount
            step = steps[i]
            hour = step % 24
            # Same definitions as the notebook's feature cell: `day` is the
            # un-wrapped day count, only `day_of_month` wraps at 31.
            day = step // 24
            day_of_month = (day % 31) + 1

            fake_tx.append({
                'nameOrig': orig,
                'nameDest': dest,
                'amount': amount,
                'type': tx_type,
                'oldbalanceOrg': oldbalanceOrg,
                'newbalanceOrig': newbalanceOrig,
                'oldbalanceDest': oldbalanceDest,
                'newbalanceDest': newbalanceDest,
                'step': step,
                'hour': hour,
                'day': day,
                'day_of_month': day_of_month,
                'isFraud': 1,
                # NOTE(review): synthetic rows are also marked as flagged;
                # confirm this is intended before using the column as a feature.
                'isFlaggedFraud': 1
            })

        start_step = steps[-1] + 1  # chronological order across rings

    fake_df = pd.DataFrame(fake_tx)
    df_augmented = pd.concat([df, fake_df], ignore_index=True)
    return df_augmented

In [None]:
# Seed NumPy's global generator first: add_fraud_rings_paysim draws ring sizes,
# timings and amounts from np.random, so without this the synthetic rows (and
# the row count printed below) would change on every run.
np.random.seed(42)
df_augmented = add_fraud_rings_paysim(df_subset, n_rings=5)
print(f"Augmented dataset: {df_augmented.shape[0]} rows")

Augmented dataset: 50024 rows


In [None]:
# Quick sanity check: show the first five rows of the augmented frame.
print(df_augmented.head(n=5))

   step      type     amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0    46   CASH_IN   82812.61  C2045765496       94057.99       176870.60   
1   266   PAYMENT    3123.24  C1551703422       50036.00        46912.76   
2    17  TRANSFER  305627.00  C1711670940      305627.00            0.00   
3   137   PAYMENT   51680.70   C151179513      243532.79       191852.09   
4   359   CASH_IN   30096.99  C1061023977     7599823.47      7629920.46   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  hour  \
0  C1616574477       591195.97       508383.36        0               0    22   
1   M616455084            0.00            0.00        0               0     2   
2   C385061535            0.00            0.00        1               0    17   
3  M1730703519            0.00            0.00        0               0    17   
4  C1246610976       106350.59        76253.60        0               0    23   

   day  day_of_month  
0    1             2  
1   11    

In [None]:
# Persist the augmented frame for downstream use; the row index is dropped so
# the CSV contains only the data columns.
df_augmented.to_csv(path_or_buf="df_augmented_paysim.csv", index=False)
