# Generate subsample datasets

In [1]:
import pandas as pd
import numpy as np

def subsample_paysim_data(
    df,
    total_size=10000,
    fraud_ratio=0.1,
    transaction_types=None,
    random_state=42
):
    """
    Draw a shuffled subsample of the PaySim dataset with a requested fraud share.

    The input frame is never modified, and the global NumPy random state is left
    untouched: every random draw is driven by ``random_state`` alone.

    Parameters:
    - df (DataFrame): Full PaySim dataset. Must contain the columns 'type' and 'isFraud'.
    - total_size (int): Total size of the subsample (must be >= 0).
    - fraud_ratio (float): Desired ratio of fraud cases in the sample (between 0 and 1).
    - transaction_types (list): List of transaction types to include
      (e.g., ['TRANSFER', 'CASH_OUT']). None keeps every type.
    - random_state (int): For reproducibility.

    Returns:
    - DataFrame: Subsampled PaySim data with a fresh 0..n-1 index and without the
      'isFlaggedFraud' column. If a class has fewer rows than requested, all of its
      rows are used, so the result can be smaller than ``total_size`` and its fraud
      share can differ from ``fraud_ratio`` (a UserWarning is emitted in that case).

    Raises:
    - ValueError: If ``total_size`` is negative or ``fraud_ratio`` is outside [0, 1].
    """
    import warnings

    if total_size < 0:
        raise ValueError(f"total_size must be non-negative, got {total_size!r}")
    if not 0 <= fraud_ratio <= 1:
        raise ValueError(f"fraud_ratio must be between 0 and 1, got {fraud_ratio!r}")

    # Filter by transaction types
    if transaction_types is not None:
        df = df[df['type'].isin(transaction_types)]

    # Separate fraud and non-fraud
    fraud_df = df[df['isFraud'] == 1]
    nonfraud_df = df[df['isFraud'] == 0]

    # Determine number of fraud and non-fraud samples (fraud count is truncated, not rounded)
    n_fraud_requested = int(total_size * fraud_ratio)
    n_nonfraud_requested = total_size - n_fraud_requested

    # Never ask for more rows than a class actually has
    n_fraud = min(n_fraud_requested, len(fraud_df))
    n_nonfraud = min(n_nonfraud_requested, len(nonfraud_df))
    if (n_fraud, n_nonfraud) != (n_fraud_requested, n_nonfraud_requested):
        warnings.warn(
            "Not enough rows to honour the request: using "
            f"{n_fraud}/{n_fraud_requested} fraud and "
            f"{n_nonfraud}/{n_nonfraud_requested} non-fraud rows; "
            "the resulting size and fraud ratio differ from what was asked for.",
            UserWarning,
            stacklevel=2,
        )

    # Sample each class; passing random_state makes each draw reproducible on its own
    fraud_sample = fraud_df.sample(n=n_fraud, random_state=random_state)
    nonfraud_sample = nonfraud_df.sample(n=n_nonfraud, random_state=random_state)

    # Combine and shuffle
    sampled_df = (
        pd.concat([fraud_sample, nonfraud_sample])
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    # Always remove isFlaggedFraud as it's useless and confusing.
    # errors='ignore' keeps this working for frames that no longer carry the column.
    return sampled_df.drop(columns='isFlaggedFraud', errors='ignore')

In [3]:
import os

# Resolve the raw PaySim CSV location relative to the current user's home directory
dataset_path = os.path.expanduser("~/data/mta/fraud/paysim_original_dataset.csv")

print(dataset_path)

/Users/shaypalachy/data/mta/fraud/paysim_original_dataset.csv


In [4]:
# Load the original PaySim CSV from the expanded `dataset_path` into `df`
df = pd.read_csv(dataset_path)

In [5]:
# Sample 1: 10,000 rows, 20% of them fraud, restricted to TRANSFER and CASH_OUT
sample_1_df = subsample_paysim_data(
    df, total_size=10000, fraud_ratio=0.2, transaction_types=["TRANSFER", "CASH_OUT"]
)

# Sanity check: class balance first, then the transaction-type mix
print(sample_1_df["isFraud"].value_counts())
print(sample_1_df["type"].value_counts())

0    8000
1    2000
Name: isFraud, dtype: int64
CASH_OUT    7526
TRANSFER    2474
Name: type, dtype: int64


In [8]:
# Sample 2: 40,000 rows, roughly 6.1% of them fraud, restricted to TRANSFER and CASH_OUT
sample_2_df = subsample_paysim_data(
    df, total_size=40000, fraud_ratio=0.0614481, transaction_types=["TRANSFER", "CASH_OUT"]
)

# Sanity check: class balance first, then the transaction-type mix
print(sample_2_df["isFraud"].value_counts())
print(sample_2_df["type"].value_counts())

0    37543
1     2457
Name: isFraud, dtype: int64
CASH_OUT    31630
TRANSFER     8370
Name: type, dtype: int64


In [10]:
# Inspect which columns the 40k sample carries.
# NOTE(review): the recorded output still lists 'isFlaggedFraud', although
# subsample_paysim_data drops that column before returning — the output looks
# stale; confirm by re-running the notebook from a fresh kernel.
sample_2_df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [13]:
# Write the 40k sample to CSV (index omitted) in the current working directory.
# NOTE(review): the train/test-split section reads "../data/imbl_fraud.csv" —
# confirm this file ends up at that location before running that section.
sample_2_df.to_csv("imbl_fraud.csv", index=False)

## Train-test split

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the subsampled fraud dataset (imbl_fraud.csv) from the sibling data directory
df = pd.read_csv("../data/imbl_fraud.csv")

In [3]:
# Pull out the label column, then keep every remaining column as a feature
y = df["isFraud"]
X = df.drop("isFraud", axis=1)

In [4]:
# Stratified 70/30 split: stratifying on y keeps the fraud rate the same in both
# parts, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [5]:
# Re-attach the label column to each feature frame so every split is one table
train_data = pd.concat([X_train, y_train], axis="columns")
test_data = pd.concat([X_test, y_test], axis="columns")

In [6]:
# Confirm the stratification: the mean of the 0/1 isFraud label is the fraud rate,
# and it should be nearly identical in the training and test sets
print("Training set fraud rate:", y_train.mean())
print("Test set fraud rate:", y_test.mean())

Training set fraud rate: 0.06142857142857143
Test set fraud rate: 0.06141666666666667


In [7]:
# Save both splits (features plus isFraud label, index omitted) next to the input CSV
train_data.to_csv("../data/train_fraud.csv", index=False)
test_data.to_csv("../data/test_fraud.csv", index=False)

# Printed only after both writes returned without raising
print("✅ Files saved: train_fraud.csv and test_fraud.csv")

✅ Files saved: train_fraud.csv and test_fraud.csv
