# Synthetic Financial Datasets For Fraud Detection

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

## Loading the data

In [2]:
# Load the raw transaction log into `df` and print a quick structural overview.

file_path = "PS_20174392719_1491204439457_log.csv"

# Only the read itself is guarded. Each handler prints a hint and then
# re-raises, so a failed load stops the notebook here instead of leaving
# `df` undefined and surfacing later as an unrelated NameError.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"File '{file_path}' not found. Please check the file path and name.")
    raise
except pd.errors.EmptyDataError:
    print("The file is empty.")
    raise
except pd.errors.ParserError:
    print("Error parsing the file. Trying with different parameters...")
    # Retry with a different encoding and delimiter; if this read also
    # fails, the exception propagates with its full traceback.
    df = pd.read_csv(file_path, encoding='latin1', sep=';')
    print("Successfully loaded with alternative parameters.")
    display(df.head())
else:
    # Overview of the freshly loaded frame. This runs outside the guarded
    # read, so an error here is reported as-is rather than as a load failure.
    print("Dataset shape:", df.shape)
    print("\nColumn names:")
    print(df.columns.tolist())

    # Display first few rows
    print("\nFirst 5 rows:")
    display(df.head())

    # Column dtypes and memory usage
    print("\nDataset info:")
    df.info()

    # Check for missing values
    print("\nMissing values per column:")
    print(df.isnull().sum())

    # Basic statistics for numerical columns
    print("\nBasic statistics:")
    display(df.describe())

    # Show data types
    print("\nData types:")
    print(df.dtypes)

Dataset shape: (6362620, 11)

Column names:
['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']

First 5 rows:


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0



Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB

Missing values per column:
step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

Basic statistics:


Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0



Data types:
step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object


In [3]:
# Class balance: number of rows for each value of the `isFraud` target column
fraud_counts = df.loc[:, "isFraud"].value_counts()
print(fraud_counts)

isFraud
0    6354407
1       8213
Name: count, dtype: int64


## Split the dataset

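The code below partitions the highly imbalanced fraud detection dataset (about 0.13% of transactions are fraudulent) into 13 disjoint, stratified subsets, so that each subset keeps approximately the same class distribution as the original dataset (631–632 fraud cases per subset). Each subset is shuffled and saved to its own CSV file (`df_fraud_split_1.csv` … `df_fraud_split_13.csv`).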

In [4]:
# Partition the dataset into N_SPLITS disjoint, stratified subsets, save each
# one as a CSV file, and verify the resulting class counts.
N_SPLITS = 13      # number of subsets / output files
SPLIT_SEED = 42    # base seed for the fold assignment and the per-subset shuffles

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SPLIT_SEED)

# Get features and target
X = df.drop('isFraud', axis=1)
y = df['isFraud']

# Derive the expected counts from the data instead of hard-coding them, so the
# checks and the summary stay correct if the input file changes.
total_fraud = int((y == 1).sum())
min_positives = total_fraud // N_SPLITS
max_positives = -(-total_fraud // N_SPLITS)  # ceiling division

# Split and save each fold
fold_stats = []
for i, (_train_idx, test_idx) in enumerate(skf.split(X, y)):
    # The held-out indices of the folds are used as the subsets; the
    # training indices are not needed.
    # Shuffle each subset (with a distinct seed per subset) so fraud rows are
    # spread randomly through the file.
    subset = (
        df.iloc[test_idx]
        .sample(frac=1, random_state=SPLIT_SEED + i)
        .reset_index(drop=True)
    )

    # Save to CSV
    filename = f'df_fraud_split_{i+1}.csv'
    subset.to_csv(filename, index=False, encoding='utf-8')

    # Collect statistics
    total_rows = len(subset)
    fraud_0 = int((subset['isFraud'] == 0).sum())
    fraud_1 = int((subset['isFraud'] == 1).sum())

    fold_stats.append({
        'Subset': i+1,
        'Total Rows': total_rows,
        'isFraud = 0': fraud_0,
        'isFraud = 1': fraud_1
    })

# Create and display verification table
verification_df = pd.DataFrame(fold_stats)
print("Verification Table:")
print(verification_df.to_string(index=False))

# Verify that each subset has the required number of positives
expected_text = " or ".join(str(n) for n in sorted({min_positives, max_positives}))
for stats in fold_stats:
    positives = stats['isFraud = 1']
    assert min_positives <= positives <= max_positives, (
        f"Subset {stats['Subset']} has {positives} positives, expected {expected_text}"
    )

# Verify that the subsets together account for every row and every fraud case
total_fraud_used = sum(stats['isFraud = 1'] for stats in fold_stats)
total_rows_used = sum(stats['Total Rows'] for stats in fold_stats)
assert total_rows_used == len(df), (
    f"Subsets contain {total_rows_used} rows in total, expected {len(df)}"
)
assert total_fraud_used == total_fraud, (
    f"Subsets contain {total_fraud_used} fraud cases in total, expected {total_fraud}"
)

print("\nAll assertions passed!")

# Print summary statistics
print("\nSummary:")
print(f"Total fraud cases in original dataset: {total_fraud:,}")
print(f"Total fraud cases distributed across subsets: {total_fraud_used}")
print(f"Average fraud cases per subset: {total_fraud_used / N_SPLITS:.1f}")

Verification Table:
 Subset  Total Rows  isFraud = 0  isFraud = 1
      1      489433       488801          632
      2      489433       488801          632
      3      489433       488801          632
      4      489433       488801          632
      5      489432       488801          631
      6      489432       488801          631
      7      489432       488801          631
      8      489432       488800          632
      9      489432       488800          632
     10      489432       488800          632
     11      489432       488800          632
     12      489432       488800          632
     13      489432       488800          632

All assertions passed!

Summary:
Total fraud cases in original dataset: 8,213
Total fraud cases distributed across subsets: 8213
Average fraud cases per subset: 631.8
