In [11]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Location of the raw transactions CSV. This is an absolute, machine-specific
# Windows path; adjust it when running the notebook elsewhere.
file_path = r'C:\Users\genes\documents\fraud.csv'

# Load the full dataset into memory.
df = pd.read_csv(file_path)

# Data overview
print("Initial data overview:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.head())

# Step 1: Split the data into fraud (isFraud == 1) and non-fraud (isFraud == 0) rows
fraud_df = df[df['isFraud'] == 1]
non_fraud_df = df[df['isFraud'] == 0]

# Report how many rows of each class are available to sample from
available_fraud_rows = len(fraud_df)
available_non_fraud_rows = len(non_fraud_df)
print(f"Available fraud rows: {available_fraud_rows}")
print(f"Available non-fraud rows: {available_non_fraud_rows}")

# Step 2: Calculate how many fraud and non-fraud samples are needed
total_rows = 100000  # Upper bound on the number of rows in the sample
fraud_percentage = 0.30  # Target share of fraud rows in the sample

if not 0 <= fraud_percentage <= 1:
    raise ValueError(
        f"fraud_percentage must be between 0 and 1, got {fraud_percentage}"
    )

# Class sizes that would give exactly total_rows at the target fraud share
requested_fraud_rows = int(fraud_percentage * total_rows)
requested_non_fraud_rows = total_rows - requested_fraud_rows

# Sampling is done without replacement, so neither class may exceed what is
# available. When one class is the limiting factor, the other class is shrunk
# proportionally so the fraud share stays at fraud_percentage; padding the
# sample with extra non-fraud rows instead would silently dilute the fraud rate.
fraud_rows = min(requested_fraud_rows, available_fraud_rows)
if fraud_rows < requested_fraud_rows:
    # Fraud is the limiting class: derive the matching non-fraud count.
    non_fraud_rows = round(fraud_rows * (1 - fraud_percentage) / fraud_percentage)
else:
    non_fraud_rows = requested_non_fraud_rows

if non_fraud_rows > available_non_fraud_rows:
    # Non-fraud is the limiting class: cap it and shrink fraud to match.
    non_fraud_rows = available_non_fraud_rows
    fraud_rows = min(
        fraud_rows,
        round(non_fraud_rows * fraud_percentage / (1 - fraud_percentage)),
    )

if fraud_rows + non_fraud_rows < total_rows:
    print(
        f"Not enough rows to reach {total_rows} at a {fraud_percentage:.0%} fraud rate; "
        f"sampling {fraud_rows + non_fraud_rows} rows instead."
    )

print(f"Sampling {fraud_rows} fraud rows and {non_fraud_rows} non-fraud rows.")

# Step 3: Sample each class (fixed random_state for reproducibility)
fraud_sample = fraud_df.sample(n=fraud_rows, random_state=42)
non_fraud_sample = non_fraud_df.sample(n=non_fraud_rows, random_state=42)

# Step 4: Combine the samples
sampled_df = pd.concat([fraud_sample, non_fraud_sample])

# Step 5: Shuffle the data to mix fraud and non-fraud rows, then renumber the index
sampled_df = sampled_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the class distribution; the fraud share should match fraud_percentage
print(sampled_df['isFraud'].value_counts(normalize=True))

# Step 6a: Write the downsampled dataset under a relative file name, i.e.
# relative to the kernel's current working directory.
# NOTE(review): this copy may be redundant with the one written in Step 6b -
# confirm whether anything reads it before removing either write.
sampled_df.to_csv(path_or_buf="sampled_fraud_dataset.csv", index=False)

# Report the size of the final dataset
print(f"Final dataset has {len(sampled_df)} rows.")

# Step 6b: Write the same dataset next to the original input file
# (absolute, machine-specific path).
output_file_path = r'C:\Users\genes\documents\sampled_fraud_dataset.csv'
sampled_df.to_csv(path_or_buf=output_file_path, index=False)



Initial data overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB
None
   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720   