In [8]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the raw transactions; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/raw/onlinefraud.csv')

# Keep only the rows whose `type` is NOT in the exclusion list.
# `.copy()` makes the filtered frame independent of the one it was sliced from,
# so the column assignments below modify `df` itself and do not trigger
# pandas' SettingWithCopyWarning.
transaction_types_to_remove = ['CASH_IN', 'DEBIT', 'PAYMENT']
df = df[~df['type'].isin(transaction_types_to_remove)].copy()

# Derived feature: originator's old balance minus new balance.
# Must be computed before `newbalanceOrig` is dropped below.
df['balanceDiffOrig'] = df['oldbalanceOrg'] - df['newbalanceOrig']

# Remove the columns that are not carried forward into the processed dataset.
columns_to_drop = ['nameOrig', 'nameDest', 'oldbalanceDest', 'newbalanceOrig', 'newbalanceDest']
df = df.drop(columns=columns_to_drop)

# Encode the remaining transaction types as integer codes.
# NOTE(review): the original string `type` column is kept alongside
# `type_encoded`, so it ends up in the feature matrix and in the saved files;
# drop it downstream before fitting an estimator that needs numeric-only input.
label_encoder = LabelEncoder()
df['type_encoded'] = label_encoder.fit_transform(df['type'])

# Separate the fraud label (target) from the remaining columns (features).
y = df['isFraud']
X = df.drop(columns='isFraud')

# Hold out 30% of the rows for testing. `stratify=y` uses the label as the
# stratification key, and the fixed `random_state` makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Re-attach the label to each split as the last column, giving one
# self-contained table per split.
df_train = pd.concat([X_train, y_train], axis='columns')
df_test = pd.concat([X_test, y_test], axis='columns')



In [9]:
# Persist both splits as gzip-compressed CSV files (no index column).
# The output directory is created first so `to_csv` does not fail when
# `../data/processed` does not exist yet (e.g. on a fresh checkout).
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

df_train.to_csv(processed_dir / 'train_data.csv.gz', index=False, compression='gzip')
df_test.to_csv(processed_dir / 'test_data.csv.gz', index=False, compression='gzip')

# Report the (rows, columns) shape of each in-memory split that was written.
# This does not re-read the files from disk.
print("Data saved successfully!")
print(f"Train Data: {df_train.shape}")
print(f"Test Data: {df_test.shape}")

Data saved successfully!
Train Data: (1939286, 8)
Test Data: (831123, 8)
