# Import Libraries

In [4]:
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load Dataset

In [5]:
# Load the raw dataset from the CSV file (path is relative to the notebook's
# working directory).
df = pd.read_csv(filepath_or_buffer="../data/creditcard.csv")

# Feature Engineering

In [6]:
# Derive an hour index from Time (seconds): whole hours, floored.
# NOTE(review): the value is not wrapped with % 24, so it counts hours along
# the Time axis rather than hour of day — confirm this is intended.
df["Hour"] = (df["Time"] // 3600).astype(int)

# Log-transform Amount; log1p maps a zero amount to 0 instead of -inf.
# NOTE(review): the column name "LogAmout" looks like a typo for "LogAmount".
# It is kept unchanged because the column is written into the saved splits and
# anything loading them may rely on this spelling — rename everywhere at once.
df["LogAmout"] = np.log1p(df["Amount"])

# Drop the raw columns now that their derived versions exist.
df = df.drop(columns=["Time", "Amount"])

# Define features and target: Class is the label, everything else is a feature.
Y = df["Class"]
X = df.drop(columns=["Class"])

# Stratified train-test split: stratify=Y keeps the class ratio the same in
# both parts, and the fixed random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42
)

# Confirm the split: the mean of Class should be nearly equal in all three.
print("fraud rate in full data:", Y.mean())
print("fraud rate in train data:", Y_train.mean())
print("fraud rate in test data:", Y_test.mean())

# Scale features. The scaler is fitted on the training split only and then
# applied to the test split, so test statistics never influence the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

fraud rate in full data: 0.001727485630620034
fraud rate in train data: 0.001729245759178389
fraud rate in test data: 0.0017204452090867595


# Save Scaled and Unscaled Datasets

In [9]:
# Create the output directory if it does not exist yet.
os.makedirs("../data/processed", exist_ok=True)

# Unscaled split: feature frames plus targets, stored as one tuple.
joblib.dump(
    value=(X_train, X_test, Y_train, Y_test),
    filename="../data/processed/split_data.pkl",
)

# Scaled split: same targets, features as transformed by the fitted scaler.
joblib.dump(
    value=(X_train_scaled, X_test_scaled, Y_train, Y_test),
    filename="../data/processed/split_scaled_data.pkl",
)

# Fitted scaler, saved separately so the same transform can be reapplied later.
joblib.dump(value=scaler, filename="../data/processed/scaler.pkl")

['../data/processed/scaler.pkl']