# Preprocessing & Feature Preparation

This notebook prepares the dataset for anomaly detection models.
The goal is to create a consistent, reusable preprocessing pipeline that can be shared
across all models (Isolation Forest, One-Class SVM, Autoencoder) and later reused in the API.

Key principles:
- No model training is performed here
- Fraud labels are never used as model inputs or training targets; they are used only to filter fraud out of the training data and, later, for evaluation
- Preprocessing is fit only on normal (non-fraud) transactions


In [4]:
import sys
import os

# Add project root to Python path
sys.path.append(os.path.abspath(".."))

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import joblib

from src.config import FEATURES, TARGET, RANDOM_STATE

# Echo the settings imported from src.config so the run records which values were used
print("Config loaded successfully")
for label, value in (("TARGET:", TARGET), ("RANDOM_STATE:", RANDOM_STATE)):
    print(label, value)


Config loaded successfully
TARGET: Class
RANDOM_STATE: 42


In [5]:
# Read the enriched transactions table from the processed-data folder
DATA_PATH = "../data/processed/transactions_enriched.csv"
df = pd.read_csv(DATA_PATH)

# Report (rows, columns), then preview the first five rows as the cell output
n_rows, n_cols = df.shape
print("Dataset shape:", (n_rows, n_cols))
df.head(5)


Dataset shape: (284807, 43)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,home_lon,merchant_id,merchant_category,merchant_lat,merchant_lon,distance_from_home,hour,day_of_week,month,fraud_type
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,12.15973,MERCH_00564,fuel,47.378663,19.25986,643.501288,0,6,9,none
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,11.382369,MERCH_00806,grocery,49.823771,21.935214,1204.452437,0,6,9,none
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,-2.18168,MERCH_00320,jewelry,44.929588,-1.838575,931.062442,0,6,9,none
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-3.350826,MERCH_00912,grocery,44.161538,29.239495,2529.458613,0,6,9,none
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,26.659566,MERCH_00232,jewelry,53.411106,20.986141,1670.310422,0,6,9,none


In [6]:
# Inspect column data types.
# Displays the dtype of every column in `df`, which makes non-numeric
# columns (dtype `object`) easy to spot before features are selected.
df.dtypes


Time                  float64
V1                    float64
V2                    float64
V3                    float64
V4                    float64
V5                    float64
V6                    float64
V7                    float64
V8                    float64
V9                    float64
V10                   float64
V11                   float64
V12                   float64
V13                   float64
V14                   float64
V15                   float64
V16                   float64
V17                   float64
V18                   float64
V19                   float64
V20                   float64
V21                   float64
V22                   float64
V23                   float64
V24                   float64
V25                   float64
V26                   float64
V27                   float64
V28                   float64
Amount                float64
Class                   int64
customer_id            object
home_lat              float64
home_lon  

In [8]:
# -----------------------------
# Feature Selection
# -----------------------------
# We include only numerical features that are available at inference time.
# Identifiers, categorical variables, and labels are excluded to:
# 1. Avoid data leakage
# 2. Ensure compatibility with unsupervised anomaly detection models
# 3. Keep preprocessing consistent with production inference

NUMERIC_FEATURES = [
    "Time",
    "Amount",
    *[f"V{i}" for i in range(1, 29)],  # expands to "V1" .. "V28"
    "home_lat",
    "home_lon",
    "merchant_lat",
    "merchant_lon",
    "distance_from_home",
    "hour",
    "day_of_week",
    "month",
]

# TARGET is deliberately not redefined here: it is imported from src.config
# in the first cell, so the label column name has a single source of truth.

# Sanity check 1: every selected feature must exist in the loaded frame.
missing_features = set(NUMERIC_FEATURES) - set(df.columns)
assert len(missing_features) == 0, f"Missing features: {missing_features}"

# Sanity check 2: the label column must never be part of the feature list.
assert TARGET not in NUMERIC_FEATURES, (
    f"Label column {TARGET!r} must not be used as a feature"
)

# Sanity check 3: the list is documented as numeric-only, so verify the dtypes
# instead of relying on the column names.
non_numeric_features = (
    df[NUMERIC_FEATURES].select_dtypes(exclude="number").columns.tolist()
)
assert not non_numeric_features, (
    f"Non-numeric features selected: {non_numeric_features}"
)

print("Number of features used:", len(NUMERIC_FEATURES))


Number of features used: 38


In [10]:
# -----------------------------
# Feature / Label Separation
# -----------------------------
# The label column is split off from the feature matrix here.
# It is kept separately and used only for evaluation.

# Note: this rebinds FEATURES, replacing the value imported from src.config
# with the numeric feature list selected in this notebook.
FEATURES = NUMERIC_FEATURES

X, y = df[FEATURES], df[TARGET]

print("Feature matrix shape:", X.shape)
print("Fraud rate:", y.mean())


Feature matrix shape: (284807, 38)
Fraud rate: 0.001727485630620034


In [12]:
# -----------------------------
# Train–Validation Split
# -----------------------------
# Anomaly-detection setup:
# - the training set contains normal (non-fraud) transactions only
# - every fraud sample goes to the validation set, used for evaluation

# Row masks for the two classes
normal_mask = y == 0
fraud_mask = y == 1

X_normal = X[normal_mask]
X_fraud = X[fraud_mask]

print("Normal transactions:", len(X_normal))
print("Fraud transactions:", len(X_fraud))

# Hold out 20% of the normal rows; the seed makes the split repeatable
X_train_normal, X_val_normal = train_test_split(
    X_normal, test_size=0.2, random_state=RANDOM_STATE
)

# Validation features: held-out normal rows first, then all fraud rows
X_val = pd.concat([X_val_normal, X_fraud])

# Validation labels, built in the same order and on the same index as X_val
val_normal_labels = pd.Series(0, index=X_val_normal.index)
val_fraud_labels = pd.Series(1, index=X_fraud.index)
y_val = pd.concat([val_normal_labels, val_fraud_labels])

print("Training set (normal only):", X_train_normal.shape)
print("Validation set (mixed):", X_val.shape)
print("Validation fraud rate:", y_val.mean())


Normal transactions: 284315
Fraud transactions: 492
Training set (normal only): (227452, 38)
Validation set (mixed): (57355, 38)
Validation fraud rate: 0.008578153604742393


In [13]:
# -----------------------------
# Feature Scaling
# -----------------------------
# Anomaly detection models are sensitive to feature scale, so all features
# are standardized. The scaling statistics are learned from the normal
# training rows only, so validation data does not influence them.

# Learn the scaling statistics from the normal training data
scaler = StandardScaler().fit(X_train_normal)

# Apply the same fitted scaler to both splits
X_train_scaled = scaler.transform(X_train_normal)
X_val_scaled = scaler.transform(X_val)

print("Scaled training data shape:", X_train_scaled.shape)
print("Scaled validation data shape:", X_val_scaled.shape)


Scaled training data shape: (227452, 38)
Scaled validation data shape: (57355, 38)


In [15]:
# -----------------------------
# Save Preprocessing Artifacts
# -----------------------------
# These artifacts will be reused across:
# - Model training notebooks
# - Model comparison
# - FastAPI inference
#
# `os` is already imported in the first cell, so it is not re-imported here.

ARTIFACT_DIR = "../models"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# Save scaler
joblib.dump(scaler, f"{ARTIFACT_DIR}/scaler.pkl")

# Save the exact column order the scaler was fit on. FEATURES was rebound in
# this notebook, so consumers should read the order from this artifact rather
# than assume it matches src.config.
joblib.dump(list(X_train_normal.columns), f"{ARTIFACT_DIR}/feature_names.pkl")

# Save processed datasets
np.save(f"{ARTIFACT_DIR}/X_train_scaled.npy", X_train_scaled)
np.save(f"{ARTIFACT_DIR}/X_val_scaled.npy", X_val_scaled)
np.save(f"{ARTIFACT_DIR}/y_val.npy", y_val.values)

print("Preprocessing artifacts saved successfully.")


Preprocessing artifacts saved successfully.
