In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score,
    precision_recall_fscore_support
)
import math
import os

# Step 1: Loading the dataset 

In [2]:
df = pd.read_csv("transactions.csv")

# STEP 2:  DATA PREPROCESSING


2.1 first we sort the dataset according to chronology i.e. we sort it based on time as anomalies are computed strictly from past data and prevent leak of future during the train test split 

In [3]:
df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
df = df.sort_values("timestamp").reset_index(drop=True)

2.2 now we drop rows and columns based on the following- 
 - if amount<0
 - if hour is not present in [0,23] 
 - day_of_the_week must be present between [0,6] if not we drop
 - if month is not present between [1,12]
 - duplicates
 - we drop any unique value identifier columns as they don't carry predictive pattern information

In [4]:
df = df[df["amount"] > 0]
df = df[(df["hour"] >= 0) & (df["hour"] <= 23)]
df = df[(df["day_of_week"] >= 0) & (df["day_of_week"] <= 6)]
df = df[(df["month"] >= 1) & (df["month"] <= 12)]
df = df.drop_duplicates()
y = df["is_fraud"].astype(int).reset_index(drop=True)
drop_cols = ["transaction_id","card_number","customer_id","merchant_id","timestamp","fraud_type"]
df = df.drop(columns=drop_cols)

In [5]:
df

Unnamed: 0,amount,merchant_category,merchant_lat,merchant_long,is_fraud,hour,day_of_week,month,distance_from_home
0,22779.74,jewelry,19.128865,72.902100,0,0,2,1,847.23
1,62609.57,electronics,19.009754,72.828282,0,0,2,1,1036.73
2,2155.20,gas,12.934345,77.606382,0,0,2,1,1566.57
3,344.87,grocery,28.705806,77.098609,0,0,2,1,1752.77
4,1271.03,restaurant,12.970154,77.615085,0,0,2,1,840.52
...,...,...,...,...,...,...,...,...,...
99995,13909.13,jewelry,28.652728,77.127274,0,23,6,3,1765.79
99996,23845.01,luxury_goods,13.041175,80.205535,0,23,6,3,1771.48
99997,1744.53,grocery,19.083755,72.912571,0,23,6,3,862.94
99998,1346.82,restaurant,19.053875,72.884754,0,23,6,3,5.72


2.3 Feature Engineering

- Since time is cricular i.e. 23hours is not far away from 0 hours so cyclical encoding using sine and cosine transformations is applied to preserve circular geometry rather than linear 

In [6]:
def cyclical(col, max_val):
    return (
        np.sin(2*np.pi*df[col]/max_val),
        np.cos(2*np.pi*df[col]/max_val)
    )

df["hour_sin"], df["hour_cos"] = cyclical("hour", 24)
df["day_sin"], df["day_cos"] = cyclical("day_of_week", 7)
df["month_sin"], df["month_cos"] = cyclical("month", 12)

df = df.drop(columns=["hour","day_of_week","month"])


In [7]:
df 

Unnamed: 0,amount,merchant_category,merchant_lat,merchant_long,is_fraud,distance_from_home,hour_sin,hour_cos,day_sin,day_cos,month_sin,month_cos
0,22779.74,jewelry,19.128865,72.902100,0,847.23,0.000000,1.000000,0.974928,-0.222521,0.5,8.660254e-01
1,62609.57,electronics,19.009754,72.828282,0,1036.73,0.000000,1.000000,0.974928,-0.222521,0.5,8.660254e-01
2,2155.20,gas,12.934345,77.606382,0,1566.57,0.000000,1.000000,0.974928,-0.222521,0.5,8.660254e-01
3,344.87,grocery,28.705806,77.098609,0,1752.77,0.000000,1.000000,0.974928,-0.222521,0.5,8.660254e-01
4,1271.03,restaurant,12.970154,77.615085,0,840.52,0.000000,1.000000,0.974928,-0.222521,0.5,8.660254e-01
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,13909.13,jewelry,28.652728,77.127274,0,1765.79,-0.258819,0.965926,-0.781831,0.623490,1.0,6.123234e-17
99996,23845.01,luxury_goods,13.041175,80.205535,0,1771.48,-0.258819,0.965926,-0.781831,0.623490,1.0,6.123234e-17
99997,1744.53,grocery,19.083755,72.912571,0,862.94,-0.258819,0.965926,-0.781831,0.623490,1.0,6.123234e-17
99998,1346.82,restaurant,19.053875,72.884754,0,5.72,-0.258819,0.965926,-0.781831,0.623490,1.0,6.123234e-17


- We apply log1p to amount and distance because anomaly models are distance-based without compression, extreme values dominate the geometry and the model collapses into a magnitude detector.

In [8]:
df["amount_log"] = np.log1p(df["amount"])
df["distance_log"] = np.log1p(df["distance_from_home"])

df = df.drop(columns=["amount","distance_from_home"])

2.4 CATEGORICAL ENCODING
- We one-hot encode merchant categories

In [9]:

if "merchant_category" in df.columns:
    df = pd.get_dummies(df, columns=["merchant_category"], drop_first=True)
bool_cols = df.select_dtypes(include=["bool"]).columns
df[bool_cols] = df[bool_cols].astype(int)

- we drop any non-numeric columns because anomaly models operate purely in numeric geometry.

In [10]:
non_numeric = df.select_dtypes(exclude=[np.number]).columns.tolist()
if non_numeric:
    print("Dropping non-numeric columns (unexpected):", non_numeric)
    df = df.drop(columns=non_numeric)

# STEP 3: Train Test Split
We perform a time-based 80% train, 20% test split because fraud detection models must learn from the past and predict the future. random splits would leak future fraud patterns and affect performance.

In [11]:
X_df = df.reset_index(drop=True)
y = y.reset_index(drop=True)

n = len(X_df)
split_idx = int(math.floor(n * 0.8))
X_train_df = X_df.iloc[:split_idx].reset_index(drop=True)
X_test_df = X_df.iloc[split_idx:].reset_index(drop=True)
y_train = y.iloc[:split_idx].reset_index(drop=True)
y_test = y.iloc[split_idx:].reset_index(drop=True)


# STEP 4 : Data Normalization 
All numerical features are scaled using RobustScaler, which normalizes data based on median and interquartile range to reduce sensitivity to extreme outliers common in fraud datasets.

In [12]:
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_df.values)
X_test_scaled = scaler.transform(X_test_df.values)

X_train = pd.DataFrame(X_train_scaled, columns=X_train_df.columns)
X_test = pd.DataFrame(X_test_scaled, columns=X_test_df.columns)

# STEP 5: Training Isolation Forest
Isolation Forest detects fraud by isolating rare behavior using random feature splits, and we set contamination to the known fraud rate to calibrate the anomaly threshold.

In [13]:
SEED = 42
FRAUD_RATE = 0.02  
iso_clf = IsolationForest(
    n_estimators=200,
    contamination=FRAUD_RATE, 
    random_state=SEED,
    n_jobs=-1
)

iso_clf.fit(X_train) 

In [14]:
ocsvm = OneClassSVM(
    kernel="rbf",      # Non-linear boundary (required)
    nu=FRAUD_RATE,     # Expected anomaly fraction (~2%)
    gamma="scale"      # Safe default for high-dimensional data
)

ocsvm.fit(X_train)


# STEP 6: predictions and confusion matrix

Isolation forest metrics

In [15]:
pred_raw = iso_clf.predict(X_test)
y_pred = (pred_raw == -1).astype(int) 

anomaly_score = -iso_clf.decision_function(X_test)

print("\nConfusion Matrix (test):")
print(confusion_matrix(y_test, y_pred))

print("\nClassification report (test):")
print(classification_report(y_test, y_pred, digits=4))

prec, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary", zero_division=0)
print(f"\nPrecision: {prec:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")



Confusion Matrix (test):
[[19003   600]
 [   64   333]]

Classification report (test):
              precision    recall  f1-score   support

           0     0.9966    0.9694    0.9828     19603
           1     0.3569    0.8388    0.5008       397

    accuracy                         0.9668     20000
   macro avg     0.6768    0.9041    0.7418     20000
weighted avg     0.9839    0.9668    0.9733     20000


Precision: 0.3569, Recall: 0.8388, F1: 0.5008


One class SVM metrics

In [16]:
# Predictions
ocsvm_raw = ocsvm.predict(X_test)
ocsvm_pred = (ocsvm_raw == -1).astype(int)

# Anomaly scores (lower = more anomalous)
ocsvm_scores = -ocsvm.decision_function(X_test)

print("\nOne-Class SVM Confusion Matrix:")
print(confusion_matrix(y_test, ocsvm_pred))

print("\nOne-Class SVM Classification Report:")
print(classification_report(y_test, ocsvm_pred, digits=4))




One-Class SVM Confusion Matrix:
[[18715   888]
 [  223   174]]

One-Class SVM Classification Report:
              precision    recall  f1-score   support

           0     0.9882    0.9547    0.9712     19603
           1     0.1638    0.4383    0.2385       397

    accuracy                         0.9445     20000
   macro avg     0.5760    0.6965    0.6048     20000
weighted avg     0.9719    0.9445    0.9566     20000

