In [93]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score,
    precision_recall_fscore_support
)
import math
import os

step1: load the dataset 

In [81]:
df = pd.read_csv("transactions.csv")

# STEP 1:  DATA PREPROCESSING


1.1 first we sort the dataset according to chronology i.e. we sort it based on time as anomalies are computed strictly from past data and prevent leak of future during the train test split 

In [82]:
df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
df = df.sort_values("timestamp").reset_index(drop=True)

1.2 now we drop rows and columns based on the following- 
 - if amount<0
 - if hour is not present in [0,23] 
 - day_of_the_week must be present between [0,6] if not we drop
 - if month is not present between [1,12]
 - duplicates
 - we drop any unique value identifier columns as they don't carry predictive pattern information

In [None]:
df = df[df["amount"] > 0]
df = df[(df["hour"] >= 0) & (df["hour"] <= 23)]
df = df[(df["day_of_week"] >= 0) & (df["day_of_week"] <= 6)]
df = df[(df["month"] >= 1) & (df["month"] <= 12)]
df = df.drop_duplicates()
y = df["is_fraud"].astype(int).reset_index(drop=True)
drop_cols = ["transaction_id","card_number","customer_id","merchant_id","timestamp","fraud_type"]
df = df.drop(columns=drop_cols)

In [84]:
df

Unnamed: 0,amount,merchant_category,merchant_lat,merchant_long,is_fraud,hour,day_of_week,month,distance_from_home
0,1180.76,retail,19.053847,72.816074,0,0,2,1,1162.03
1,85882.91,luxury_goods,26.930554,75.723459,0,0,2,1,905.49
2,70694.10,electronics,26.963388,75.752148,0,0,2,1,1574.21
3,19344.69,luxury_goods,22.524904,88.315280,0,0,2,1,1296.61
4,349.57,grocery,12.984828,77.541353,0,0,2,1,840.01
...,...,...,...,...,...,...,...,...,...
99995,1405.48,retail,28.678187,77.187685,0,23,6,3,1145.08
99996,1177.64,grocery,28.748636,77.129015,1,23,6,3,1769.28
99997,7212.99,retail,13.089348,80.333976,0,23,6,3,1770.24
99998,43779.43,luxury_goods,22.602960,88.419899,0,23,6,3,1577.57


1.3 Feature Engineering

Since time is cricular i.e. 23hours is not far away from 0 hours so cyclical encoding using sine and cosine transformations is applied to preserve circular geometry rather than linear 

In [85]:
def cyclical(col, max_val):
    return (
        np.sin(2*np.pi*df[col]/max_val),
        np.cos(2*np.pi*df[col]/max_val)
    )

df["hour_sin"], df["hour_cos"] = cyclical("hour", 24)
df["day_sin"], df["day_cos"] = cyclical("day_of_week", 7)
df["month_sin"], df["month_cos"] = cyclical("month", 12)

df = df.drop(columns=["hour","day_of_week","month"])


In [86]:
df 

Unnamed: 0,amount,merchant_category,merchant_lat,merchant_long,is_fraud,distance_from_home,hour_sin,hour_cos,day_sin,day_cos,month_sin,month_cos
0,1180.76,retail,19.053847,72.816074,0,1162.03,0.000000,1.000000,0.974928,-0.222521,0.5,8.660254e-01
1,85882.91,luxury_goods,26.930554,75.723459,0,905.49,0.000000,1.000000,0.974928,-0.222521,0.5,8.660254e-01
2,70694.10,electronics,26.963388,75.752148,0,1574.21,0.000000,1.000000,0.974928,-0.222521,0.5,8.660254e-01
3,19344.69,luxury_goods,22.524904,88.315280,0,1296.61,0.000000,1.000000,0.974928,-0.222521,0.5,8.660254e-01
4,349.57,grocery,12.984828,77.541353,0,840.01,0.000000,1.000000,0.974928,-0.222521,0.5,8.660254e-01
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1405.48,retail,28.678187,77.187685,0,1145.08,-0.258819,0.965926,-0.781831,0.623490,1.0,6.123234e-17
99996,1177.64,grocery,28.748636,77.129015,1,1769.28,-0.258819,0.965926,-0.781831,0.623490,1.0,6.123234e-17
99997,7212.99,retail,13.089348,80.333976,0,1770.24,-0.258819,0.965926,-0.781831,0.623490,1.0,6.123234e-17
99998,43779.43,luxury_goods,22.602960,88.419899,0,1577.57,-0.258819,0.965926,-0.781831,0.623490,1.0,6.123234e-17


In [87]:

df["amount_log"] = np.log1p(df["amount"])
df["distance_log"] = np.log1p(df["distance_from_home"])

df = df.drop(columns=["amount","distance_from_home"])


In [94]:
# one-hot encode merchant category. Doing this BEFORE the time split ensures consistent columns.
if "merchant_category" in df.columns:
    df = pd.get_dummies(df, columns=["merchant_category"], drop_first=True)

# drop any remaining non-numeric unexpected columns
non_numeric = df.select_dtypes(exclude=[np.number]).columns.tolist()
if non_numeric:
    print("Dropping non-numeric columns (unexpected):", non_numeric)
    df = df.drop(columns=non_numeric)


Dropping non-numeric columns (unexpected): ['merchant_category_gas', 'merchant_category_grocery', 'merchant_category_jewelry', 'merchant_category_luxury_goods', 'merchant_category_restaurant', 'merchant_category_retail']


In [95]:
X_df = df.reset_index(drop=True)
y = y.reset_index(drop=True)

n = len(X_df)
split_idx = int(math.floor(n * 0.8))
X_train_df = X_df.iloc[:split_idx].reset_index(drop=True)
X_test_df = X_df.iloc[split_idx:].reset_index(drop=True)
y_train = y.iloc[:split_idx].reset_index(drop=True)
y_test = y.iloc[split_idx:].reset_index(drop=True)


In [96]:
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_df.values)
X_test_scaled = scaler.transform(X_test_df.values)

# convert back to DataFrame (optional)
X_train = pd.DataFrame(X_train_scaled, columns=X_train_df.columns)
X_test = pd.DataFrame(X_test_scaled, columns=X_test_df.columns)

In [98]:
# ---------- train Isolation Forest ----------
SEED = 42
FRAUD_RATE = 0.02  # used as contamination param for iso
iso_clf = IsolationForest(
    n_estimators=200,
    contamination=FRAUD_RATE,   # set to expected fraud rate (helps thresholding)
    random_state=SEED,
    n_jobs=-1
)

iso_clf.fit(X_train) 

In [99]:
pred_raw = iso_clf.predict(X_test)
y_pred = (pred_raw == -1).astype(int)  # convert to 0/1 (1 => predicted fraud)

# anomaly score for ranking / ROC AUC: higher -> more anomalous
# decision_function returns higher for more normal; invert it so higher = more anomalous
anomaly_score = -iso_clf.decision_function(X_test)

# metrics
print("\nConfusion Matrix (test):")
print(confusion_matrix(y_test, y_pred))

print("\nClassification report (test):")
print(classification_report(y_test, y_pred, digits=4))

try:
    auc = roc_auc_score(y_test, anomaly_score)
    print(f"\nROC AUC (using anomaly scores): {auc:.4f}")
except ValueError:
    print("\nROC AUC not available (need both classes in y_test).")

# precision/recall at default threshold
prec, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary", zero_division=0)
print(f"\nPrecision: {prec:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")



Confusion Matrix (test):
[[19168   461]
 [    1   370]]

Classification report (test):
              precision    recall  f1-score   support

           0     0.9999    0.9765    0.9881     19629
           1     0.4452    0.9973    0.6156       371

    accuracy                         0.9769     20000
   macro avg     0.7226    0.9869    0.8019     20000
weighted avg     0.9897    0.9769    0.9812     20000


ROC AUC (using anomaly scores): 0.9987

Precision: 0.4452, Recall: 0.9973, F1: 0.6156
