# 2. Unsupervised Fraud Detection: Isolation Forest

An Isolation Forest is used to detect previously unknown fraud patterns (anomalies) without relying on the fraud labels.

Tasks:
1. Load cleaned data.
2. Train Isolation Forest.
3. Generate anomaly scores (a binary flag: 1 = suspicious/anomaly, 0 = normal).
4. Save intermediate results.

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import pickle

# Input and output locations; both are relative paths, so they resolve against
# the working directory the notebook is run from.
INPUT_FILE = '../data/cleaned_transactions.csv'
OUTPUT_FILE = '../data/cleaned_transactions_with_iso.csv'

# Read the whole CSV into a DataFrame and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer=INPUT_FILE)
print(f"Data loaded. Shape: {df.shape}")

In [None]:
# Select the model inputs for the unsupervised detector.
# The target ('Class') plus 'Time', 'UserID' and 'High_Risk_Flag' are left out,
# so the forest is fitted on every remaining column (usually V1-V28 and Amount).
excluded_cols = {'Class', 'Time', 'UserID', 'High_Risk_Flag'}
feature_cols = [col for col in df.columns if col not in excluded_cols]
X = df[feature_cols]

# Isolation Forest configuration.
# contamination=0.02 approximates the expected fraud rate (or slightly above it,
# to catch outliers); random_state fixes the seed and n_jobs=-1 uses all cores.
iso_params = {
    'contamination': 0.02,
    'n_estimators': 200,
    'random_state': 42,
    'n_jobs': -1,
}
iso = IsolationForest(**iso_params)

print("Training Isolation Forest...")
# fit_predict fits the forest on X and labels each row 1 (inlier) or -1 (outlier).
raw_labels = iso.fit_predict(X)
df['anomaly_prediction'] = raw_labels

# Recode the raw labels into a 0/1 flag: 1 -> 0 (Normal), -1 -> 1 (Suspicious/Anomaly)
label_to_flag = {1: 0, -1: 1}
df['anomaly_score'] = df['anomaly_prediction'].map(label_to_flag)

print("Training Complete.")
print(df['anomaly_score'].value_counts())

In [None]:
# Export
# Remove the intermediate 'anomaly_prediction' column before writing the CSV.
# errors='ignore' keeps this cell safe to re-run: after the first run the column
# has already been dropped in place, and a plain drop would raise KeyError.
df.drop(columns=['anomaly_prediction'], inplace=True, errors='ignore')
df.to_csv(OUTPUT_FILE, index=False)
print(f"Data with Anomaly Scores saved to {OUTPUT_FILE}")