# Unsupervised Fraud Detection (BiGuard)
This notebook uses a time-based train/test split and never uses labels for training; labels are revealed only for the final evaluation.

In [None]:

# Imports
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

# Path to the Excel workbook; the loading cell reads its 'transactions' and
# 'labels' sheets from this file.
EXCEL_PATH = 'fraud_unsupervised.xlsx'


In [None]:

# Load sheets
tx = pd.read_excel(EXCEL_PATH, sheet_name='transactions')
labels = pd.read_excel(EXCEL_PATH, sheet_name='labels')  # id -> is_fraudulent

# Basic cleaning
# Unparseable dates are coerced to NaT. Validate with an explicit raise rather
# than `assert`, because asserts are stripped when Python runs with -O and the
# time-based split relies on every row having a valid date.
tx['date'] = pd.to_datetime(tx['date'], errors='coerce')
if tx['date'].isna().any():
    raise ValueError('There are missing/invalid dates.')

# Amounts are used as magnitudes only (sign dropped). Non-numeric values are
# coerced to NaN; fail here with a clear message instead of letting NaN flow
# into the feature matrix (DBSCAN does not accept NaN input).
tx['amount'] = pd.to_numeric(tx['amount'], errors='coerce').abs()
if tx['amount'].isna().any():
    raise ValueError('There are missing/non-numeric amounts.')

# Fill the remaining optional fields with neutral defaults.
tx['category'] = tx['category'].fillna('Unknown')
tx['merchant_name'] = tx['merchant_name'].fillna('')
tx['name'] = tx['name'].fillna('')
tx['is_expense'] = tx['is_expense'].fillna(1).astype(int)  # missing -> treated as expense

print('Rows:', len(tx))
print('Time range:', tx['date'].min().date(), '->', tx['date'].max().date())


In [None]:

# Feature engineering (no labels used)
def featurize(df):
    """Build the numeric feature matrix used by the anomaly detectors.

    No label information is read here.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``amount``, ``is_expense``, ``date``
        (datetime dtype), ``merchant_name`` and ``category``.

    Returns
    -------
    pandas.DataFrame
        One row per input row, on a fresh 0..n-1 index, with the columns
        ``amount``, ``amount_log``, ``amount_sqrt``, ``is_expense``,
        ``day_of_week``, ``day_of_month``, ``month``, ``merchant_length``,
        ``has_numbers`` and ``category_hash``.
    """
    import zlib  # local import keeps this block self-contained

    amount = df['amount'].values
    merchant = df['merchant_name'].astype(str)
    dates = df['date'].dt

    out = pd.DataFrame()
    out['amount'] = amount
    # log1p is the numerically accurate form of log(1 + x) for small amounts.
    out['amount_log'] = np.log1p(amount)
    out['amount_sqrt'] = np.sqrt(amount)
    out['is_expense'] = df['is_expense'].astype(int).values
    out['day_of_week'] = dates.weekday.values  # Monday = 0
    out['day_of_month'] = dates.day.values
    out['month'] = dates.month.values
    out['merchant_length'] = merchant.str.len().values
    out['has_numbers'] = merchant.str.contains(r'\d').astype(int).values
    # The built-in hash() of a str is salted per interpreter process
    # (PYTHONHASHSEED), so it would yield different feature values on every
    # run. CRC32 gives the same bucket for the same category every time.
    out['category_hash'] = (
        df['category']
        .astype(str)
        .apply(lambda s: zlib.crc32(s.encode('utf-8')) % 1000)
        .values
    )
    return out

# Featurize every transaction once; the time-based split later slices this
# matrix with boolean masks.
X_all = featurize(tx)
X_all.head()  # preview of the first feature rows


In [None]:

# Chronological split: rows dated on or before the 80th-percentile date form
# the training set, and strictly later rows form the test set.
cutoff = tx['date'].quantile(0.8)  # 80% earliest for train
train_mask = tx['date'] <= cutoff
test_mask = tx['date'] > cutoff

# Slice the feature matrix and the raw transactions with the same masks and
# renumber each part from 0 so positions line up between X_* and tx_*.
X_train = X_all[train_mask].reset_index(drop=True)
X_test = X_all[test_mask].reset_index(drop=True)
tx_train = tx[train_mask].reset_index(drop=True)
tx_test = tx[test_mask].reset_index(drop=True)

print('Train rows:', len(X_train), '| Test rows:', len(X_test))
print('Cutoff date:', cutoff.date())


In [None]:

# Standardize features. The scaler is fitted on the training rows only, and
# the same fitted transform is then applied to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
Xs_train, Xs_test = (scaler.transform(part) for part in (X_train, X_test))


In [None]:

# Fit the unsupervised IsolationForest on the scaled training features.
# fit() returns the estimator itself, so construction and fitting are chained.
iso = IsolationForest(
    n_estimators=256,
    contamination=0.09,
    random_state=42,
).fit(Xs_train)

# Optional DBSCAN on the training features, kept for a combined signal.
dbscan_train = DBSCAN(
    eps=0.7,
    min_samples=8,
).fit(Xs_train)


In [None]:

# Predict anomalies on TEST ONLY
iso_pred = (iso.predict(Xs_test) == -1).astype(int)  # 1 = anomaly
iso_score = iso.decision_function(Xs_test)           # higher = more normal

# Combined rule (unsupervised): IF + DBSCAN + amount threshold.
# A fresh DBSCAN is clustered on the test features here; rows it labels -1
# (noise) count as one vote.
clusters_test = DBSCAN(eps=0.7, min_samples=8).fit_predict(Xs_test)

# Each row collects up to three votes; two or more flag it as an anomaly.
# Computed on whole arrays instead of a per-row .loc lookup loop.
votes = (
    iso_pred
    + (clusters_test == -1).astype(int)
    + (X_test['amount'].to_numpy() > 5000).astype(int)
)
combined = (votes >= 2).astype(int).tolist()  # list of 0/1, one per test row


In [None]:

# Reveal labels for evaluation (after predictions)
# Left-join so every test transaction keeps its row; transactions without a
# label entry are treated as not fraudulent (0).
_labelled_test = tx_test[['id']].merge(labels, on='id', how='left')
# Duplicate ids on either side make the join emit extra rows, which would
# misalign labels with the predictions. Fail here with a clear message rather
# than with a length mismatch inside the metric functions.
if len(_labelled_test) != len(tx_test):
    raise ValueError(
        f'Label merge produced {len(_labelled_test)} rows for {len(tx_test)} '
        'test transactions; check for duplicate ids in the labels sheet.'
    )
test_labels = _labelled_test['is_fraudulent'].fillna(0).astype(int).values

# Metrics
def metric_block(y_true, y_pred, scores=None):
    """Compute binary classification metrics for 0/1 labels and predictions.

    Parameters
    ----------
    y_true : array-like of int
        Ground-truth labels (1 = fraud, 0 = legit).
    y_pred : array-like of int
        Predicted labels using the same encoding.
    scores : array-like of float, optional
        Per-row scores where HIGHER means MORE NORMAL. They are negated
        internally so that higher means more anomalous before the ranking
        metrics are computed.

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``confusion_matrix`` (always 2x2, rows = actual, columns = predicted,
        order [0, 1]) and ``classification_report``. When ``scores`` is given,
        also ``roc_auc`` (NaN if ``y_true`` contains a single class) and
        ``average_precision``.
    """
    out = {
        'accuracy': float(accuracy_score(y_true, y_pred)),
        'precision': float(precision_score(y_true, y_pred, zero_division=0)),
        'recall': float(recall_score(y_true, y_pred, zero_division=0)),
        'f1': float(f1_score(y_true, y_pred, zero_division=0)),
        # Pin the label order so the matrix stays 2x2 even when one class is
        # absent from both y_true and y_pred.
        'confusion_matrix': confusion_matrix(y_true, y_pred, labels=[0, 1]).tolist(),
        'classification_report': classification_report(y_true, y_pred, zero_division=0),
    }
    if scores is not None:
        inv = -np.asarray(scores)  # invert so higher means more anomalous
        # ROC AUC is undefined when only one class is present in y_true;
        # report NaN instead of aborting the whole evaluation.
        if np.unique(y_true).size < 2:
            out['roc_auc'] = float('nan')
        else:
            out['roc_auc'] = float(roc_auc_score(y_true, inv))
        out['average_precision'] = float(average_precision_score(y_true, inv))
    return out

# Evaluate both detectors against the revealed test labels. Only the
# IsolationForest has a continuous score, so only it gets ranking metrics.
results = {}
results['IsolationForest'] = metric_block(test_labels, iso_pred, iso_score)
results['Combined'] = metric_block(test_labels, np.array(combined))
results


In [None]:

# Plot confusion matrix for IsolationForest
# Pin the label order so the matrix is always 2x2 and matches the two tick
# labels below, even if one class is missing from the test window.
cm = confusion_matrix(test_labels, iso_pred, labels=[0, 1])

class_names = ['Legit', 'Fraud']
tick_marks = [0, 1]

plt.figure()
plt.imshow(cm, interpolation='nearest')
plt.title('Confusion Matrix - IsolationForest')
plt.colorbar()
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# Write each count in its cell; imshow puts columns on x and rows on y.
for (row, col), count in np.ndenumerate(cm):
    plt.text(col, row, format(count, 'd'), horizontalalignment="center")
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()
