In [2]:
import os
import json
import yaml
import joblib
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

# ---------------- CONFIG ---------------- #
# Root folder that holds the input archive and receives every output artifact.
# The original machine-specific location stays the default, so existing runs
# behave exactly as before; set the FINGUARD_BASE_DIR environment variable to
# point the pipeline at another folder without editing this file. An unset or
# empty variable falls back to the default.
BASE_DIR = os.environ.get("FINGUARD_BASE_DIR") or (
    r"C:\Users\NXTWAVE\Downloads\AI Stock Market Fraud & Anomaly Detector"
)
# Input CSV, read from <BASE_DIR>/archive/.
DATA_PATH = os.path.join(BASE_DIR, "archive", "financial_anomaly_data.csv")

# Output artifacts, all written directly into BASE_DIR.
OUT_H5 = os.path.join(BASE_DIR, "processed_trades.h5")  # scaled feature table
OUT_ANOMALY = os.path.join(BASE_DIR, "anomaly_model.pkl")  # pickled detector
OUT_JSON = os.path.join(BASE_DIR, "finguard_report.json")  # run report
OUT_YAML = os.path.join(BASE_DIR, "build_metadata.yaml")  # build provenance

# ---------------- LOAD DATA ---------------- #
# Read the whole CSV into memory with pandas' default parsing, then echo its
# (rows, columns) shape and header names so the run log records what was
# ingested.
print("[INFO] Loading dataset:", DATA_PATH)
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print(f"[INFO] Shape: {df.shape}")
print(f"[INFO] Columns: {df.columns.tolist()}")

# ---------------- FEATURE ENGINEERING ---------------- #
# Every numeric-dtype column becomes a model feature. Nothing is filtered by
# name, so identifier-like columns that happen to parse as numbers would be
# included as well.
numeric_view = df.select_dtypes(include=[np.number])
num_cols = numeric_view.columns.tolist()
if len(num_cols) == 0:
    # Without at least one numeric column there is nothing to scale or model.
    raise ValueError("No numeric columns found to build features!")

print(f"[INFO] Numeric features: {num_cols}")

# Fit the scaler on the selected columns and transform them in one step.
# `scaler` and `X_scaled` stay at module level for the sections that follow.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(numeric_view)

# Persist the scaled matrix as a table under the "trades" key; mode="w"
# starts a fresh HDF5 file on every run.
processed = pd.DataFrame(X_scaled, columns=num_cols)
processed.to_hdf(OUT_H5, key="trades", mode="w")
print("[INFO] Processed data saved:", OUT_H5)

# ---------------- ANOMALY DETECTOR ---------------- #
# Unsupervised detector: 200 trees, a fixed seed so reruns are repeatable, and
# contamination=0.05, which scikit-learn uses as the expected share of
# outliers when placing the decision threshold.
print("[INFO] Training Isolation Forest anomaly detector...")
forest_params = {"n_estimators": 200, "contamination": 0.05, "random_state": 42}
anom = IsolationForest(**forest_params).fit(X_scaled)

# decision_function is lower for more abnormal rows; negate it so that a
# larger score means "more anomalous".
raw_decision = anom.decision_function(X_scaled)
anom_scores = -raw_decision
# Hard labels from the same fitted model: -1 marks an anomaly, 1 a normal row.
anom_flags = anom.predict(X_scaled)

# NOTE(review): only the forest is pickled here; the fitted `scaler` is not
# written anywhere in the visible script, so scoring new raw data later would
# need it refit or saved separately — confirm whether that is intended.
joblib.dump(anom, OUT_ANOMALY)
print("[INFO] Anomaly model saved:", OUT_ANOMALY)

# ---------------- REPORT JSON ---------------- #
# Compact, human-readable summary of this run. The serialized content is the
# same as before; only how the preview lists are produced has changed.
report = {
    # Naive local wall-clock time at which the report was assembled.
    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    # (rows, columns) of the raw CSV. JSON has no tuple type, so the
    # conversion to a list is made explicit instead of left to the encoder.
    "dataset_shape": list(df.shape),
    "features_used": num_cols,
    # Preview only: the first 500 rows keep the file size reasonable. Slice
    # the arrays *before* converting so the remaining rows are never turned
    # into Python objects just to be thrown away.
    "anomaly_scores": anom_scores[:500].tolist(),  # higher = more anomalous
    "anomaly_flags": anom_flags[:500].tolist(),  # -1 = anomaly, 1 = normal
}
# Explicit encoding so the file does not depend on the platform default.
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)
print(f"[INFO] Report saved: {OUT_JSON}")

# ---------------- BUILD METADATA ---------------- #
# Provenance record for this run: when it ran, which file it read, which
# artifacts it wrote, and which estimator and scaler classes were used.
build_stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
output_paths = {
    "processed_data": OUT_H5,
    "anomaly_model": OUT_ANOMALY,
    "report": OUT_JSON,
}
model_names = {"anomaly_detector": "IsolationForest"}

metadata = dict(
    build_time=build_stamp,
    input_dataset=DATA_PATH,
    outputs=output_paths,
    models=model_names,
    scaler="StandardScaler",
    notes="Dataset has no fraud labels; only anomaly detection performed.",
)

# Serialize with PyYAML's default dump settings.
with open(OUT_YAML, mode="w") as yaml_file:
    yaml.dump(metadata, stream=yaml_file)
print("[INFO] Metadata saved:", OUT_YAML)

# Final status line marking the end of the pipeline.
print("[DONE] FinGuard anomaly-only pipeline complete.")


[INFO] Loading dataset: C:\Users\NXTWAVE\Downloads\AI Stock Market Fraud & Anomaly Detector\archive\financial_anomaly_data.csv
[INFO] Shape: (217441, 7)
[INFO] Columns: ['Timestamp', 'TransactionID', 'AccountID', 'Amount', 'Merchant', 'TransactionType', 'Location']
[INFO] Numeric features: ['Amount']
[INFO] Processed data saved: C:\Users\NXTWAVE\Downloads\AI Stock Market Fraud & Anomaly Detector\processed_trades.h5
[INFO] Training Isolation Forest anomaly detector...
[INFO] Anomaly model saved: C:\Users\NXTWAVE\Downloads\AI Stock Market Fraud & Anomaly Detector\anomaly_model.pkl
[INFO] Report saved: C:\Users\NXTWAVE\Downloads\AI Stock Market Fraud & Anomaly Detector\finguard_report.json
[INFO] Metadata saved: C:\Users\NXTWAVE\Downloads\AI Stock Market Fraud & Anomaly Detector\build_metadata.yaml
[DONE] FinGuard anomaly-only pipeline complete.
