In [1]:
import os
import json
import joblib
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

# ---------------- CONFIG ---------------- #
BASE_DIR = r"C:\Users\NXTWAVE\Downloads\AI Stock Market Fraud & Anomaly Detector"
DATA_PATH = os.path.join(BASE_DIR, "archive", "financial_anomaly_data.csv")

OUT_PRED = os.path.join(BASE_DIR, "predictions.csv")
OUT_JSON = os.path.join(BASE_DIR, "results_summary.json")
OUT_MODEL = os.path.join(BASE_DIR, "anomaly_model.pkl")

# ---------------- LOAD DATA ---------------- #
print(f"[INFO] Loading dataset: {DATA_PATH}")
df = pd.read_csv(DATA_PATH)
print("[INFO] Shape:", df.shape)
print("[INFO] Columns:", df.columns.tolist())

# ---------------- FEATURES ---------------- #
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if not num_cols:
    raise ValueError("No numeric columns found!")

print("[INFO] Numeric features used:", num_cols)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[num_cols])

# ---------------- TRAIN ANOMALY MODEL ---------------- #
print("[INFO] Training Isolation Forest...")
anom = IsolationForest(n_estimators=200, contamination=0.05, random_state=42)
anom.fit(X_scaled)

# Predictions
anom_scores = -anom.decision_function(X_scaled)  # higher = more anomalous
anom_flags = anom.predict(X_scaled)              # -1 = anomaly, 1 = normal

# Add to DataFrame
df["AnomalyScore"] = anom_scores
df["Prediction"] = anom_flags

# ---------------- SAVE MODEL ---------------- #
joblib.dump(anom, OUT_MODEL)
print(f"[INFO] Anomaly model saved: {OUT_MODEL}")

# ---------------- SAVE PREDICTIONS ---------------- #
df.to_csv(OUT_PRED, index=False)
print(f"[INFO] Predictions saved: {OUT_PRED}")

# ---------------- RESULT SUMMARY ---------------- #
summary = {
    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "dataset_shape": df.shape,
    "features_used": num_cols,
    "total_transactions": int(len(df)),
    "anomalies_detected": int((df['Prediction'] == -1).sum()),
    "normal_detected": int((df['Prediction'] == 1).sum()),
    "model": "IsolationForest (unsupervised anomaly detection)",
    "output_files": {
        "predictions_csv": OUT_PRED,
        "anomaly_model": OUT_MODEL
    }
}

with open(OUT_JSON, "w") as f:
    json.dump(summary, f, indent=2)

print(f"[INFO] Summary saved: {OUT_JSON}")
print("\n[RESULT] Summary:")
print(json.dumps(summary, indent=2))


[INFO] Loading dataset: C:\Users\NXTWAVE\Downloads\AI Stock Market Fraud & Anomaly Detector\archive\financial_anomaly_data.csv
[INFO] Shape: (217441, 7)
[INFO] Columns: ['Timestamp', 'TransactionID', 'AccountID', 'Amount', 'Merchant', 'TransactionType', 'Location']
[INFO] Numeric features used: ['Amount']
[INFO] Training Isolation Forest...
[INFO] Anomaly model saved: C:\Users\NXTWAVE\Downloads\AI Stock Market Fraud & Anomaly Detector\anomaly_model.pkl
[INFO] Predictions saved: C:\Users\NXTWAVE\Downloads\AI Stock Market Fraud & Anomaly Detector\predictions.csv
[INFO] Summary saved: C:\Users\NXTWAVE\Downloads\AI Stock Market Fraud & Anomaly Detector\results_summary.json

[RESULT] Summary:
{
  "timestamp": "2025-09-30 12:45:44",
  "dataset_shape": [
    217441,
    9
  ],
  "features_used": [
    "Amount"
  ],
  "total_transactions": 217441,
  "anomalies_detected": 10870,
  "normal_detected": 206571,
  "model": "IsolationForest (unsupervised anomaly detection)",
  "output_files": {
    "