In [1]:
import os
import json
import yaml
import joblib
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# ---------------- CONFIG ---------------- #
# Root folder: the input archive lives under it and every artifact is written into it.
BASE_DIR = r"C:\Users\NXTWAVE\Downloads\AI Stock Market Fraud & Anomaly Detector"


def _under_base(*parts):
    """Return the path obtained by joining ``parts`` onto ``BASE_DIR``."""
    return os.path.join(BASE_DIR, *parts)


# Input CSV.
DATA_PATH = _under_base("archive", "financial_anomaly_data.csv")

# Output artifacts.
OUT_H5 = _under_base("processed_trades.h5")        # scaled numeric features
OUT_AIS = _under_base("anomaly_model.pkl")         # fitted IsolationForest
OUT_CSA = _under_base("fraud_model.pkl")           # fitted RandomForestClassifier
OUT_JSON = _under_base("finguard_report.json")     # run report (JSON)
OUT_YAML = _under_base("build_metadata.yaml")      # same report (YAML)
OUT_PRED = _under_base("hybrid_predictions.csv")   # input rows plus model outputs

# ---------------- LOAD DATA ---------------- #
# Read the whole CSV into one DataFrame, then echo its dimensions and column names.
print("[INFO] Loading dataset: {}".format(DATA_PATH))
df = pd.read_csv(DATA_PATH)
print(f"[INFO] Shape: {df.shape}")
print(f"[INFO] Columns: {df.columns.tolist()}")

# ---------------- FEATURE ENGINEERING ---------------- #
# Only numeric columns are modelled; all other columns stay in df untouched.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if not num_cols:
    raise ValueError("No numeric columns found!")

print("[INFO] Numeric features used:", num_cols)

# Standardise the selected features; both models below are trained on X_scaled.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[num_cols])

# Persist the fitted scaler next to the models. The pickled models only make
# sense on features transformed by this exact scaler, so without it they
# cannot be applied consistently to new raw data.
OUT_SCALER = os.path.join(BASE_DIR, "feature_scaler.pkl")
joblib.dump(scaler, OUT_SCALER)
print(f"[INFO] Scaler saved: {OUT_SCALER}")

# Save processed data
pd.DataFrame(X_scaled, columns=num_cols).to_hdf(OUT_H5, key="trades", mode="w")
print(f"[INFO] Processed data saved: {OUT_H5}")

# ---------------- AIS: ANOMALY ISOLATION STRATEGY ---------------- #
print("[INFO] Training AIS (IsolationForest)...")
# fit() returns the fitted estimator, so construction and training are chained.
ais = IsolationForest(
    n_estimators=200,
    contamination=0.05,
    random_state=42,
).fit(X_scaled)

# Hard decisions: -1 anomaly, 1 normal.
anom_flags = ais.predict(X_scaled)
# decision_function is lower for outliers; negate it so higher = more anomalous.
anom_scores = np.negative(ais.decision_function(X_scaled))

joblib.dump(ais, OUT_AIS)
print("[INFO] AIS model saved: {}".format(OUT_AIS))

# ---------------- SYNTHETIC LABELS FOR CSA ---------------- #
# Pseudo-labels: rows whose anomaly score reaches the 95th percentile are
# marked fraud (1); every other row is marked normal (0).
threshold = np.percentile(anom_scores, 95)
is_top_scoring = anom_scores >= threshold
labels = is_top_scoring.astype(int)

fraud_count = labels.sum()
normal_count = len(labels) - fraud_count
print(f"[INFO] Synthetic labels created → Fraud: {fraud_count}, Normal: {normal_count}")

# ---------------- CSA: CLASSIFIER SUPERVISED APPROACH ---------------- #
print("[INFO] Training CSA (RandomForest)...")
# Hold out 20% of the rows, stratified on the synthetic labels.
Xtr, Xte, ytr, yte = train_test_split(
    X_scaled,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

csa = RandomForestClassifier(n_estimators=200, random_state=42).fit(Xtr, ytr)
ypred = csa.predict(Xte)

# NOTE: the labels are thresholded AIS scores computed from these same
# features, so this report shows how well CSA reproduces the AIS labelling,
# not how well it detects independently verified fraud.
csa_report = classification_report(yte, ypred, zero_division=0)
print("[RESULT] CSA Report:\n", csa_report)

joblib.dump(csa, OUT_CSA)
print("[INFO] CSA model saved: {}".format(OUT_CSA))

# ---------------- HYBRID MODEL ---------------- #
# Fraud-class probability for every row (training rows included).
# The column is looked up through classes_ instead of being hard-coded to 1:
# when the synthetic labels contain a single class (e.g. most anomaly scores
# tie at the minimum, so every row reaches the percentile threshold),
# predict_proba returns one column and `[:, 1]` would raise IndexError.
class_probs = csa.predict_proba(X_scaled)
fraud_cols = np.flatnonzero(csa.classes_ == 1)
if fraud_cols.size:
    fraud_probs = class_probs[:, fraud_cols[0]]
else:
    # The classifier never saw a fraud example, so it assigns it zero probability.
    fraud_probs = np.zeros(class_probs.shape[0])

# Min-max rescale the anomaly scores to [0, 1] so they share a scale with the
# probabilities. If every score is identical the range is zero and dividing
# would fill the column with NaN; treat that case as "no anomaly signal".
score_min = anom_scores.min()
score_range = anom_scores.max() - score_min
if score_range > 0:
    anom_norm = (anom_scores - score_min) / score_range
else:
    anom_norm = np.zeros_like(anom_scores, dtype=float)

# Hybrid score: equal weights (can be tuned)
hybrid_score = 0.5 * anom_norm + 0.5 * fraud_probs
hybrid_pred = (hybrid_score > 0.6).astype(int)  # adjustable threshold

# ---------------- SAVE HYBRID PREDICTIONS ---------------- #
# Append the per-row model outputs to the loaded frame, in this column order,
# then write the enriched table without the index.
for column_name, column_values in (
    ("AIS_AnomalyScore", anom_scores),
    ("CSA_FraudProb", fraud_probs),
    ("HybridScore", hybrid_score),
    ("HybridPrediction", hybrid_pred),
):
    df[column_name] = column_values

df.to_csv(OUT_PRED, index=False)
print("[INFO] Hybrid predictions saved: {}".format(OUT_PRED))

# ---------------- REPORT JSON ---------------- #
# The classifier's own decisions over the full dataset. The report previously
# echoed the synthetic training labels here, which says nothing about what
# CSA actually predicted.
csa_pred_all = csa.predict(X_scaled)

report = {
    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    # Stored as a list: a tuple would be emitted to YAML with the
    # Python-specific `!!python/tuple` tag, which safe loaders reject.
    # NOTE: df already carries the four prediction columns added above,
    # so the column count includes them.
    "dataset_shape": list(df.shape),
    "features_used": num_cols,
    "AIS_model": "IsolationForest",
    "CSA_model": "RandomForestClassifier",
    "hybrid_threshold": 0.6,
    "counts": {
        "total": int(len(df)),
        "AIS_anomalies": int((anom_flags == -1).sum()),
        "CSA_predicted_fraud": int((csa_pred_all == 1).sum()),
        "Hybrid_predicted_fraud": int(hybrid_pred.sum())
    },
    "outputs": {
        "processed_data": OUT_H5,
        "AIS_model": OUT_AIS,
        "CSA_model": OUT_CSA,
        "hybrid_predictions": OUT_PRED
    }
}
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)

# ---------------- METADATA YAML ---------------- #
# safe_dump only emits standard YAML types, so the file stays readable by
# yaml.safe_load and by non-Python tooling.
with open(OUT_YAML, "w", encoding="utf-8") as f:
    yaml.safe_dump(report, f)
print(f"[INFO] Metadata saved: {OUT_YAML}")

print("[DONE] Hybrid AIS + CSA pipeline complete.")


[INFO] Loading dataset: C:\Users\NXTWAVE\Downloads\AI Stock Market Fraud & Anomaly Detector\archive\financial_anomaly_data.csv
[INFO] Shape: (217441, 7)
[INFO] Columns: ['Timestamp', 'TransactionID', 'AccountID', 'Amount', 'Merchant', 'TransactionType', 'Location']
[INFO] Numeric features used: ['Amount']
[INFO] Processed data saved: C:\Users\NXTWAVE\Downloads\AI Stock Market Fraud & Anomaly Detector\processed_trades.h5
[INFO] Training AIS (IsolationForest)...
[INFO] AIS model saved: C:\Users\NXTWAVE\Downloads\AI Stock Market Fraud & Anomaly Detector\anomaly_model.pkl
[INFO] Synthetic labels created → Fraud: 10879, Normal: 206562
[INFO] Training CSA (RandomForest)...
[RESULT] CSA Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     41313
           1       1.00      1.00      1.00      2176

    accuracy                           1.00     43489
   macro avg       1.00      1.00      1.00     43489
weighted avg       1.00      1