In [2]:
# ad_insight_hybrid_pipeline.py

import os
import json
import yaml
import joblib
import h5py
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import random

# -------- CONFIG --------
BASE_DIR = r"C:\Users\NXTWAVE\Downloads\Ad Insight"
TRAIN_CSV = os.path.join(BASE_DIR, "archive", "train.csv")
TEST_CSV  = os.path.join(BASE_DIR, "archive", "test.csv")

OUT_H5    = os.path.join(BASE_DIR, "processed_ads.h5")
OUT_JSON  = os.path.join(BASE_DIR, "ads_report.json")
OUT_YAML  = os.path.join(BASE_DIR, "build_metadata.yaml")
OUT_PRED  = os.path.join(BASE_DIR, "predictions.csv")
VISUALS_DIR = os.path.join(BASE_DIR, "visuals")
os.makedirs(VISUALS_DIR, exist_ok=True)

# -------- LOAD DATA --------
print("[INFO] Loading datasets...")
train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)
print("[INFO] Train shape:", train_df.shape)
print("[INFO] Test shape :", test_df.shape)


# -------- PREPROCESS --------
def preprocess(df):
    df = df.copy()
    cat_cols = df.select_dtypes(include=["object"]).columns
    for c in cat_cols:
        df[c] = df[c].fillna("UNK")
        le = LabelEncoder()
        df[c] = le.fit_transform(df[c])
    return df.fillna(0)

train_df = preprocess(train_df)
test_df_proc = preprocess(test_df)

# Save HDF5
print("[INFO] Saving processed data to HDF5...")
with h5py.File(OUT_H5, "w") as h5:
    h5.create_dataset("train", data=train_df.values)
    h5.create_dataset("test", data=test_df_proc.values)
    h5.attrs["train_columns"] = json.dumps(list(train_df.columns))


# -------- AUTO-DETECT TARGET COLUMNS --------
possible_targets = [
    col for col in train_df.columns 
    if train_df[col].nunique() < 20 and col.lower() not in ["id"]
]
feature_cols = [c for c in train_df.columns if c not in possible_targets]
X = train_df[feature_cols].values

print(f"[INFO] Auto-detected target columns: {possible_targets}")

# -------- ALIGN TRAIN & TEST FEATURE COLUMNS --------
for col in feature_cols:
    if col not in test_df_proc.columns:
        test_df_proc[col] = 0  # add missing with default value

# Ensure same order
test_df_proc = test_df_proc.reindex(columns=feature_cols, fill_value=0)


# -------- AIS FEATURE SELECTION --------
def ais_feature_selection(X, y, n_iter=20, n_features=10):
    best_acc = 0
    best_features = list(range(X.shape[1]))

    for _ in range(n_iter):
        subset = random.sample(range(X.shape[1]), min(n_features, X.shape[1]))
        Xsub = X[:, subset]

        Xtr, Xval, ytr, yval = train_test_split(Xsub, y, test_size=0.2, random_state=42)
        clf = RandomForestClassifier(n_estimators=100, random_state=42)
        clf.fit(Xtr, ytr)
        acc = accuracy_score(yval, clf.predict(Xval))

        if acc > best_acc:
            best_acc = acc
            best_features = subset

    print(f"[AIS] Selected {len(best_features)} features with acc={best_acc:.4f}")
    return best_features


# -------- CSA HYPERPARAM OPTIMIZATION --------
def csa_hyperparam_opt(X, y, n_iter=10):
    best_acc, best_params = 0, {"n_estimators": 100, "max_depth": None}

    for _ in range(n_iter):
        n_est = random.choice([50, 100, 150, 200, 300])
        depth = random.choice([None, 5, 10, 20, 30])
        clf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, random_state=42)

        Xtr, Xval, ytr, yval = train_test_split(X, y, test_size=0.2, random_state=42)
        clf.fit(Xtr, ytr)
        acc = accuracy_score(yval, clf.predict(Xval))

        if acc > best_acc:
            best_acc, best_params = acc, {"n_estimators": n_est, "max_depth": depth}

    print(f"[CSA] Best params: {best_params}, acc={best_acc:.4f}")
    return best_params


# -------- HYBRID TRAINING --------
def hybrid_train(X, y, target_name):
    if y is None:
        return None, None, None, None, None, None

    # AIS feature selection
    selected_features = ais_feature_selection(X, y, n_iter=15, n_features=8)
    Xsel = X[:, selected_features]

    # CSA hyperparam optimization
    best_params = csa_hyperparam_opt(Xsel, y, n_iter=15)

    # Train final model
    Xtr, Xval, ytr, yval = train_test_split(Xsel, y, test_size=0.2, random_state=42)
    clf = RandomForestClassifier(**best_params, random_state=42)
    clf.fit(Xtr, ytr)

    preds = clf.predict(Xval)
    acc = accuracy_score(yval, preds)

    print(f"[RESULT] {target_name} Hybrid Accuracy: {acc:.4f}")
    print(classification_report(yval, preds))

    model_path = os.path.join(BASE_DIR, f"{target_name.lower()}_hybrid_model.pkl")
    joblib.dump((clf, selected_features, best_params), model_path)

    return clf, yval, preds, acc, model_path, selected_features


results = {}
visual_paths = {}
pred_dfs = []  # store predictions for test.csv


for target in possible_targets:
    y = train_df[target].values
    model, y_true, y_pred, acc, model_path, feats = hybrid_train(X, y, target)
    if model is None:
        continue

    results[target] = {"accuracy": acc, "features": feats, "model_path": model_path}

    # Confusion matrix heatmap
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title(f"{target} Confusion Matrix (Hybrid, acc={acc:.2f})")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    heatmap_path = os.path.join(VISUALS_DIR, f"{target.lower()}_heatmap.png")
    plt.savefig(heatmap_path)
    plt.close()
    visual_paths[f"{target}_heatmap"] = heatmap_path

    # ---- PREDICTIONS ON TEST SET ----
    if feats:  # use same selected features
        X_test_sel = test_df_proc.values[:, feats]  # aligned safely
        test_preds = model.predict(X_test_sel)
        pred_dfs.append(pd.DataFrame({f"{target}_pred": test_preds}))


# -------- ACCURACY BAR CHART --------
if results:
    plt.figure(figsize=(6, 4))
    sns.barplot(x=list(results.keys()), y=[r["accuracy"] for r in results.values()], palette="viridis")
    plt.ylim(0, 1)
    plt.ylabel("Accuracy")
    plt.title("Hybrid Model Accuracies")
    for i, v in enumerate([r["accuracy"] for r in results.values()]):
        plt.text(i, v + 0.02, f"{v:.2f}", ha="center", fontsize=10)
    bar_path = os.path.join(VISUALS_DIR, "accuracy_bar.png")
    plt.savefig(bar_path)
    plt.close()
    visual_paths["accuracy_bar"] = bar_path


# -------- SAVE PREDICTIONS --------
if pred_dfs:
    pred_df = pd.concat([test_df.reset_index(drop=True)] + pred_dfs, axis=1)
    pred_df.to_csv(OUT_PRED, index=False)
    print(f"[INFO] Saved predictions at {OUT_PRED}")


# -------- JSON REPORT --------
report = {
    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "metrics": results,
    "dataset": {
        "train_shape": train_df.shape,
        "test_shape": test_df.shape,
        "features": feature_cols,
    },
    "visuals": visual_paths,
    "predictions_file": OUT_PRED
}
with open(OUT_JSON, "w") as f:
    json.dump(report, f, indent=4)


# -------- YAML METADATA --------
metadata = {
    "build": {
        "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "models": {t: results[t]["model_path"] for t in results.keys()},
        "artifacts": {
            "processed_data": OUT_H5,
            "report": OUT_JSON,
            "predictions": OUT_PRED,
            "visuals": VISUALS_DIR
        }
    }
}
with open(OUT_YAML, "w") as f:
    yaml.dump(metadata, f)

print("[INFO] Hybrid pipeline with predictions finished successfully ✅")


[INFO] Loading datasets...
[INFO] Train shape: (14999, 9)
[INFO] Test shape : (8764, 8)
[INFO] Saving processed data to HDF5...
[INFO] Auto-detected target columns: ['category']
[AIS] Selected 8 features with acc=0.9160
[CSA] Best params: {'n_estimators': 100, 'max_depth': 30}, acc=0.9160
[RESULT] category Hybrid Accuracy: 0.9160
              precision    recall  f1-score   support

           0       0.91      0.16      0.27        62
           1       0.83      0.10      0.19       182
           2       0.92      0.83      0.87       137
           3       0.91      1.00      0.95      1489
           4       0.95      0.99      0.97       329
           5       0.90      0.99      0.94       185
           6       0.91      0.99      0.95       519
           7       0.96      0.94      0.95        97

    accuracy                           0.92      3000
   macro avg       0.91      0.75      0.76      3000
weighted avg       0.91      0.92      0.89      3000

[INFO] Saved pred


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=list(results.keys()), y=[r["accuracy"] for r in results.values()], palette="viridis")
