In [2]:
# ad_insight_pipeline.py

import os
import json
import yaml
import joblib
import h5py
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns

# -------- CONFIG --------
# Root folder for all inputs and outputs. Set the AD_INSIGHT_BASE_DIR
# environment variable to run the pipeline somewhere other than the default
# location; an unset or empty variable falls back to the original path.
BASE_DIR = (
    os.environ.get("AD_INSIGHT_BASE_DIR")
    or r"C:\Users\NXTWAVE\Downloads\Ad Insight"
)

# Input CSV files, read from <BASE_DIR>/archive.
TRAIN_CSV = os.path.join(BASE_DIR, "archive", "train.csv")
TEST_CSV  = os.path.join(BASE_DIR, "archive", "test.csv")

# Output artifacts, all written directly under BASE_DIR.
OUT_H5    = os.path.join(BASE_DIR, "processed_ads.h5")
OUT_ENG   = os.path.join(BASE_DIR, "engagement_model.pkl")
OUT_MOOD  = os.path.join(BASE_DIR, "mood_model.pkl")
OUT_REC   = os.path.join(BASE_DIR, "recall_model.pkl")
OUT_JSON  = os.path.join(BASE_DIR, "ads_report.json")
OUT_YAML  = os.path.join(BASE_DIR, "build_metadata.yaml")

# Folder for the generated plots; created up front so savefig cannot fail on
# a missing directory.
VISUALS_DIR = os.path.join(BASE_DIR, "visuals")
os.makedirs(VISUALS_DIR, exist_ok=True)


# -------- LOAD DATA --------
print("[INFO] Loading datasets...")

# Read both CSVs in one pass: train first, then test.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# Report (rows, columns) of each frame as a quick sanity check.
print("[INFO] Train shape:", train_df.shape)
print("[INFO] Test shape :", test_df.shape)


# -------- BASIC PREPROCESS --------
def preprocess(df, encoders=None):
    """Return a numeric copy of *df* with categoricals encoded and NaNs filled.

    Every object-dtype column has its missing values replaced by "UNK" and is
    then integer-encoded. All remaining missing values are filled with 0.
    The input frame is not modified.

    Args:
        df: DataFrame to convert.
        encoders: Optional dict mapping column name to a fitted LabelEncoder.
            When given, a column that already has an entry is encoded with
            that encoder (values it has never seen become -1), and a column
            without an entry gets a newly fitted encoder stored in the dict.
            When omitted, each column gets its own fresh encoder, as before.

    Returns:
        The converted copy of *df*.
    """
    df = df.copy()
    cat_cols = df.select_dtypes(include=["object"]).columns
    for c in cat_cols:
        # Cast to str so a column mixing strings with other Python objects
        # can still be sorted by the encoder.
        values = df[c].fillna("UNK").astype(str)
        le = encoders.get(c) if encoders is not None else None
        if le is None:
            le = LabelEncoder()
            df[c] = le.fit_transform(values)
            if encoders is not None:
                encoders[c] = le
        else:
            # LabelEncoder.transform raises on unseen labels, so map through
            # the learned classes and send unknown values to -1 instead.
            lookup = {str(label): code for code, label in enumerate(le.classes_)}
            df[c] = values.map(lookup).fillna(-1).astype("int64")
    return df.fillna(0)

# Fit the encoders on the training data and reuse them for the test data so
# a given category maps to the same integer in both frames.
label_encoders = {}
train_df = preprocess(train_df, label_encoders)
test_df = preprocess(test_df, label_encoders)

# -------- SAVE PROCESSED TO H5 --------
print("[INFO] Saving processed data to HDF5...")
with h5py.File(OUT_H5, "w") as h5:
    h5.create_dataset("train", data=train_df.values)
    h5.create_dataset("test", data=test_df.values)
    # The datasets hold bare arrays, so the column names are kept alongside
    # them as JSON-encoded file attributes.
    h5.attrs["train_columns"] = json.dumps(list(train_df.columns))
    # Train and test need not share the same columns (the captured run shows
    # 9 vs 8), so the test names are stored separately.
    h5.attrs["test_columns"] = json.dumps(list(test_df.columns))


# -------- SPLIT FEATURES & TARGETS --------
target_cols = ["engagement", "mood", "recall"]
# Every column that is not a target is used as a model input.
feature_cols = [col for col in train_df.columns if col not in target_cols]

X = train_df[feature_cols].values

# One label vector per target, in target_cols order; None marks a target
# column that is absent from the training frame.
y_eng, y_mood, y_rec = (
    train_df[target].values if target in train_df else None
    for target in target_cols
)

def split_data(X, y):
    """Split features and labels 80/20 into train and validation parts.

    Returns the result of train_test_split (X_train, X_val, y_train, y_val)
    using a fixed seed of 42, or four Nones when *y* is None.
    """
    if y is not None:
        return train_test_split(X, y, test_size=0.2, random_state=42)
    # No labels for this target: hand back placeholders of the same arity.
    return (None,) * 4

# Split the shared feature matrix once per target, in the order
# engagement, mood, recall; each split yields (X_tr, X_val, y_tr, y_val).
(
    (X_eng_tr, X_eng_val, y_eng_tr, y_eng_val),
    (X_mood_tr, X_mood_val, y_mood_tr, y_mood_val),
    (X_rec_tr, X_rec_val, y_rec_tr, y_rec_val),
) = (split_data(X, labels) for labels in (y_eng, y_mood, y_rec))


# -------- TRAIN MODELS --------
def train_model(Xtr, ytr, Xval, yval, out_path, name):
    """Fit a random forest for one target, evaluate it and save it to disk.

    Args:
        Xtr, ytr: Training features and labels.
        Xval, yval: Validation features and labels.
        out_path: File the fitted classifier is dumped to with joblib.
        name: Display name used in the log messages.

    Returns:
        (classifier, yval, validation predictions, validation accuracy), or
        four Nones when either label array is None.
    """
    labels_missing = ytr is None or yval is None
    if labels_missing:
        print(f"[WARN] No labels found for {name}, skipping.")
        # Same arity as the success path so callers can always unpack four values.
        return (None,) * 4

    print(f"[INFO] Training {name} model...")
    forest = RandomForestClassifier(n_estimators=200, random_state=42).fit(Xtr, ytr)

    val_preds = forest.predict(Xval)
    val_acc = accuracy_score(yval, val_preds)
    print(f"[RESULT] {name} Accuracy: {val_acc:.4f}")
    print(classification_report(yval, val_preds))

    joblib.dump(forest, out_path)
    return forest, yval, val_preds, val_acc


# Train one classifier per target. Each call returns
# (model, validation labels, validation predictions, accuracy), or four Nones
# when that target was skipped.
eng_model, y_eng_val, eng_preds, acc_eng = train_model(
    X_eng_tr, y_eng_tr, X_eng_val, y_eng_val,
    out_path=OUT_ENG, name="Engagement",
)
mood_model, y_mood_val, mood_preds, acc_mood = train_model(
    X_mood_tr, y_mood_tr, X_mood_val, y_mood_val,
    out_path=OUT_MOOD, name="Mood",
)
rec_model, y_rec_val, rec_preds, acc_rec = train_model(
    X_rec_tr, y_rec_tr, X_rec_val, y_rec_val,
    out_path=OUT_REC, name="Recall",
)


# -------- GENERATE REPORT --------
print("[INFO] Generating JSON report...")

report = {
    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    # Accuracies stay None for skipped targets; the rest are cast to the
    # built-in float type before serialisation.
    "metrics": {
        key: (None if score is None else float(score))
        for key, score in (
            ("engagement_acc", acc_eng),
            ("mood_acc", acc_mood),
            ("recall_acc", acc_rec),
        )
    },
    "dataset": {
        "train_shape": train_df.shape,
        "test_shape": test_df.shape,
        "features": feature_cols,
    },
}

# Serialise first, then write the finished text in a single call.
with open(OUT_JSON, "w") as f:
    f.write(json.dumps(report, indent=4))


# -------- SAVE BUILD METADATA --------
print("[INFO] Writing YAML metadata...")
metadata = {
    "build": {
        "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        # A model path is recorded only when that model was actually trained
        # and dumped. A skipped target is written as null so the metadata
        # never points at a file this run did not produce.
        "models": {
            "engagement_model": OUT_ENG if eng_model is not None else None,
            "mood_model": OUT_MOOD if mood_model is not None else None,
            "recall_model": OUT_REC if rec_model is not None else None,
        },
        "artifacts": {
            "processed_data": OUT_H5,
            "report": OUT_JSON,
        },
    }
}

with open(OUT_YAML, "w") as f:
    yaml.dump(metadata, f)


# -------- VISUALIZATIONS --------
print("[INFO] Generating visuals...")

# Accuracy per target, filled in only for targets that were evaluated.
acc_scores = {}
# Display name -> (validation labels, validation predictions).
pred_map = {
    "Engagement": (y_eng_val, eng_preds),
    "Mood": (y_mood_val, mood_preds),
    "Recall": (y_rec_val, rec_preds),
}

for name, (y_true, y_pred) in pred_map.items():
    # A target whose training was skipped has nothing to plot.
    if y_true is None or y_pred is None:
        continue
    acc = accuracy_score(y_true, y_pred)
    acc_scores[name] = acc

    # One confusion-matrix heatmap per evaluated target.
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title(f"{name} Confusion Matrix (Accuracy={acc:.2f})")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    heatmap_path = os.path.join(VISUALS_DIR, f"{name.lower()}_heatmap.png")
    plt.savefig(heatmap_path)
    plt.close()
    print(f"[INFO] Saved heatmap for {name} at {heatmap_path}")

if acc_scores:
    # Summary bar chart comparing the accuracy of every evaluated target.
    plt.figure(figsize=(6, 4))
    sns.barplot(x=list(acc_scores.keys()), y=list(acc_scores.values()), palette="viridis")
    plt.ylim(0, 1)
    plt.ylabel("Accuracy")
    plt.title("Model Accuracies")
    # Print each accuracy value just above its bar.
    for i, v in enumerate(acc_scores.values()):
        plt.text(i, v + 0.02, f"{v:.2f}", ha="center", fontsize=10)
    bar_path = os.path.join(VISUALS_DIR, "accuracy_bar.png")
    plt.savefig(bar_path)
    plt.close()
    print(f"[INFO] Saved accuracy bar chart at {bar_path}")
else:
    # Without this message a run that trained nothing would be
    # indistinguishable from a healthy one apart from the earlier per-model
    # warnings.
    print("[WARN] No model was trained or evaluated, so no visuals were generated. "
          "Check that the training data contains the expected target columns.")

print("[INFO] Pipeline finished successfully ✅")


[INFO] Loading datasets...
[INFO] Train shape: (14999, 9)
[INFO] Test shape : (8764, 8)
[INFO] Saving processed data to HDF5...
[WARN] No labels found for Engagement, skipping.
[WARN] No labels found for Mood, skipping.
[WARN] No labels found for Recall, skipping.
[INFO] Generating JSON report...
[INFO] Writing YAML metadata...
[INFO] Generating visuals...
[INFO] Pipeline finished successfully ✅
