In [1]:
import os
import json
import yaml
import joblib
import h5py
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns

# -------- CONFIG --------
BASE_DIR = r"C:\Users\NXTWAVE\Downloads\Ad Insight"
TRAIN_CSV = os.path.join(BASE_DIR, "archive", "train.csv")
TEST_CSV  = os.path.join(BASE_DIR, "archive", "test.csv")

OUT_H5    = os.path.join(BASE_DIR, "processed_ads.h5")
OUT_JSON  = os.path.join(BASE_DIR, "ads_report.json")
OUT_YAML  = os.path.join(BASE_DIR, "build_metadata.yaml")
VISUALS_DIR = os.path.join(BASE_DIR, "visuals")
os.makedirs(VISUALS_DIR, exist_ok=True)


# -------- LOAD DATA --------
print("[INFO] Loading datasets...")
train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)

print("[INFO] Train shape:", train_df.shape)
print("[INFO] Test shape :", test_df.shape)


# -------- BASIC PREPROCESS --------
def preprocess(df):
    df = df.copy()
    cat_cols = df.select_dtypes(include=["object"]).columns
    for c in cat_cols:
        df[c] = df[c].fillna("UNK")
        le = LabelEncoder()
        df[c] = le.fit_transform(df[c])
    return df.fillna(0)

train_df = preprocess(train_df)
test_df = preprocess(test_df)

# -------- SAVE PROCESSED TO H5 --------
print("[INFO] Saving processed data to HDF5...")
with h5py.File(OUT_H5, "w") as h5:
    h5.create_dataset("train", data=train_df.values)
    h5.create_dataset("test", data=test_df.values)
    h5.attrs["train_columns"] = json.dumps(list(train_df.columns))


# -------- AUTO-DETECT TARGETS --------
# Assume label columns are categorical with few unique values
possible_targets = [
    col for col in train_df.columns 
    if train_df[col].nunique() < 20 and col not in ["id", "ID"]
]

feature_cols = [c for c in train_df.columns if c not in possible_targets]
X = train_df[feature_cols].values

print(f"[INFO] Auto-detected target columns: {possible_targets}")


# -------- TRAIN MODELS --------
def train_model(X, y, name):
    if y is None:
        print(f"[WARN] No labels for {name}, skipping.")
        return None, None, None, None, None

    Xtr, Xval, ytr, yval = train_test_split(X, y, test_size=0.2, random_state=42)

    print(f"[INFO] Training {name} model...")
    clf = RandomForestClassifier(n_estimators=200, random_state=42)
    clf.fit(Xtr, ytr)

    preds = clf.predict(Xval)
    acc = accuracy_score(yval, preds)
    print(f"[RESULT] {name} Accuracy: {acc:.4f}")
    print(classification_report(yval, preds))

    out_path = os.path.join(BASE_DIR, f"{name.lower()}_model.pkl")
    joblib.dump(clf, out_path)

    return clf, yval, preds, acc, out_path


results = {}
visual_paths = {}

for target in possible_targets:
    y = train_df[target].values
    model, y_true, y_pred, acc, model_path = train_model(X, y, target)
    if model is None:
        continue

    results[target] = acc

    # Confusion matrix heatmap
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title(f"{target} Confusion Matrix (Accuracy={acc:.2f})")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    heatmap_path = os.path.join(VISUALS_DIR, f"{target.lower()}_heatmap.png")
    plt.savefig(heatmap_path)
    plt.close()
    visual_paths[f"{target}_heatmap"] = heatmap_path
    print(f"[INFO] Saved heatmap for {target} at {heatmap_path}")


# -------- ACCURACY BAR CHART --------
if results:
    plt.figure(figsize=(6, 4))
    sns.barplot(x=list(results.keys()), y=list(results.values()), palette="viridis")
    plt.ylim(0, 1)
    plt.ylabel("Accuracy")
    plt.title("Model Accuracies")
    for i, v in enumerate(results.values()):
        plt.text(i, v + 0.02, f"{v:.2f}", ha="center", fontsize=10)
    bar_path = os.path.join(VISUALS_DIR, "accuracy_bar.png")
    plt.savefig(bar_path)
    plt.close()
    visual_paths["accuracy_bar"] = bar_path
    print(f"[INFO] Saved accuracy bar chart at {bar_path}")


# -------- GENERATE REPORT --------
print("[INFO] Generating JSON report...")
report = {
    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "metrics": results,
    "dataset": {
        "train_shape": train_df.shape,
        "test_shape": test_df.shape,
        "features": feature_cols,
    },
    "visuals": visual_paths
}

with open(OUT_JSON, "w") as f:
    json.dump(report, f, indent=4)


# -------- SAVE BUILD METADATA --------
print("[INFO] Writing YAML metadata...")
metadata = {
    "build": {
        "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "models": {t: os.path.join(BASE_DIR, f"{t.lower()}_model.pkl") for t in results.keys()},
        "artifacts": {
            "processed_data": OUT_H5,
            "report": OUT_JSON,
            "visuals": VISUALS_DIR
        }
    }
}

with open(OUT_YAML, "w") as f:
    yaml.dump(metadata, f)

print("[INFO] Pipeline finished successfully ✅")


[INFO] Loading datasets...
[INFO] Train shape: (14999, 9)
[INFO] Test shape : (8764, 8)
[INFO] Saving processed data to HDF5...
[INFO] Auto-detected target columns: ['category']
[INFO] Training category model...
[RESULT] category Accuracy: 0.9140
              precision    recall  f1-score   support

           0       1.00      0.13      0.23        62
           1       0.88      0.08      0.15       182
           2       0.90      0.80      0.85       137
           3       0.91      1.00      0.96      1489
           4       0.94      0.99      0.96       329
           5       0.90      0.99      0.94       185
           6       0.90      1.00      0.95       519
           7       0.95      0.95      0.95        97

    accuracy                           0.91      3000
   macro avg       0.92      0.74      0.75      3000
weighted avg       0.91      0.91      0.89      3000

[INFO] Saved heatmap for category at C:\Users\NXTWAVE\Downloads\Ad Insight\visuals\category_heatmap.pn


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=list(results.keys()), y=list(results.values()), palette="viridis")
