In [1]:
# ==========================================================
# EduVision — Prediction Pipeline + All Graphs & Heatmap
# Trains on MOOC embeddings to predict Topic ArgMax labels.
# Saves: model, predictions, heatmap, accuracy curve, confusion matrix, report
# Output dir: C:\Users\sagni\Downloads\Edu Vision
# ==========================================================
from pathlib import Path
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, log_loss
from joblib import dump

# ----------------------------
# Locations (adjust for your machine)
# ----------------------------
ROOT = Path(r"C:\Users\sagni\Downloads\Edu Vision")
OUT = ROOT  # artifacts are written directly into the project folder
OUT.mkdir(parents=True, exist_ok=True)

# Candidate input files, ordered from highest to lowest dimensionality.
_DIMS = (300, 200, 100, 50)
_EMB_DIR = r"C:\Users\sagni\Downloads\Edu Vision\archive\Word Embeddings"
_TOPIC_DIR = r"C:\Users\sagni\Downloads\Edu Vision\archive\Topic Vectors"
EMB_CANDIDATES = [rf"{_EMB_DIR}\MOOC_{dim}d.csv" for dim in _DIMS]
TOPIC_CANDIDATES = [rf"{_TOPIC_DIR}\DT{dim}.csv" for dim in _DIMS]

# ----------------------------
# Output artifact locations (all inside OUT)
# ----------------------------
HEATMAP_PNG = OUT.joinpath("eduvision_embeddings_corr_heatmap.png")
ACC_PNG = OUT.joinpath("eduvision_accuracy_over_epochs.png")
ACC_CSV = OUT.joinpath("eduvision_accuracy_over_epochs.csv")
CM_PNG = OUT.joinpath("eduvision_confusion_matrix.png")
REPORT_TXT = OUT.joinpath("eduvision_classification_report.txt")
PREDICTIONS_CSV = OUT.joinpath("eduvision_predictions.csv")
MODEL_PKL = OUT.joinpath("eduvision_pipeline.pkl")
META_JSON = OUT.joinpath("eduvision_model_meta.json")

# ----------------------------
# Run configuration
# ----------------------------
TEST_SIZE: float = 0.2    # fraction of rows held out for testing
RANDOM_STATE: int = 42    # seed shared by the split and the classifier
EPOCHS: int = 12          # number of training passes recorded in the accuracy curve
TOPK_SAVE: int = 5        # save top-k predicted classes per row
PRINT_HEAD: int = 5       # rows shown when previewing a loaded frame

# ----------------------------
# Helpers
# ----------------------------
def first_existing(paths):
    """Return the first entry of *paths* that exists on disk, as a ``Path``.

    Entries are checked in the order given. Returns ``None`` when no entry
    exists (including when *paths* is empty).
    """
    candidates = (Path(entry) for entry in paths)
    return next((found for found in candidates if found.exists()), None)

def read_csv_smart(path: Path) -> pd.DataFrame:
    """Read a CSV with the fast C parser, retrying with the Python parser.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed frame.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist); this
            is raised directly, without a retry.
        ValueError: If both parsers reject the content.
    """
    try:
        return pd.read_csv(path, low_memory=False)
    except ValueError:
        # pandas' ParserError and UnicodeDecodeError are ValueError subclasses,
        # so this retries on content problems but not on I/O errors.
        # `low_memory` is a C-parser-only option: passing it together with
        # engine="python" makes pandas raise instead of parsing, so it is
        # deliberately omitted from the fallback call.
        return pd.read_csv(path, engine="python")

def numeric_columns(df: pd.DataFrame):
    """Return the labels of the numeric-dtype columns of *df*, in column order."""
    is_numeric = pd.api.types.is_numeric_dtype
    selected = []
    for label in df.columns:
        if is_numeric(df[label]):
            selected.append(label)
    return selected

def argmax_labels_from_topics(topics_df: pd.DataFrame) -> tuple[np.ndarray, list[str]]:
    """Derive one integer label per row from the largest numeric column.

    Returns:
        ``(labels, columns)`` where ``labels[i]`` is the position (within
        ``columns``) of the largest value in row ``i``, and ``columns`` is the
        list of numeric column labels that were compared.

    Raises:
        ValueError: If *topics_df* has no numeric columns.
    """
    # NOTE(review): every numeric column takes part in the argmax, so a numeric
    # ID/index column in the CSV would be treated as a topic weight - confirm
    # the topic files contain no such column.
    topic_cols = numeric_columns(topics_df)
    if len(topic_cols) == 0:
        raise ValueError("No numeric columns found in topic vectors CSV.")
    weights = topics_df[topic_cols].astype(float).to_numpy()
    labels = weights.argmax(axis=1).astype(int)
    return labels, topic_cols

def summarize(name, df):
    """Print the shape of *df* under the label *name*, then its first PRINT_HEAD rows."""
    print(f"[INFO] {name}: shape={df.shape}")
    # Widen the console rendering only while the preview is printed.
    display_opts = ("display.width", 140, "display.max_columns", 16)
    with pd.option_context(*display_opts):
        preview = df.head(PRINT_HEAD)
        print(preview)

# ----------------------------
# 1) Load data
# ----------------------------
# Resolve both inputs first; stop with a clear message if either is missing.
emb_path = first_existing(EMB_CANDIDATES)
topic_path = first_existing(TOPIC_CANDIDATES)
for _resolved, _message in (
    (emb_path, "[ERROR] No embeddings CSV found. Check paths."),
    (topic_path, "[ERROR] No topic vectors CSV found. Check paths."),
):
    if _resolved is None:
        raise SystemExit(_message)

print(f"[INFO] Embeddings:   {emb_path}")
print(f"[INFO] Topic vectors: {topic_path}")

emb_df, topic_df = read_csv_smart(emb_path), read_csv_smart(topic_path)
for _label, _frame in (("Embeddings", emb_df), ("Topics", topic_df)):
    summarize(_label, _frame)

# Rows are paired purely by position, so both frames are cut to the shorter
# length. If the files share an ID column, merge on that ID instead.
n_emb_rows, n_topic_rows = len(emb_df), len(topic_df)
n = min(n_emb_rows, n_topic_rows)
if n_emb_rows != n_topic_rows:
    print(f"[WARN] Row mismatch: embeddings={n_emb_rows}, topics={n_topic_rows}. Truncating to {n}.")
emb_df = emb_df.head(n).reset_index(drop=True)
topic_df = topic_df.head(n).reset_index(drop=True)

# ----------------------------
# 2) Prepare features X and labels y
# ----------------------------
# Features: every numeric embedding column. Labels: per-row argmax over the
# numeric topic columns.
emb_cols = numeric_columns(emb_df)
if len(emb_cols) < 5:
    raise SystemExit("[ERROR] Not enough numeric columns in embeddings to train.")

feature_frame = emb_df[emb_cols].astype(float)
X_all = feature_frame.to_numpy()
y_all, topic_num_cols = argmax_labels_from_topics(topic_df)

classes_all = np.unique(y_all)
if classes_all.size < 2:
    raise SystemExit("[ERROR] Only one class found from topics. Need >=2 to train.")

n_samples, n_features = X_all.shape
print(f"[INFO] Samples: {n_samples} | Features: {n_features} | Classes: {len(classes_all)}")

# ----------------------------
# 3) Correlation Heatmap (embeddings)
# ----------------------------
# Pairwise correlation between embedding features, drawn as an image with one
# tick per feature on each axis.
corr = pd.DataFrame(X_all, columns=emb_cols).corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(10, 8))
im = ax.imshow(corr.values, aspect='auto')
ax.set_xticks(range(corr.shape[1]))
ax.set_xticklabels(corr.columns, rotation=90, fontsize=7)
ax.set_yticks(range(corr.shape[0]))
ax.set_yticklabels(corr.index, fontsize=7)
ax.set_title("EduVision: Embeddings Feature Correlation Heatmap")
fig.colorbar(im, ax=ax)
fig.tight_layout()
fig.savefig(HEATMAP_PNG, dpi=220)
plt.close(fig)
print(f"[SAVED] {HEATMAP_PNG}")

# ----------------------------
# 4) Train/Test split (try stratified, else fallback)
# ----------------------------
# Stratification raises ValueError when it cannot be satisfied (for example a
# class with too few members); in that case fall back to a plain random split.
split_options = dict(test_size=TEST_SIZE, random_state=RANDOM_STATE)
try:
    split_parts = train_test_split(X_all, y_all, stratify=y_all, **split_options)
except ValueError as e:
    print(f"[WARN] Stratified split failed ({e}); using non-stratified split.")
    split_parts = train_test_split(X_all, y_all, stratify=None, **split_options)
X_train, X_test, y_train, y_test = split_parts

# ----------------------------
# 5) Model (SGDClassifier, one-vs-rest logistic regression) + accuracy over epochs
# ----------------------------
# SGDClassifier handles multi-class targets one-vs-rest, so loss="log_loss"
# trains one logistic model per class rather than a single softmax model.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", SGDClassifier(
        loss="log_loss",
        alpha=1e-4,
        learning_rate="optimal",
        random_state=RANDOM_STATE,
    )),
])
scaler = pipe.named_steps["scaler"]
clf = pipe.named_steps["clf"]

# The scaler only depends on the training data, so fit it once and reuse the
# scaled arrays for every epoch. Both steps are fitted in place, so `pipe`
# itself is usable for prediction afterwards.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
train_classes = np.unique(y_train)
epoch_rng = np.random.default_rng(RANDOM_STATE)

train_acc, test_acc, test_logloss = [], [], []
for ep in range(EPOCHS):
    # One pass per epoch via partial_fit: unlike calling fit() repeatedly with
    # warm_start, partial_fit keeps advancing the "optimal" learning-rate
    # schedule instead of restarting it at every epoch.
    order = epoch_rng.permutation(X_train_scaled.shape[0])
    clf.partial_fit(X_train_scaled[order], y_train[order], classes=train_classes)

    train_acc.append(accuracy_score(y_train, clf.predict(X_train_scaled)))
    test_acc.append(accuracy_score(y_test, clf.predict(X_test_scaled)))

    # Log loss is undefined when the test labels include a class the model
    # never saw, or when the probabilities are not finite; record NaN then.
    try:
        proba = clf.predict_proba(X_test_scaled)
        test_logloss.append(log_loss(y_test, proba, labels=clf.classes_))
    except ValueError as err:
        print(f"[WARN] Epoch {ep + 1}: test log loss unavailable ({err}).")
        test_logloss.append(np.nan)

# Save accuracy curve PNG + CSV
epoch_numbers = list(range(1, EPOCHS + 1))
plt.figure(figsize=(8, 4.8))
plt.plot(epoch_numbers, train_acc, marker='o', label="Train Acc")
plt.plot(epoch_numbers, test_acc, marker='s', label="Test Acc")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("EduVision: Accuracy over Epochs (Embeddings → Topic ArgMax)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig(ACC_PNG, dpi=220)
plt.close()
print(f"[SAVED] {ACC_PNG}")

pd.DataFrame({
    "epoch": epoch_numbers,
    "train_accuracy": train_acc,
    "test_accuracy": test_acc,
    "test_logloss": test_logloss,
}).to_csv(ACC_CSV, index=False)
print(f"[SAVED] {ACC_CSV}")

# ----------------------------
# 6) Final predictions on Test, Confusion Matrix, Report
# ----------------------------
y_pred = pipe.predict(X_test)

# Restrict the matrix and the report to labels that actually occur in either
# the true or the predicted test labels.
labels_present = np.unique(np.concatenate([y_test, y_pred]))
cm = confusion_matrix(y_test, y_pred, labels=labels_present)

n_labels = len(labels_present)
fig_width = max(7.5, min(16, 0.5 * n_labels + 4))  # grow with label count, within 7.5 to 16
fig, ax = plt.subplots(figsize=(fig_width, 6.5))
im = ax.imshow(cm, aspect='auto')
ax.set_title("EduVision: Confusion Matrix (Topic ArgMax)")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
fig.colorbar(im, ax=ax)
ax.set_xticks(range(n_labels))
ax.set_xticklabels(labels_present, rotation=45, ha='right')
ax.set_yticks(range(n_labels))
ax.set_yticklabels(labels_present)
fig.tight_layout()
fig.savefig(CM_PNG, dpi=220)
plt.close(fig)
print(f"[SAVED] {CM_PNG}")

topic_names = [f"topic_{label}" for label in labels_present]
report = classification_report(
    y_test,
    y_pred,
    labels=labels_present,
    target_names=topic_names,
    digits=4,
    zero_division=0,
)
report_text = "".join([
    "=== EduVision: Embeddings -> Topic ArgMax ===\n\n",
    f"Embeddings: {emb_path}\nTopics: {topic_path}\n\n",
    report + "\n",
])
REPORT_TXT.write_text(report_text, encoding="utf-8")
print(f"[SAVED] {REPORT_TXT}")

# ----------------------------
# 7) Save per-sample predictions CSV (with top-k probs)
# ----------------------------
# Only the predict_proba call is guarded: if it is unavailable the CSV holds
# labels only. Keeping the table construction outside the try means a failure
# there can no longer leave half-built rows followed by a duplicate fallback.
try:
    proba = pipe.predict_proba(X_test)
except AttributeError as err:
    print(f"[WARN] predict_proba unavailable ({err}); saving labels only.")
    proba = None
have_proba = proba is not None

true_labels = np.asarray(y_test)
pred_labels = np.asarray(y_pred)
n_test = len(true_labels)

# row_index is the position within the test split, not the original CSV row.
pred_table = pd.DataFrame({
    "row_index": np.arange(n_test),
    "true_label": true_labels.astype(int),
    "pred_label": pred_labels.astype(int),
    "correct": true_labels == pred_labels,
})

if have_proba:
    cls = pipe.named_steps["clf"].classes_
    topk = min(TOPK_SAVE, proba.shape[1])
    # Column positions of the k largest probabilities per row, best first.
    ranked = np.argsort(proba, axis=1)[:, ::-1][:, :topk]
    row_positions = np.arange(n_test)
    for rank in range(topk):
        col = ranked[:, rank]
        pred_table[f"top{rank + 1}_class"] = cls[col].astype(int)
        pred_table[f"top{rank + 1}_proba"] = proba[row_positions, col]

pred_table.to_csv(PREDICTIONS_CSV, index=False)
print(f"[SAVED] {PREDICTIONS_CSV} (proba={have_proba})")

# ----------------------------
# 8) Save model + metadata
# ----------------------------
dump(pipe, MODEL_PKL)
print(f"[SAVED] {MODEL_PKL}")

# JSON has no NaN, so a missing or NaN final log loss is stored as null.
final_logloss = None
if test_logloss and not np.isnan(test_logloss[-1]):
    final_logloss = float(test_logloss[-1])

final_metrics = {
    "final_train_accuracy": float(train_acc[-1]),
    "final_test_accuracy": float(test_acc[-1]),
    "final_test_logloss": final_logloss,
}
n_rows, n_cols = X_all.shape
meta = {
    "embeddings_file": str(emb_path),
    "topics_file": str(topic_path),
    "n_samples": int(n_rows),
    "n_features": int(n_cols),
    "n_classes": int(len(classes_all)),
    "test_size": TEST_SIZE,
    "random_state": RANDOM_STATE,
    "epochs": EPOCHS,
    "topk_saved": TOPK_SAVE,
    "have_predict_proba": bool(have_proba),
    "metrics": final_metrics,
}
META_JSON.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"[SAVED] {META_JSON}")

print("\n[DONE] All artifacts saved in:", OUT)


[INFO] Embeddings:   C:\Users\sagni\Downloads\Edu Vision\archive\Word Embeddings\MOOC_300d.csv
[INFO] Topic vectors: C:\Users\sagni\Downloads\Edu Vision\archive\Topic Vectors\DT300.csv
[INFO] Embeddings: shape=(68175, 301)
  Unnamed: 0         0         1         2         3         4         5         6  ...       292       293       294       295       296  \
0      going  0.282224  0.216655 -0.836659 -0.063338 -0.208164  0.144238  1.155843  ...  0.474711 -0.689789  0.030715  1.123113  0.119392   
1       just -0.112434  0.631439 -0.081427  0.288760  0.677108 -0.203446  0.820269  ...  0.329963 -0.402903  0.168842  0.380508 -0.987448   
2       like  0.631965 -0.453183 -0.256713  0.227004 -0.762930 -0.634983  0.233275  ...  0.058053  0.674091  0.925095  0.914846  0.048320   
3       time -1.363892 -0.254021 -0.689795 -0.419093  0.412053 -0.395511  0.212262  ...  0.266910 -0.145627  0.734603 -0.001361 -0.661998   
4      thing -0.381017  0.863210  0.528545  0.508915  0.237749  0.277165

  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


[SAVED] C:\Users\sagni\Downloads\Edu Vision\eduvision_accuracy_over_epochs.png
[SAVED] C:\Users\sagni\Downloads\Edu Vision\eduvision_accuracy_over_epochs.csv
[SAVED] C:\Users\sagni\Downloads\Edu Vision\eduvision_confusion_matrix.png
[SAVED] C:\Users\sagni\Downloads\Edu Vision\eduvision_classification_report.txt
[SAVED] C:\Users\sagni\Downloads\Edu Vision\eduvision_predictions.csv (proba=True)
[SAVED] C:\Users\sagni\Downloads\Edu Vision\eduvision_pipeline.pkl
[SAVED] C:\Users\sagni\Downloads\Edu Vision\eduvision_model_meta.json

[DONE] All artifacts saved in: C:\Users\sagni\Downloads\Edu Vision
