In [1]:
# ==========================================================
# EduVision — Accuracy Graph (epochs) + Feature Heatmap
# Also writes a confusion-matrix plot and a classification report.
# Uses MOOC embeddings as X and Topic Vectors (argmax) as y
# Saves plots/CSVs/report to: C:\Users\sagni\Downloads\Edu Vision
# ==========================================================
from pathlib import Path
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ----------------------------
# Paths (edit if needed)
# ----------------------------
ROOT = Path(r"C:\Users\sagni\Downloads\Edu Vision")
OUT = ROOT
OUT.mkdir(parents=True, exist_ok=True)

# Every input location is derived from ROOT, so editing ROOT above is the only
# change needed to point the script at a relocated copy of the dataset.
ARCHIVE_DIR = ROOT / "archive"
EMB_DIR = ARCHIVE_DIR / "Word Embeddings"
TOPIC_DIR = ARCHIVE_DIR / "Topic Vectors"

# Dimensionalities to look for, in order of preference (largest first).
PREFERRED_DIMS = (300, 200, 100, 50)

# Embeddings (prefer highest dim that exists)
EMB_CANDIDATES = [str(EMB_DIR / f"MOOC_{dim}d.csv") for dim in PREFERRED_DIMS]
# Topic vectors (prefer richest)
TOPIC_CANDIDATES = [str(TOPIC_DIR / f"DT{dim}.csv") for dim in PREFERRED_DIMS]

# Optional category files (not required for this script, we rely on topics ArgMax)
FINE_CAT = str(TOPIC_DIR / "Fine_grained_Categories.csv")
GEN_CAT = str(TOPIC_DIR / "General_Level_Categories.csv")

# ----------------------------
# Output artifact paths
# ----------------------------
HEATMAP_PNG = OUT / "eduvision_embeddings_corr_heatmap.png"
ACC_PNG     = OUT / "eduvision_accuracy_over_epochs.png"
ACC_CSV     = OUT / "eduvision_accuracy_over_epochs.csv"
CM_PNG      = OUT / "eduvision_confusion_matrix.png"
REPORT_TXT  = OUT / "eduvision_classification_report.txt"

# ----------------------------
# Config
# ----------------------------
TEST_SIZE     = 0.2   # fraction of rows held out for testing
RANDOM_STATE  = 42    # seed shared by the split and the classifier
EPOCHS        = 12    # number of training passes recorded in the accuracy curve
MIN_CLASSES   = 2   # need >=2 classes after processing
PRINT_HEAD    = 5     # rows shown when previewing a loaded table

# ----------------------------
# Helpers
# ----------------------------
def first_existing(paths):
    """Return the first entry of *paths* that exists on disk, as a ``Path``.

    Entries are checked in the order given. ``None`` is returned when no
    entry exists (including when *paths* is empty).
    """
    candidates = (Path(entry) for entry in paths)
    return next((found for found in candidates if found.exists()), None)

def read_csv_smart(path: Path) -> pd.DataFrame:
    """Read a CSV with the default (C) parser, retrying with the Python parser.

    The first attempt disables chunked type inference (``low_memory=False``).
    If that attempt fails with a parse error, the file is read again with
    ``engine="python"``.

    Errors that a different parser cannot fix (for example a missing file)
    are not retried and propagate unchanged from the first attempt.
    """
    try:
        return pd.read_csv(path, low_memory=False)
    except pd.errors.ParserError:
        # `low_memory` is only understood by the C parser; pandas raises
        # ValueError if it is combined with engine="python", so it must not
        # be forwarded to the fallback call.
        return pd.read_csv(path, engine="python")

def numeric_columns(df: pd.DataFrame):
    """Return the labels of the columns of *df* that pandas reports as numeric.

    Column order is preserved.
    """
    selected = []
    for label in df.columns:
        if pd.api.types.is_numeric_dtype(df[label]):
            selected.append(label)
    return selected

def softmax_argmax_labels(df: pd.DataFrame) -> "tuple[np.ndarray, list]":
    """
    Use the argmax across numeric topic columns to derive class labels.

    Column naming can be arbitrary; every numeric column is treated as a topic
    score. Despite the function name, no softmax is applied: the raw scores
    are compared directly.

    Returns:
        A ``(y, num_cols)`` pair. ``y`` is an integer array with one label per
        row, each label being the position (within ``num_cols``) of the row's
        largest score. ``num_cols`` is the list of numeric column labels used.

    Raises:
        ValueError: if *df* has no numeric columns.
    """
    num_cols = numeric_columns(df)
    if not num_cols:
        raise ValueError("No numeric columns found in topic vectors CSV.")
    # NOTE(review): any numeric non-topic column (e.g. a saved row index) would
    # be scored as a topic here — confirm the topic CSV contains scores only.
    scores = df[num_cols].astype(float).to_numpy()
    # np.argmax treats NaN as the maximum, so one missing score would decide
    # the row's label. Rank missing scores below every real score instead.
    # Rows that are entirely NaN still map to label 0, as before.
    scores = np.where(np.isnan(scores), -np.inf, scores)
    # Argmax over topics -> integer class id
    y = np.argmax(scores, axis=1).astype(int)
    return y, num_cols

def summarize(name, df):
    """Print the shape of *df* and a preview of its first ``PRINT_HEAD`` rows.

    *name* is the label shown in the ``[INFO]`` line. The preview is printed
    with a temporarily widened pandas display (140 chars, up to 20 columns).
    """
    print(f"[INFO] {name}: shape = {df.shape}")
    display_options = ("display.width", 140, "display.max_columns", 20)
    with pd.option_context(*display_options):
        preview = df.head(PRINT_HEAD)
        print(preview)

# ----------------------------
# 1) Load embeddings (X) and topics (for y)
# ----------------------------
# Resolve each input to the first candidate file that exists; stop with a
# clear message as soon as one of them cannot be found.
emb_path = first_existing(EMB_CANDIDATES)
if emb_path is None:
    raise SystemExit("[ERROR] No embeddings CSV found. Check your paths.")

topic_path = first_existing(TOPIC_CANDIDATES)
if topic_path is None:
    raise SystemExit("[ERROR] No topic vectors CSV found. Check your paths.")

print(f"[INFO] Embeddings file: {emb_path}\n"
      f"[INFO] Topic vectors:   {topic_path}")

# Load both tables before previewing either, so a read failure surfaces
# before any preview is printed.
emb_df, topic_df = read_csv_smart(emb_path), read_csv_smart(topic_path)
summarize("Embeddings", emb_df)
summarize("Topic Vectors", topic_df)

# ----------------------------
# 2) Align rows (defensive)
#    If lengths mismatch, align to the min length (assumes same ordering).
# ----------------------------
# Keep only as many rows as the shorter table has. Rows are paired purely by
# position, so this relies on both files listing items in the same order.
emb_rows, topic_rows = len(emb_df), len(topic_df)
n = min(emb_rows, topic_rows)
if emb_rows != topic_rows:
    print(f"[WARN] Row count mismatch: embeddings={emb_rows}, topics={topic_rows}. "
          f"Truncating to {n} rows (top-aligned).")
emb_df = emb_df.head(n).reset_index(drop=True)
topic_df = topic_df.head(n).reset_index(drop=True)

# ----------------------------
# 3) Build X (from embeddings) and y (argmax topics)
# ----------------------------
# Only numeric columns are used as features; fewer than 10 of them is treated
# as a sign that the embeddings file was not parsed as expected.
emb_num_cols = numeric_columns(emb_df)
if len(emb_num_cols) < 10:
    raise SystemExit("[ERROR] Not enough numeric columns found in embeddings to train a model.")

X_all = emb_df.loc[:, emb_num_cols].astype(float).to_numpy()
y_all, topic_num_cols = softmax_argmax_labels(topic_df)

# A classifier needs at least MIN_CLASSES distinct labels to learn from.
n_classes = np.unique(y_all).size
if n_classes < MIN_CLASSES:
    raise SystemExit(f"[ERROR] Only {n_classes} class found from topics. Need >= {MIN_CLASSES} to train a classifier.")

n_samples, n_features = X_all.shape
print(f"[INFO] Features: {n_features} | Samples: {n_samples} | Classes: {n_classes}")

# ----------------------------
# 4) Correlation Heatmap on embeddings
#    (For large dims, this can be dense — still okay for 300D)
# ----------------------------
# Pairwise correlation between embedding dimensions, drawn as an image.
corr = pd.DataFrame(X_all, columns=emb_num_cols).corr(numeric_only=True)
plt.figure(figsize=(10, 8))
im = plt.imshow(corr.values, aspect='auto')

# Labelling every feature overlaps into an unreadable band once there are
# hundreds of dimensions, so label at most about MAX_TICK_LABELS evenly spaced
# features per axis. With that many features or fewer, every one is labelled.
MAX_TICK_LABELS = 40
n_feats = corr.shape[0]
tick_step = max(1, -(-n_feats // MAX_TICK_LABELS))  # ceiling division
tick_positions = list(range(0, n_feats, tick_step))
plt.xticks(tick_positions, list(corr.columns[::tick_step]), rotation=90, fontsize=7)
plt.yticks(tick_positions, list(corr.index[::tick_step]), fontsize=7)

plt.title("EduVision: Embeddings Feature Correlation Heatmap")
plt.colorbar(im)
plt.tight_layout()
plt.savefig(HEATMAP_PNG, dpi=220)
plt.close()
print(f"[SAVED] {HEATMAP_PNG}")

# ----------------------------
# 5) Train/Test split (stratified -> fallback if tiny)
# ----------------------------
# Prefer a stratified split so class proportions match in train and test.
# scikit-learn raises ValueError when stratification is impossible (for
# example a class with a single member); then fall back to a plain split.
split_options = dict(test_size=TEST_SIZE, random_state=RANDOM_STATE)
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, stratify=y_all, **split_options
    )
except ValueError as err:
    print(f"[WARN] Stratified split failed ({err}); using non-stratified split.")
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, stratify=None, **split_options
    )

# ----------------------------
# 6) Epoch-wise accuracy with SGD (logistic loss, one-vs-rest for multiclass)
# ----------------------------
pipe = Pipeline(steps=[
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", SGDClassifier(
        loss="log_loss",
        alpha=1e-4,
        learning_rate="optimal",
        random_state=RANDOM_STATE,
    ))
])

# Epochs are driven manually with partial_fit rather than by calling
# pipe.fit() repeatedly. Each fit() call resets the SGD learning-rate
# counter, refits the scaler on identical data, and warns about
# non-convergence; partial_fit runs one pass and keeps the counter running.
# The steps are the same objects held by `pipe`, so `pipe.predict` works
# afterwards on raw (unscaled) inputs.
scaler = pipe.named_steps["scaler"]
clf = pipe.named_steps["clf"]
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# partial_fit needs the full label set up front (required on the first call).
train_classes = np.unique(y_train)

train_acc, test_acc = [], []
for _ in range(EPOCHS):
    clf.partial_fit(X_train_scaled, y_train, classes=train_classes)
    train_acc.append(accuracy_score(y_train, clf.predict(X_train_scaled)))
    test_acc.append(accuracy_score(y_test, clf.predict(X_test_scaled)))

# Plot/save accuracy curve
epoch_numbers = list(range(1, EPOCHS + 1))
plt.figure(figsize=(8, 4.8))
plt.plot(epoch_numbers, train_acc, marker='o', label="Train Accuracy")
plt.plot(epoch_numbers, test_acc,  marker='s', label="Test Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("EduVision: Accuracy over Epochs (Embeddings → Topic ArgMax)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig(ACC_PNG, dpi=220)
plt.close()
print(f"[SAVED] {ACC_PNG}")

# Persist the same per-epoch numbers that were plotted.
pd.DataFrame({
    "epoch": epoch_numbers,
    "train_accuracy": train_acc,
    "test_accuracy": test_acc
}).to_csv(ACC_CSV, index=False)
print(f"[SAVED] {ACC_CSV}")

# ----------------------------
# 7) Confusion matrix + report (robust to missing classes)
# ----------------------------
# Final test-set predictions. Restrict the matrix and report to labels that
# occur in either the truth or the predictions, so absent classes add no
# empty rows or columns.
y_pred = pipe.predict(X_test)
labels_present = np.unique(np.concatenate([y_test, y_pred]))
cm = confusion_matrix(y_test, y_pred, labels=labels_present)

# Figure width grows with the number of labels, clamped to [7.5, 16] inches.
n_labels = len(labels_present)
label_positions = range(n_labels)
fig_width = max(7.5, min(16, 0.5*n_labels+4))
plt.figure(figsize=(fig_width, 6.5))
im = plt.imshow(cm, aspect='auto')
plt.title("EduVision: Confusion Matrix (Topic ArgMax labels)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.colorbar(im)
plt.xticks(label_positions, labels_present, rotation=45, ha='right')
plt.yticks(label_positions, labels_present)
plt.tight_layout()
plt.savefig(CM_PNG, dpi=220)
plt.close()
print(f"[SAVED] {CM_PNG}")

# Per-class precision/recall/F1; zero_division=0 scores undefined metrics as 0.
topic_names = [f"topic_{label}" for label in labels_present]
report = classification_report(
    y_test, y_pred,
    labels=labels_present,
    target_names=topic_names,
    digits=4,
    zero_division=0
)
report_sections = [
    "=== EduVision: Embeddings -> Topic ArgMax classification ===\n\n",
    f"Embeddings file: {emb_path}\n",
    f"Topic vectors:   {topic_path}\n\n",
    report + "\n",
]
with open(REPORT_TXT, "w", encoding="utf-8") as f:
    f.writelines(report_sections)
print(f"[SAVED] {REPORT_TXT}")

print("\n[DONE] Artifacts saved to:", OUT)


[INFO] Embeddings file: C:\Users\sagni\Downloads\Edu Vision\archive\Word Embeddings\MOOC_300d.csv
[INFO] Topic vectors:   C:\Users\sagni\Downloads\Edu Vision\archive\Topic Vectors\DT300.csv
[INFO] Embeddings: shape = (68175, 301)
  Unnamed: 0         0         1         2         3         4         5         6         7         8  ...       290       291       292  \
0      going  0.282224  0.216655 -0.836659 -0.063338 -0.208164  0.144238  1.155843 -0.331597  0.823568  ... -0.030745  0.585437  0.474711   
1       just -0.112434  0.631439 -0.081427  0.288760  0.677108 -0.203446  0.820269 -0.031801 -0.704224  ...  0.021465 -0.480330  0.329963   
2       like  0.631965 -0.453183 -0.256713  0.227004 -0.762930 -0.634983  0.233275  1.635337 -0.974890  ... -0.105838  0.893409  0.058053   
3       time -1.363892 -0.254021 -0.689795 -0.419093  0.412053 -0.395511  0.212262 -0.298002 -1.061591  ... -0.227557  0.692057  0.266910   
4      thing -0.381017  0.863210  0.528545  0.508915  0.237749  0



[SAVED] C:\Users\sagni\Downloads\Edu Vision\eduvision_accuracy_over_epochs.png
[SAVED] C:\Users\sagni\Downloads\Edu Vision\eduvision_accuracy_over_epochs.csv
[SAVED] C:\Users\sagni\Downloads\Edu Vision\eduvision_confusion_matrix.png
[SAVED] C:\Users\sagni\Downloads\Edu Vision\eduvision_classification_report.txt

[DONE] Artifacts saved to: C:\Users\sagni\Downloads\Edu Vision
