In [None]:
# Cell 1 — Imports & Paths
from pathlib import Path
import numpy as np
import pandas as pd
import joblib, json, re

# === Paths (kept exactly as previously used) ===
# All locations are relative paths, so they resolve against the notebook's working directory.
# Evaluation dataset.
DATA_XLSX  = Path("Data_clean/Data_subject_complete.xlsx")   # must contain a 'split' column
# Standardization statistics (read later as scaler["means"] / scaler["stds"]).
SCALER_P   = Path("2/scaler.joblib")
# JSON file holding the ordered list of subjects.
SUBJECTS_P = Path("3/subjects.json")
# Lookup table for the XGB models (target, K -> model_path).
INDEX_CSV  = Path("1/index.csv")
# Matrix-factorization artifact; optional, loaded only when the file exists.
# NOTE(review): the middle path component is literally "index.csv" used as a directory name — confirm this is intended.
MF_PATH    = Path("models_streamlit_mf/index.csv/find-subject-score.joblib")

# Workbook that receives the predictions and metrics.
OUTPUT_XLSX = Path("eval_metrics.xlsx")

print("✅ Paths ready")


In [None]:
# Cell 2 — Load artifacts
# Ordered subject list; its order defines every feature-vector position below.
subjects = json.loads(SUBJECTS_P.read_text(encoding="utf-8"))

# Standardization statistics.
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts from a trusted source.
scaler = joblib.load(SCALER_P)
means = pd.Series(scaler["means"])
# A spread of exactly 0 is swapped for 1.0 so that dividing by it is harmless.
stds = pd.Series(scaler["stds"]).replace(0, 1.0)

# XGB index (target, K -> model_path); an empty table with the same columns when the CSV is absent.
xgb_index = (
    pd.read_csv(INDEX_CSV)
    if INDEX_CSV.exists()
    else pd.DataFrame(columns=["target","K","model_path"])
)

# MF artifacts are an optional fallback; None means "not available".
mf_art = None
if MF_PATH.exists():
    mf_art = joblib.load(MF_PATH)

# Position of each subject inside the ordered subject list.
col_index = dict(zip(subjects, range(len(subjects))))
print(f"✅ Loaded subjects={len(subjects)} | scaler | xgb_index rows={len(xgb_index)} | mf={'yes' if mf_art is not None else 'no'}")


In [None]:
# Cell 3 — Helpers
def safe_name(text: str) -> str:
    """Turn arbitrary text into a lowercase, filesystem-friendly token.

    Each run of path separators, colons, quotes, wildcards, angle brackets,
    pipes or spaces becomes a single underscore; leading and trailing
    underscores are removed and the result is lower-cased.

    Args:
        text: Value to sanitise; it is converted with ``str()`` first.

    Returns:
        The sanitised string.
    """
    collapsed = re.sub(r'[\\/:\"*?<>| ]+', "_", str(text))
    trimmed = collapsed.strip("_")
    return trimmed.lower()

def standardize_user_row(user_numeric: dict):
    """Z-score one user's grades into a vector ordered like ``subjects``.

    Args:
        user_numeric: Mapping {subject -> numeric GPA}. Subjects that are
            absent or hold a missing value yield NaN in the output.

    Returns:
        1-D float ``np.ndarray`` with one entry per subject:
        ``(value - means[subject]) / stds[subject]`` or NaN.
    """
    def _zscore(subject):
        raw = user_numeric.get(subject, np.nan)
        if pd.isna(raw):
            return np.nan
        return (float(raw) - means[subject]) / stds[subject]

    return np.array([_zscore(subject) for subject in subjects], dtype=float)

def build_masked_features(std_row: np.ndarray, kept_subjects: list, add_missing=True):
    """Hide every subject that is not in ``kept_subjects`` and zero-fill the gaps.

    Args:
        std_row: Standardized grade vector, indexed via ``col_index``.
        kept_subjects: Subjects whose values stay visible; names that are not
            in ``col_index`` are ignored.
        add_missing: When true, append one 0/1 indicator per position that is
            1.0 wherever the masked value is not finite.

    Returns:
        The masked values with NaN replaced by 0.0, followed by the
        indicators when ``add_missing`` is true. The input is not modified.
    """
    positions = [col_index[name] for name in kept_subjects if name in col_index]
    keep = np.zeros_like(std_row, dtype=bool)
    keep[np.asarray(positions, dtype=int)] = True

    masked = std_row.copy()
    masked[~keep] = np.nan

    if not add_missing:
        return np.nan_to_num(masked, nan=0.0)

    # Indicators are taken before the zero-fill, while gaps are still visible.
    indicators = (~np.isfinite(masked)).astype(float)
    filled = np.nan_to_num(masked, nan=0.0)
    return np.concatenate([filled, indicators], axis=0)

def select_xgb_model_path(target: str, K: int):
    """Pick the XGB model file registered for ``target`` at, or closest to, ``K``.

    Args:
        target: Subject to predict; matched against ``xgb_index["target"]``.
        K: Number of observed input subjects.

    Returns:
        ``Path`` to the model file, or ``None`` when the index holds no row
        for ``target``. An exact ``K`` match wins; otherwise the row with the
        smallest ``|K_row - K|`` is used. Equidistant rows are resolved in
        favour of the one listed first in the index, so the choice is
        reproducible from run to run.
    """
    candidates = xgb_index[xgb_index["target"] == target]
    if candidates.empty:
        return None

    exact = candidates.loc[candidates["K"] == K, "model_path"]
    if not exact.empty:
        return Path(exact.iloc[0])

    # Nearest K. A stable sort keeps index order among equal distances; the
    # default sort kind gives no such guarantee, which made ties arbitrary.
    gap = (candidates["K"] - K).abs()
    nearest = candidates.assign(_k_gap=gap).sort_values("_k_gap", kind="stable")
    return Path(nearest["model_path"].iloc[0])

def predict_mf_for_target(user_numeric: dict, target: str):
    """Predict ``target`` for one user with the matrix-factorization artifact.

    The user's latent vector is obtained by solving the regularized normal
    equations ``(V_K^T V_K + lambda * I) u = V_K^T (r - mu - b_item)`` over the
    user's observed subjects, then scored against the target's item factor
    and converted back to the original grade scale.

    The target's own grade is never used as an observation, even when
    ``user_numeric`` contains it; otherwise the value being predicted would
    leak into its own prediction whenever this is called on labelled rows.

    Args:
        user_numeric: Mapping {subject -> numeric GPA}; missing subjects or
            missing values are treated as unobserved.
        target: Subject to predict.

    Returns:
        The predicted grade as ``float``, or NaN when the MF artifact is not
        loaded, ``target`` is not a known subject, or the user has no
        observed subject other than the target.
    """
    if mf_art is None:
        return np.nan

    t_idx = col_index.get(target)
    if t_idx is None:
        # Unknown target: there is no item factor to score against.
        return np.nan

    V   = mf_art["V"]        # item factors, one row per subject, k columns
    b_i = mf_art["b_item"]   # per-subject bias
    mu  = mf_art["mu"]
    lam = mf_art["lambda"]
    k   = mf_art["k"]

    # Same standardization the XGB features use.
    std_vals = standardize_user_row(user_numeric)

    observed = np.isfinite(std_vals)
    observed[t_idx] = False  # never fit on the grade being predicted
    obs_idx = np.where(observed)[0]
    if obs_idx.size == 0:
        return np.nan

    V_K = V[obs_idx]
    rhs = std_vals[obs_idx] - mu - b_i[obs_idx]
    A = V_K.T @ V_K + lam * np.eye(k)
    try:
        u_user = np.linalg.solve(A, V_K.T @ rhs)
    except np.linalg.LinAlgError:
        # Singular system: fall back to the pseudo-inverse.
        u_user = np.linalg.pinv(A) @ (V_K.T @ rhs)

    y_std = mu + b_i[t_idx] + u_user @ V[t_idx]
    # Undo the standardization to return to the grade scale.
    y = y_std * stds[target] + means[target]
    return float(y)

# Models already deserialized in this session, keyed by file path. Loading each
# file once avoids repeating joblib.load for every (row, target) call.
_XGB_MODEL_CACHE = {}


def _load_xgb_model(model_path):
    """Return the model stored at ``model_path``, deserializing it at most once."""
    key = str(model_path)
    if key not in _XGB_MODEL_CACHE:
        # NOTE: joblib.load unpickles arbitrary objects; model files must be trusted.
        _XGB_MODEL_CACHE[key] = joblib.load(model_path)
    return _XGB_MODEL_CACHE[key]


def hybrid_predict_row_target(user_numeric: dict, target: str):
    """Predict ``target`` for one user by combining the XGB and MF predictors.

    Args:
        user_numeric: Mapping {subject -> numeric GPA}. It may contain the
            target's true grade; that value is hidden from both predictors.
        target: Subject to predict.

    Returns:
        ``(pred, info)``. ``pred`` is NaN when fewer than 5 other subjects are
        observed or when neither predictor yields a finite value. ``info``
        holds ``K`` and ``reason`` in the too-few-subjects case; otherwise
        ``K``, ``src`` ("blend(w)", "xgb-only", "mf-only" or "none"), ``xgb``,
        ``mf``, ``model_path`` and ``xgb_error`` (``None`` unless loading or
        running the XGB model raised).
    """
    # K: number of subjects with a grade, excluding the target
    kept_subjects = [s for s in subjects if (s != target) and pd.notna(user_numeric.get(s, np.nan))]
    K = len(kept_subjects)
    if K < 5:
        return np.nan, {"K": K, "reason": f"K={K}<5"}

    # XGB features: standardized values, masked to the kept subjects, plus missing indicators
    std_row = standardize_user_row(user_numeric)
    feats = build_masked_features(std_row, kept_subjects, add_missing=True).reshape(1, -1)

    # XGB
    pred_xgb = np.nan
    xgb_error = None
    model_path = select_xgb_model_path(target, K)
    if model_path is not None and model_path.exists():
        try:
            model = _load_xgb_model(model_path)
            y_std = float(model.predict(feats)[0])
            pred_xgb = y_std * stds[target] + means[target]
        except Exception as exc:
            # Best effort: fall back to MF, but keep the reason visible in `info`.
            pred_xgb = np.nan
            xgb_error = f"{type(exc).__name__}: {exc}"

    # MF: hide the target's own grade so the label cannot inform its own
    # prediction; work on a copy to leave the caller's mapping untouched.
    mf_input = dict(user_numeric)
    mf_input[target] = np.nan
    pred_mf = predict_mf_for_target(mf_input, target)

    # blend (same rule as the app, per the original note)
    if np.isfinite(pred_xgb) and np.isfinite(pred_mf):
        w = 0.85 if K <= 10 else 0.70
        pred = w * pred_xgb + (1 - w) * pred_mf
        src = f"blend({w:.2f})"
    elif np.isfinite(pred_xgb):
        pred = pred_xgb
        src = "xgb-only"
    elif np.isfinite(pred_mf):
        pred = pred_mf
        src = "mf-only"
    else:
        pred = np.nan
        src = "none"

    return pred, {
        "K": K,
        "src": src,
        "xgb": pred_xgb,
        "mf": pred_mf,
        "model_path": str(model_path) if model_path else None,
        "xgb_error": xgb_error,
    }


In [None]:
# Cell 4 — Load dataset & split
# Full evaluation table; rows are routed by the value of their 'split' column.
df = pd.read_excel(DATA_XLSX)
assert "split" in df.columns, "Thiếu cột 'split' trong dữ liệu."

# Validation and test subsets, each re-indexed from 0.
df_val = df.loc[df["split"] == "val"].reset_index(drop=True)
df_test = df.loc[df["split"] == "test"].reset_index(drop=True)
print(f"Rows: val={len(df_val)} | test={len(df_test)} | subjects={len(subjects)}")


In [None]:
# Cell 5 — Eval loop
def row_to_user_numeric(row) -> dict:
    """Extract the subject grades of one data row as plain floats.

    Args:
        row: Any object with a dict-style ``get(key, default)``.

    Returns:
        Dict with one key per entry of ``subjects``, in that order; the value
        is ``float(grade)``, or NaN when the subject is absent or missing.
    """
    raw_pairs = ((name, row.get(name, np.nan)) for name in subjects)
    return {name: (np.nan if pd.isna(value) else float(value)) for name, value in raw_pairs}

# One record per (split, row, target) for which the true grade is present.
# Fields: split, row_id, target, y_true, y_pred, K, src, pred_xgb, pred_mf, model_path
records = []

for split_name, df_part in (("val", df_val), ("test", df_test)):
    for i, row in df_part.iterrows():
        user_numeric = row_to_user_numeric(row)
        # Every subject with a known true value takes a turn as the target.
        for target in subjects:
            y_true = row.get(target, np.nan)
            if pd.notna(y_true):
                pred, info = hybrid_predict_row_target(user_numeric, target)
                records.append(dict(
                    split=split_name,
                    row_id=int(i),
                    target=target,
                    y_true=float(y_true),
                    # Non-finite predictions are stored as NaN.
                    y_pred=float(pred) if np.isfinite(pred) else np.nan,
                    K=info.get("K", None),
                    src=info.get("src", None),
                    pred_xgb=info.get("xgb", np.nan),
                    pred_mf=info.get("mf", np.nan),
                    model_path=info.get("model_path", None),
                ))

pred_df = pd.DataFrame(records)
print(f"Done predictions: {len(pred_df)} rows")
pred_df.head()


In [None]:
# Cell 6 — Metrics & export
def compute_metrics(g: pd.DataFrame):
    """Summarise prediction error over the rows of ``g``.

    Rows with a missing ``y_true`` or ``y_pred`` are left out.

    Args:
        g: Frame with ``y_true`` and ``y_pred`` columns.

    Returns:
        ``pd.Series`` with ``n`` (rows used), ``mse``, ``rmse``, ``mae`` and
        ``r2``. All error fields are NaN when no row is usable; ``r2`` is NaN
        when the true values have zero variance.
    """
    usable = g.dropna(subset=["y_true", "y_pred"])
    n_rows = len(usable)
    if n_rows == 0:
        return pd.Series({"n": 0, "mse": np.nan, "rmse": np.nan, "mae": np.nan, "r2": np.nan})

    truth = usable["y_true"].values
    guess = usable["y_pred"].values
    residual = truth - guess
    squared = residual ** 2

    mse = float(np.mean(squared))
    # R^2 = 1 - SS_res / SS_tot, left undefined for a constant target.
    ss_tot = np.sum((truth - np.mean(truth)) ** 2)
    if ss_tot > 0:
        r2 = float(1 - np.sum(squared) / ss_tot)
    else:
        r2 = np.nan

    return pd.Series({
        "n": n_rows,
        "mse": mse,
        "rmse": float(np.sqrt(mse)),
        "mae": float(np.mean(np.abs(residual))),
        "r2": r2,
    })

# compute_metrics reads only these two columns. Selecting them before apply()
# keeps the result independent of whether groupby.apply passes the grouping
# columns to the function (passing them is deprecated in recent pandas
# releases and emits a warning).
_METRIC_COLS = ["y_true", "y_pred"]


def _metrics_by(keys):
    """Return one row of metrics per group of ``pred_df`` defined by ``keys``."""
    return pred_df.groupby(keys)[_METRIC_COLS].apply(compute_metrics).reset_index()


# per-target per-split
per_target_split = _metrics_by(["split","target"])

# overall per-split
overall_split = _metrics_by("split")

# overall (val+test)
overall_all = pd.DataFrame([compute_metrics(pred_df)])
overall_all.insert(0, "split", "val+test")

# optional: breakdown by prediction source src (xgb-only, mf-only, blend)
by_src_split = _metrics_by(["split","src"])

# Sheet name -> frame, written in this order.
_sheets = {
    "predictions_raw": pred_df,
    "metrics_per_target": per_target_split,
    "metrics_per_split": overall_split,
    "metrics_overall": overall_all,
    "metrics_by_src": by_src_split,
}
with pd.ExcelWriter(OUTPUT_XLSX, engine="xlsxwriter") as writer:
    for _sheet_name, _frame in _sheets.items():
        _frame.to_excel(writer, index=False, sheet_name=_sheet_name)

print(f"✅ Saved metrics to: {OUTPUT_XLSX.resolve()}")
