In [6]:
# ==== EXPORT SUBJECTS & SCALER (no model training) ====
# Purpose: read the cleaned subject sheet, treat every numeric column as a
# "subject", compute per-subject z-score statistics on the TRAIN rows only,
# and persist two artifacts:
#   * SCALER_PATH   - joblib dict {"means": {...}, "stds": {...}}
#   * SUBJECTS_JSON - JSON list with the subject column names, in order
# Raises ValueError when there are no numeric columns, when the 'split'
# column is missing, or when no row has split == "train".
from pathlib import Path
import json, joblib
import numpy as np
import pandas as pd

# ---- Config (keep these exactly as currently used) ----
DATA_XLSX   = Path("Data_clean/Data_subject_complete.xlsx")  # must contain a 'split' column
SCALER_PATH = Path("2/scaler.joblib")                        # where the scaler is saved
SUBJECTS_JSON = Path("3/subjects.json")                      # where the subject list is saved

# ---- Load data ----
df = pd.read_excel(DATA_XLSX)

# Every numeric column is used as a "subject"
subject_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if not subject_cols:
    raise ValueError("Kh√¥ng t√¨m th·∫•y c·ªôt s·ªë n√†o trong file d·ªØ li·ªáu. H√£y ki·ªÉm tra DATA_XLSX.")

# Only the TRAIN rows are used to fit the scaler (avoids leaking val/test statistics)
if "split" not in df.columns:
    raise ValueError("Thi·∫øu c·ªôt 'split' trong d·ªØ li·ªáu. C·∫ßn c√≥ ƒë·ªÉ t√°ch train/val/test.")
df_tr = df[df["split"] == "train"].reset_index(drop=True)
if df_tr.empty:
    raise ValueError("T·∫≠p TRAIN r·ªóng. H√£y ki·ªÉm tra gi√° tr·ªã c·ªôt 'split'.")

# ---- Compute z-score stats on TRAIN ----
train_means = df_tr[subject_cols].mean(axis=0)
# Guard the later division by std:
#   * std == 0   -> constant column in TRAIN
#   * std is NaN -> fewer than two observed TRAIN values (sample std, ddof=1)
# Both fall back to 1.0 so the saved scaler never contains an unusable std.
train_stds = (
    df_tr[subject_cols]
    .std(axis=0)
    .replace(0, 1.0)
    .fillna(1.0)
)

# ---- Save artifacts ----
SCALER_PATH.parent.mkdir(parents=True, exist_ok=True)
SUBJECTS_JSON.parent.mkdir(parents=True, exist_ok=True)

joblib.dump({"means": train_means.to_dict(), "stds": train_stds.to_dict()}, SCALER_PATH)
SUBJECTS_JSON.write_text(json.dumps(subject_cols, ensure_ascii=False, indent=2), encoding="utf-8")

print("‚úÖ Exported:")
print("  üß™ scaler:", SCALER_PATH.resolve())
print("  üìú subjects:", SUBJECTS_JSON.resolve())
print(f"  (#subjects = {len(subject_cols)})")


‚úÖ Exported:
  üß™ scaler: C:\Users\vuman\Desktop\AI_Project\Final in HUST\Project\training-find-score-ee\2\scaler.joblib
  üìú subjects: C:\Users\vuman\Desktop\AI_Project\Final in HUST\Project\training-find-score-ee\3\subjects.json
  (#subjects = 26)


In [7]:
# Cell A - Train GGM (Ledoit-Wolf / GraphicalLassoCV) & export
# Purpose: load the data sheet plus the previously exported subject list and
# scaler, z-score the TRAIN rows, estimate a shrinkage covariance matrix with
# LedoitWolf, and save it together with the subject order and scaler stats.
# Raises ValueError when the 'split' column is missing or TRAIN is empty, and
# KeyError when a listed subject is missing from the data or from the scaler.
from pathlib import Path
import numpy as np
import pandas as pd
import joblib, json

from sklearn.covariance import LedoitWolf  # fast, stable
# from sklearn.covariance import GraphicalLassoCV  # if a sparse graph is wanted

DATA_XLSX  = Path("Data_clean/Data_subject_complete.xlsx")   # must contain a 'split' column
SCALER_P   = Path("2/scaler.joblib")
SUBJECTS_P = Path("3/subjects.json")
OUT_GGM    = Path("models_streamlit/EE2_ggm.joblib")

# Load data + artifacts
df        = pd.read_excel(DATA_XLSX)
subjects  = json.loads(SUBJECTS_P.read_text(encoding="utf-8"))
scaler    = joblib.load(SCALER_P)

# Fail early, with a clear message, if the inputs do not fit together.
if "split" not in df.columns:
    raise ValueError("Missing 'split' column in the data; it is required to select the TRAIN rows.")
missing_in_data = [s for s in subjects if s not in df.columns]
if missing_in_data:
    raise KeyError(f"Subjects listed in {SUBJECTS_P} are missing from {DATA_XLSX}: {missing_in_data}")
missing_in_scaler = [s for s in subjects if s not in scaler["means"] or s not in scaler["stds"]]
if missing_in_scaler:
    raise KeyError(f"Subjects listed in {SUBJECTS_P} have no statistics in {SCALER_P}: {missing_in_scaler}")

# Align the scaler statistics to the subject order. DataFrame - Series aligns
# on labels, so unaligned keys (extra or reordered) could add or reorder
# columns and the covariance would no longer match `subjects`.
means     = pd.Series(scaler["means"]).reindex(subjects)
# A zero or NaN std falls back to 1.0 so the division below stays well defined.
stds      = pd.Series(scaler["stds"]).reindex(subjects).replace(0, 1.0).fillna(1.0)

# Select TRAIN rows and z-score them
df_tr = df[df["split"] == "train"].reset_index(drop=True)
if df_tr.empty:
    raise ValueError("TRAIN split is empty; check the values of the 'split' column.")
X_tr  = df_tr[subjects].copy()
X_std = (X_tr - means) / stds
X_std = X_std.fillna(0.0).to_numpy()  # after z-scoring, 0 is the mean: mean-imputation of missing values

# Estimate the covariance
# Option 1: LedoitWolf (recommended, fast)
est = LedoitWolf().fit(X_std)
cov = est.covariance_

# (Optional) Option 2: GraphicalLassoCV (slower, yields a sparse precision matrix)
# est = GraphicalLassoCV().fit(X_std)
# cov = est.covariance_
# precision = est.precision_

# Save artifacts
OUT_GGM.parent.mkdir(parents=True, exist_ok=True)
ggm_art = {
    "cov": cov,                    # sufficient for conditional prediction
    # "precision": precision,      # if GraphicalLassoCV is used
    "subjects": subjects,
    "train_means": means.to_dict(),
    "train_stds": stds.to_dict(),
}
joblib.dump(ggm_art, OUT_GGM)
print("‚úÖ Saved GGM to:", OUT_GGM.resolve())


‚úÖ Saved GGM to: C:\Users\vuman\Desktop\AI_Project\Final in HUST\Project\training-find-score-ee\models_streamlit\EE2_ggm.joblib
