In [2]:
from pathlib import Path
import numpy as np
import pandas as pd

# ========= Config =========
DATA_PATH = Path('Data_clean/Data_subject_complete.xlsx')
CORR_PATH = Path('subject/output_correlation_matrix.xlsx')  # optional
TOPN_PATH = Path('subject/output_topN_features_per_target.xlsx')  # optional

OUTPUT_CSV = Path('subject/train_masks_samples.csv')

# Number of mask samples generated per target (tune to available resources)
N_SAMPLES_PER_TARGET = 4000

# Distribution of K (number of subjects the user provides)
K_VALUES  = [5, 6, 7, 8, 9, 10]
K_PROBS   = [0.35, 0.25, 0.20, 0.12, 0.05, 0.03]  # sums to 1.0

# Scenario probabilities (aligned position-by-position with SCENARIOS)
SCENARIOS = ['S1', 'S2', 'S3', 'S4']
SCENARIO_PROBS = [0.40, 0.30, 0.20, 0.10]

# Tier boundaries on the correlation ranking
T1_TOP = 10   # top-10
T2_TOP = 20   # 11-20

# Single seeded generator shared by every sampling call in this cell
RNG_SEED = 2025
rng = np.random.default_rng(RNG_SEED)

def load_subject_matrix():
    """Load the workbook at DATA_PATH and return a copy of its numeric columns.

    Raises:
        FileNotFoundError: DATA_PATH does not exist.
        ValueError: the workbook contains no numeric column.
    """
    workbook_missing = not DATA_PATH.exists()
    if workbook_missing:
        raise FileNotFoundError(f'Không thấy file: {DATA_PATH}')

    raw = pd.read_excel(DATA_PATH)
    numeric_names = raw.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_names:
        raise ValueError('Không tìm thấy cột numeric nào trong file complete.')

    numeric_only = raw[numeric_names]
    return numeric_only.copy()

def compute_or_load_corr(subject_df: pd.DataFrame) -> pd.DataFrame:
    """Return a correlation matrix aligned to the columns of ``subject_df``.

    If CORR_PATH exists, the cached matrix is read and re-indexed to the
    column order of ``subject_df``. When the cached file does not cover every
    column (for example the data gained a subject since the cache was written)
    the cache is ignored and the Pearson correlation is computed from
    ``subject_df`` instead of failing with a KeyError.

    Args:
        subject_df: numeric frame whose columns define the wanted rows/columns.

    Returns:
        Square DataFrame indexed and labelled by ``subject_df.columns``.
    """
    if CORR_PATH.exists():
        cached = pd.read_excel(CORR_PATH, index_col=0)
        wanted = subject_df.columns
        uncovered = [c for c in wanted if c not in cached.index or c not in cached.columns]
        if not uncovered:
            # align rows and columns to the data's column order
            return cached.loc[wanted, wanted]
        print(f'Warning: {CORR_PATH} lacks {len(uncovered)} column(s); recomputing correlations.')
    return subject_df.corr(method='pearson')

def build_rankings_from_corr(corr: pd.DataFrame) -> dict:
    """Rank, for every column, the other labels by absolute correlation.

    Returns a dict mapping each column label to the list of remaining index
    labels ordered from strongest to weakest absolute correlation; the label
    itself is removed when it is present in the index.
    """
    return {
        subject: (
            corr[subject]
            .abs()
            .drop(labels=[subject], errors='ignore')
            .sort_values(ascending=False)
            .index.tolist()
        )
        for subject in corr.columns.tolist()
    }

def build_tiers_for_target(target: str, ranking: list) -> dict:
    """Cut a ranking into tiers T1/T2/T3 at the module-level T1_TOP and T2_TOP.

    ``target`` is accepted for the caller's convenience but is not used.
    """
    cut_points = {
        'T1': slice(None, T1_TOP),
        'T2': slice(T1_TOP, T2_TOP),
        'T3': slice(T2_TOP, None),
    }
    return {tier: ranking[span] for tier, span in cut_points.items()}

def sample_mask_for_target(top_groups: dict, K: int, scenario: str) -> list:
    """Draw up to K distinct subjects from the tiers according to a scenario.

    Args:
        top_groups: dict with list values under the keys 'T1', 'T2' and 'T3'.
        K: number of subjects wanted.
        scenario: 'S1', 'S2' or 'S3'; any other value takes the S4 branch.

    Returns:
        List of at most K picked subjects. It is shorter than K only when the
        three tiers together hold fewer than K distinct entries.

    Uses the module-level ``rng``; the order of the draws below determines the
    reproducible output, so statements must not be reordered.
    """
    pick = []
    def take(pool, n):
        # Append up to n not-yet-picked entries of pool, drawn without replacement.
        nonlocal pick
        n = int(n)
        pool = list(pool)
        pool = [p for p in pool if p not in pick]
        if n > 0 and len(pool) > 0:
            n = min(n, len(pool))
            chosen = rng.choice(pool, size=n, replace=False).tolist()
            pick.extend(chosen)
    if scenario == 'S1':
        # rng.integers has an exclusive upper bound, hence 2 or 3 from T1
        take(top_groups['T1'], rng.integers(2, 4))  # 2–3
        need = K - len(pick)
        take(top_groups['T2'], need)
    elif scenario == 'S2':
        take(top_groups['T1'], rng.integers(1, 3))  # 1–2
        need = K - len(pick)
        if need > 0:
            # with probability 0.2 (and a non-empty T3) hold one slot back from T2
            n_t3 = 1 if (rng.random() < 0.2 and len(top_groups['T3']) > 0) else 0
            take(top_groups['T2'], max(0, need - n_t3))
            # T3 receives whatever is still missing, which can exceed n_t3 if T2 ran short
            need = K - len(pick)
            take(top_groups['T3'], need)
    elif scenario == 'S3':
        # 2–4 from T2 (capped at K), nothing from T1, remainder from T3
        n_t2 = min(max(2, rng.integers(2, 5)), K)
        take(top_groups['T2'], n_t2)
        need = K - len(pick)
        take(top_groups['T3'], need)
    else:  # S4
        # T3 first, then T2, then T1 for any shortfall
        need = K
        if len(top_groups['T3']) > 0:
            take(top_groups['T3'], need)
        need = K - len(pick)
        if need > 0:
            take(top_groups['T2'], need)
        need = K - len(pick)
        if need > 0:
            take(top_groups['T1'], need)
    if len(pick) < K:
        # Last resort: top up from every tier entry that has not been picked yet
        universe = list(set(top_groups['T1'] + top_groups['T2'] + top_groups['T3']) - set(pick))
        need = K - len(pick)
        if len(universe) > 0:
            need = min(need, len(universe))
            pick.extend(rng.choice(universe, size=need, replace=False).tolist())
    return pick[:K]

def main_generate():
    """Generate the mask-sample CSV: N_SAMPLES_PER_TARGET rows per subject.

    Rankings come from TOPN_PATH when that file exists and has the columns
    'target' and 'top_features'; otherwise (or when reading it fails) they are
    derived from the correlation matrix. For every target, K and the scenario
    are drawn from the configured distributions and a mask is sampled.

    Side effects: creates the parent directory of OUTPUT_CSV if needed, writes
    OUTPUT_CSV and prints the number of rows written.
    """
    subj_df = load_subject_matrix()
    subjects = subj_df.columns.tolist()
    subject_set = set(subjects)

    # Prefer the precomputed top-N file when it is available
    rankings = None
    if TOPN_PATH.exists():
        try:
            topn_df = pd.read_excel(TOPN_PATH)
            if {'target', 'top_features'} <= set(topn_df.columns):
                tmp = {}
                for _, row in topn_df.iterrows():
                    t = row['target']
                    if t in subject_set:
                        names = [f.strip() for f in str(row['top_features']).split(',')]
                        tmp[t] = [f for f in names if f and f != t and f in subject_set]
                rankings = tmp if len(tmp) > 0 else None
        except Exception as exc:
            # Still best-effort, but say why the optional file was ignored
            print(f'Warning: could not use {TOPN_PATH} ({exc}); falling back to correlation rankings.')
            rankings = None
    if rankings is None:
        corr = compute_or_load_corr(subj_df)
        rankings = build_rankings_from_corr(corr)

    # Computed at most once, only if some target has no usable ranking
    fallback_rankings = None

    all_rows = []
    for target in subjects:
        rank = [s for s in rankings.get(target, []) if s in subject_set and s != target]
        if len(rank) == 0:
            if fallback_rankings is None:
                fallback_rankings = build_rankings_from_corr(subj_df.corr(method='pearson'))
            rank = fallback_rankings[target]
        tiers = build_tiers_for_target(target, rank)
        t1_members = set(tiers['T1'])
        # Draw order (K first, then scenario) is kept so seeded output is unchanged
        Ks = rng.choice(K_VALUES, size=N_SAMPLES_PER_TARGET, p=K_PROBS)
        scen = rng.choice(SCENARIOS, size=N_SAMPLES_PER_TARGET, p=SCENARIO_PROBS)
        for K, sc in zip(Ks, scen):
            kept = sample_mask_for_target(tiers, int(K), sc)
            overlap = len(set(kept) & t1_members)
            all_rows.append({
                'target': target,
                'scenario': sc,
                'K': int(K),
                'kept_subjects': ','.join(kept),
                'overlap': int(overlap)
            })
    out_df = pd.DataFrame(all_rows)
    # to_csv does not create missing directories
    OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
    out_df.to_csv(OUTPUT_CSV, index=False)
    print(f'Đã ghi {len(out_df):,} dòng vào {OUTPUT_CSV.resolve()}')

main_generate()

Đã ghi 124,000 dòng vào C:\Users\vuman\Desktop\AI_Project\Final in HUST\Project\training-find-score-et\2 option\subject\train_masks_samples.csv


In [3]:
# Cell 1 — Config & Imports
from pathlib import Path
import numpy as np
import pandas as pd
import itertools

from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# ==== Paths ====
DATA_XLSX  = Path("Data_clean/Data_subject_complete.xlsx")  # must contain a 'split' column
MASKS_CSV  = Path("subject/train_masks_samples.csv")        # mask samples; this cell only reads it (pd.read_csv below)
CORR_PATH  = Path("subject/output_correlation_matrix.xlsx")  # optional
TOPN_PATH  = Path("subject/output_topN_features_per_target.xlsx")  # optional
BEST_XLSX  = Path("subject/xgb_best_params.xlsx")

# ==== Sampler config (mask combinations) ====
N_SAMPLES_PER_TARGET = 4000   # number of mask rows generated per target
K_VALUES  = [5, 6, 7, 8, 9, 10]
K_PROBS   = [0.35, 0.25, 0.20, 0.12, 0.05, 0.03]
SCENARIOS = ["S1", "S2", "S3", "S4"]
SCENARIO_PROBS = [0.40, 0.30, 0.20, 0.10]
T1_TOP_DEFAULT = 10
T2_TOP_DEFAULT = 20

# ==== Grid search config ====
TRAIN_SAMPLES_PER_TARGET = 8000
VAL_SAMPLES_PER_TARGET   = 2000
ADD_MISSING_INDICATORS   = True

# Random seed (for reproducibility)
RNG_SEED = 2025
rng = np.random.default_rng(RNG_SEED)

print("✅ Config loaded.")

# Cell 2 — Load data & sanity checks, standardize by TRAIN
df = pd.read_excel(DATA_XLSX)
assert "split" in df.columns, "File dữ liệu phải có cột 'split' (train/val/test)."

# Subject columns: every numeric column of the workbook
subject_cols = df.select_dtypes(include=[np.number]).columns.tolist()
assert len(subject_cols) > 0, "Không tìm thấy cột numeric nào (môn học)."

# Split the rows by the value of the 'split' column
df_train = df[df["split"] == "train"].reset_index(drop=True)
df_val   = df[df["split"] == "val"].reset_index(drop=True)

print(f"Rows: train={len(df_train)}, val={len(df_val)} | subjects={len(subject_cols)}")

# z-score statistics come from TRAIN only (per subject); a zero std is replaced by 1.0
train_means = df_train[subject_cols].mean(axis=0)
train_stds  = df_train[subject_cols].std(axis=0).replace(0, 1.0)

def standardize(df_part: pd.DataFrame) -> pd.DataFrame:
    """Return the subject columns of ``df_part`` z-scored with the TRAIN mean/std."""
    return (df_part[subject_cols] - train_means) / train_stds

X_train_std = standardize(df_train)
X_val_std   = standardize(df_val)

# Keep the unstandardized values in case they are needed later
X_train_orig = df_train[subject_cols].copy()
X_val_orig   = df_val[subject_cols].copy()

print("✅ Standardization ready.")

# Cell 3 — Helpers: corr, ranking, tiers, sampler
def compute_or_load_corr(subject_df: pd.DataFrame) -> pd.DataFrame:
    """Return a correlation matrix aligned to the columns of ``subject_df``.

    The cached matrix at CORR_PATH is used when it exists and can be read and
    aligned. Any failure while doing so is reported on stdout (instead of being
    swallowed silently) and the Pearson correlation of ``subject_df`` is
    returned instead.
    """
    if CORR_PATH.exists():
        try:
            corr = pd.read_excel(CORR_PATH, index_col=0)
            # .loc raises KeyError when the cache misses one of the columns
            return corr.loc[subject_df.columns, subject_df.columns]
        except Exception as exc:
            print(f"⚠️ Ignoring cached correlation {CORR_PATH} ({type(exc).__name__}: {exc}); recomputing.")
    return subject_df.corr(method="pearson")

def build_rankings_from_corr(corr: pd.DataFrame) -> dict:
    """Map every column label to the other labels sorted by |correlation|, strongest first."""

    def ranked_partners(subject):
        # absolute correlations against `subject`, without the subject itself
        strength = corr[subject].abs()
        others = strength.drop(labels=[subject], errors="ignore")
        return others.sort_values(ascending=False).index.tolist()

    return {subject: ranked_partners(subject) for subject in corr.columns}

def build_tiers_for_target(ranking: list, t1_top: int, t2_top: int) -> dict:
    """Split a ranking into tiers: T1 = first t1_top, T2 = up to t2_top, T3 = the rest."""
    spans = (
        ("T1", slice(None, t1_top)),
        ("T2", slice(t1_top, t2_top)),
        ("T3", slice(t2_top, None)),
    )
    return {tier: ranking[span] for tier, span in spans}

def sample_mask_for_target(tiers: dict, K: int):
    """Draw a scenario, then up to K distinct subjects from the tiers.

    Args:
        tiers: dict with list values under the keys "T1", "T2" and "T3".
        K: number of subjects wanted.

    Returns:
        (picked, scenario): the list of at most K picked subjects and the
        scenario label drawn from SCENARIOS with SCENARIO_PROBS.

    Uses the module-level ``rng``; the order of the draws below determines the
    reproducible output, so statements must not be reordered.
    """
    scenario = rng.choice(SCENARIOS, p=SCENARIO_PROBS)
    pick = []

    def take(pool, n):
        # Append up to n not-yet-picked entries of pool, drawn without replacement.
        nonlocal pick
        n = int(n)
        pool = [p for p in pool if p not in pick]
        if n > 0 and pool:
            chosen = rng.choice(pool, size=min(n, len(pool)), replace=False).tolist()
            pick.extend(chosen)

    if scenario == "S1":
        # Favourable: 2–3 from T1, the rest from T2
        take(tiers["T1"], rng.integers(2, 4))
        take(tiers["T2"], K - len(pick))
    elif scenario == "S2":
        # Moderate: 1–2 from T1, mostly T2, possibly 0–1 from T3
        take(tiers["T1"], rng.integers(1, 3))
        if K - len(pick) > 0:
            n_t3 = 1 if (rng.random() < 0.2 and len(tiers["T3"]) > 0) else 0
            take(tiers["T2"], max(0, K - len(pick) - n_t3))
            # T3 receives whatever is still missing after the T2 draw
            take(tiers["T3"], K - len(pick))
    elif scenario == "S3":
        # Poor: nothing from T1, mostly T2 + T3
        take(tiers["T2"], min(rng.integers(2, 5), K))
        take(tiers["T3"], K - len(pick))
    else:
        # Unusual: prefer T3; fill any shortfall from T2, then T1
        take(tiers["T3"], K)
        take(tiers["T2"], K - len(pick))
        take(tiers["T1"], K - len(pick))

    if len(pick) < K:
        # Last resort: top up from every tier entry that has not been picked yet
        universe = list(set(tiers["T1"] + tiers["T2"] + tiers["T3"]) - set(pick))
        take(universe, K - len(pick))

    return pick[:K], scenario
print("✅ Sampler utilities ready.")

# Cell 5 — Load masks & build feature helpers
masks_df = pd.read_csv(MASKS_CSV)
required_cols = {"target", "scenario", "K", "kept_subjects"}
assert required_cols <= set(masks_df.columns), f"Thiếu cột trong masks CSV: {required_cols - set(masks_df.columns)}"

def parse_kept(s: str):
    """Split a comma-separated string into stripped, non-empty names."""
    return [x.strip() for x in str(s).split(",") if x.strip()]

masks_df["kept_list"] = masks_df["kept_subjects"].apply(parse_kept)

# Drop rows whose target is not a known subject, and unknown names inside each mask
subjects_set = set(subject_cols)
masks_df = masks_df[masks_df["target"].isin(subject_cols)].copy()
masks_df["kept_list"] = masks_df["kept_list"].apply(lambda lst: [s for s in lst if s in subjects_set])

# One frame per target with a fresh 0..n-1 index, so positions can be sampled directly
masks_by_target = {t: g.reset_index(drop=True) for t, g in masks_df.groupby("target")}
print(f"✅ Loaded masks for {len(masks_by_target)} targets.")

# Feature builder
col_index = {s: i for i, s in enumerate(subject_cols)}
n_base_feats = len(subject_cols)

def build_features_from_mask(std_row: np.ndarray, kept: list[str], add_missing=True):
    """Hide every subject not listed in ``kept`` and build the feature vector.

    Hidden or non-finite entries are passed through ``np.nan_to_num`` (NaN
    becomes 0.0). With ``add_missing`` a second half of the same length is
    appended holding 1.0 wherever the value was hidden or non-finite.
    Names in ``kept`` that are not in ``col_index`` are ignored.
    """
    visible_positions = [col_index[name] for name in kept if name in col_index]
    visible = np.zeros_like(std_row, dtype=bool)
    visible[visible_positions] = True

    vals = std_row.copy()
    vals[~visible] = np.nan

    if not add_missing:
        return np.nan_to_num(vals, nan=0.0)

    # the indicator must be computed before the NaNs are replaced
    miss = (~np.isfinite(vals)).astype(float)
    filled = np.nan_to_num(vals, nan=0.0)
    return np.concatenate([filled, miss], axis=0)

print("✅ Feature builder ready.")
# Cell 6 — Dataset builder (samples for train/val)
def build_samples_for_target(target: str, split: str, n_samples: int,
                             X_std: pd.DataFrame, X_orig: pd.DataFrame,
                             rng: np.random.Generator):
    """Build (X, y) by pairing random rows of ``X_std`` with random masks.

    Args:
        target: subject to predict; must be a key of ``col_index``.
        split: "train" or "val" (only validated, not otherwise used).
        n_samples: number of (row, mask) pairs to draw with replacement.
        X_std: standardized scores, one column per subject.
        X_orig: unused; kept so existing callers keep working.
        rng: generator used for both the row draw and the mask draw.

    Returns:
        (X, y) where X stacks one feature vector per sample and y holds the
        standardized target value of the sampled row, or (None, None) when the
        target has no masks, the row pool is empty, or ``n_samples`` <= 0.
    """
    assert split in {"train", "val"}

    if target not in masks_by_target or len(masks_by_target[target]) == 0:
        return None, None
    # rng.integers(0, 0) and np.vstack([]) would raise; treat both as "no samples"
    if n_samples <= 0 or len(X_std) == 0:
        return None, None
    mdf = masks_by_target[target]

    # Same draw order as before (rows, then masks) so seeded runs are unchanged
    row_idx  = rng.integers(0, len(X_std), size=n_samples)
    mask_idx = rng.integers(0, len(mdf), size=n_samples)

    # Convert once instead of calling DataFrame.iloc / .loc for every sample
    pool = X_std.to_numpy(dtype=float)
    kept_lists = mdf["kept_list"].tolist()  # mdf has a 0..n-1 index, so position == label
    t_idx = col_index[target]

    X_list = [
        build_features_from_mask(pool[ri], kept_lists[mi], add_missing=ADD_MISSING_INDICATORS)
        for ri, mi in zip(row_idx, mask_idx)
    ]
    y = pool[row_idx, t_idx]  # target on the standardized scale

    return np.vstack(X_list), y

print("✅ Dataset builder ready.")
# Cell 7 — Grid Search per target 
# Hyper-parameter candidates; the full grid is the Cartesian product of these lists
param_grid = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.03, 0.05, 0.07],
    min_child_weight=[1, 3, 5],
    subsample=[0.7, 0.9],
    colsample_bytree=[0.7, 0.9],
    reg_lambda=[0, 1, 5],
    reg_alpha=[0, 0.5],
)

def iter_param_grid(grid: dict):
    """Yield one dict per combination of the grid's value lists, keys in grid order."""
    names = tuple(grid)
    candidate_lists = [grid[name] for name in names]
    for combo in itertools.product(*candidate_lists):
        yield {name: value for name, value in zip(names, combo)}

def fit_xgb_legacy_safe(model: XGBRegressor, Xtr, ytr, eval_set):
    """Fit ``model``, tolerating ``fit`` signatures that reject ``eval_set``.

    First tries ``fit`` with ``eval_set``; if that raises TypeError the model
    is fitted again without it. Returns whatever ``fit`` returns.
    """
    try:
        fitted = model.fit(Xtr, ytr, eval_set=eval_set, verbose=False)
    except TypeError:
        # fit() did not accept the call with eval_set: plain fit instead
        fitted = model.fit(Xtr, ytr, verbose=False)
    return fitted

def best_iteration_of(model: XGBRegressor) -> int:
    """Return ``model.best_iteration`` as int, else ``n_estimators - 1``.

    Any exception while reading or converting ``best_iteration`` (including
    the attribute being absent) selects the fallback, where a missing
    ``n_estimators`` counts as 1.
    """
    try:
        return int(model.best_iteration)
    except Exception:
        pass
    configured_trees = getattr(model, "n_estimators", 1)
    return int(configured_trees - 1)

def tune_target(target: str):
    """Grid-search XGBoost hyper-parameters for one target.

    Builds one train and one validation sample set, fits a model for every
    combination in ``param_grid`` and returns the record with the lowest
    validation MAE on the standardized scale (the first one wins ties).
    Returns None when either sample set could not be built.
    """
    # Train samples are drawn before validation samples (shared rng)
    Xtr, ytr = build_samples_for_target(target, "train", TRAIN_SAMPLES_PER_TARGET,
                                        X_train_std, X_train_orig, rng)
    Xva, yva = build_samples_for_target(target, "val",   VAL_SAMPLES_PER_TARGET,
                                        X_val_std,   X_val_orig,   rng)
    if Xtr is None or Xva is None:
        return None

    # Factor that converts a standardized MAE back to the original scale
    target_std = float(train_stds[target])
    if target_std == 0:
        target_std = 1.0

    # Settings shared by every candidate; no early stopping, so a fixed tree count
    fixed_settings = dict(
        n_estimators=800,
        objective="reg:squarederror",
        tree_method="hist",
        random_state=RNG_SEED,
        eval_metric="mae",  # set in the constructor rather than in fit()
    )
    validation_pairs = [(Xva, yva)]

    winner = None
    for p in iter_param_grid(param_grid):
        model = XGBRegressor(**fixed_settings, **p)
        fit_xgb_legacy_safe(model, Xtr, ytr, validation_pairs)

        # Validation error on the standardized scale, then rescaled
        mae_std = mean_absolute_error(yva, model.predict(Xva))
        mae_orig = mae_std * target_std

        record = {"target": target}
        record.update(p)
        record.update({
            "best_iteration": best_iteration_of(model),
            "val_mae_std": float(mae_std),
            "val_mae_orig": float(mae_orig),
            "n_features": Xtr.shape[1],
            "train_samples": len(Xtr),
            "val_samples": len(Xva),
        })
        if winner is None or record["val_mae_std"] < winner["val_mae_std"]:
            winner = record
    return winner

print("✅ Tuning utilities ready (compatible with older XGBoost; no early-stopping).")

# Cell 8 — Run tuning for all targets & save Excel
BEST_XLSX.parent.mkdir(parents=True, exist_ok=True)

best_rows = []
targets = list(masks_by_target.keys())
print(f"Total targets to tune: {len(targets)}")

for t in targets:
    # A failure on one target is printed and the loop moves on to the next one,
    # so the saved workbook can contain fewer targets than `targets`.
    try:
        b = tune_target(t)
        if b is not None:
            best_rows.append(b)
            print(f"Done: {t} | best val MAE (std) = {b['val_mae_std']:.4f}")
        else:
            print(f"Skip: {t} (no samples)")
    except Exception as e:
        print(f"Error on target {t}: {e}")

best_df = pd.DataFrame(best_rows)
if len(best_df) > 0:
    # Best (lowest validation MAE) targets first
    best_df.sort_values(["val_mae_std", "target"], inplace=True)
    best_df.to_excel(BEST_XLSX, index=False)
    print(f"\n✅ Saved best params to: {BEST_XLSX.resolve()}")
else:
    print("⚠️ No best rows produced. Kiểm tra lại masks hoặc dữ liệu split.")


✅ Config loaded.
Rows: train=630, val=135 | subjects=31
✅ Standardization ready.
✅ Sampler utilities ready.
✅ Loaded masks for 31 targets.
✅ Feature builder ready.
✅ Dataset builder ready.
✅ Tuning utilities ready (compatible with older XGBoost; no early-stopping).
Total targets to tune: 31
Done: Anten và truyền sóng | best val MAE (std) = 0.7849
Done: Cơ sở kỹ thuật đo lường | best val MAE (std) = 0.7294
Done: Cấu kiện điện tử | best val MAE (std) = 0.7522
Done: Cấu trúc dữ liệu và giải thuật | best val MAE (std) = 0.7440
Done: Giải tích I | best val MAE (std) = 0.7347
Done: Giải tích II | best val MAE (std) = 0.7512
Done: Giải tích III | best val MAE (std) = 0.7128
Done: Kỹ thuật lập trình C/C++ | best val MAE (std) = 0.9196
Done: Kỹ thuật phần mềm ứng dụng | best val MAE (std) = 0.7632
Done: Kỹ thuật vi xử lý | best val MAE (std) = 0.7326
Done: Lý thuyết mạch | best val MAE (std) = 0.8049
Done: Lý thuyết thông tin | best val MAE (std) = 0.7567
Done: Nhập môn kỹ thuật điện tử-viễn th

In [1]:
# ==== TRAIN & EXPORT (no checks, optimized path) ====
from pathlib import Path
import re, json, joblib, itertools
import numpy as np
import pandas as pd
from xgboost import XGBRegressor

# ---- Config ----
DATA_XLSX  = Path("Data_clean/Data_subject_complete.xlsx")     # has a 'split' column
MASKS_CSV  = Path("subject/train_masks_samples.csv")           # previously generated mask combinations
BEST_XLSX  = Path("subject/xgb_best_params.xlsx")              # best params per target
OUTPUT_MODELS_DIR = Path("models_streamlit_xgb")               # where model files are exported
SCALER_PATH = Path("2/scaler.joblib")                  # for inference
SUBJECTS_JSON = Path("3/subjects.json")                # for inference

TRAINVAL_SAMPLES_PER_TARGET_K = 10000   # number of samples per (target, K)
ADD_MISSING_INDICATORS = True
RNG_SEED = 2025
rng = np.random.default_rng(RNG_SEED)

# ---- Helper: safe file name (phong cách bạn) ----
def safe_name(text: str) -> str:
    """Turn ``text`` into a lowercase file-name fragment.

    Each run of backslash, slash, colon, double quote, ``*?<>|`` or space
    characters becomes a single underscore; leading and trailing underscores
    are then removed.
    """
    unsafe_run = r'[\\/:\"*?<>| ]+'
    collapsed = re.sub(unsafe_run, "_", str(text))
    trimmed = collapsed.strip("_")
    return trimmed.lower()

# ---- Load & standardize (z-score with TRAIN statistics) ----
df = pd.read_excel(DATA_XLSX)
subject_cols = df.select_dtypes(include=[np.number]).columns.tolist()
df_tr  = df[df["split"] == "train"].reset_index(drop=True)
df_va  = df[df["split"] == "val"].reset_index(drop=True)
# Final models are fitted on train + val rows together
df_trv = pd.concat([df_tr, df_va], axis=0).reset_index(drop=True)

# Statistics come from TRAIN rows only; a zero std is replaced by 1.0
train_means = df_tr[subject_cols].mean(axis=0)
train_stds  = df_tr[subject_cols].std(axis=0).replace(0, 1.0)
def standardize(df_part: pd.DataFrame) -> pd.DataFrame:
    """Return the subject columns of ``df_part`` z-scored with the TRAIN mean/std."""
    return (df_part[subject_cols] - train_means) / train_stds

X_trv_std = standardize(df_trv)

# Save the scaler & subject list for inference.
# Every output location gets its directory created: SCALER_PATH and
# SUBJECTS_JSON live in different folders, and write_text does not create one.
OUTPUT_MODELS_DIR.mkdir(parents=True, exist_ok=True)
SCALER_PATH.parent.mkdir(parents=True, exist_ok=True)
Path(SUBJECTS_JSON).parent.mkdir(parents=True, exist_ok=True)
joblib.dump({"means": train_means.to_dict(), "stds": train_stds.to_dict()}, SCALER_PATH)
Path(SUBJECTS_JSON).write_text(json.dumps(subject_cols, ensure_ascii=False, indent=2), encoding="utf-8")

# ---- Load masks & arrange by target,K ----
masks_df = pd.read_csv(MASKS_CSV)
def parse_kept(s: str):
    """Split a comma-separated string into stripped, non-empty names."""
    return [x.strip() for x in str(s).split(",") if x.strip()]
masks_df = masks_df[masks_df["target"].isin(subject_cols)].copy()
masks_df["kept_list"] = masks_df["kept_subjects"].apply(parse_kept)

# masks_by_targetK[target][K] -> frame of masks with a fresh 0..n-1 index
masks_by_targetK = {}
for tgt, g in masks_df.groupby("target"):
    dK = {}
    for Kval, gk in g.groupby("K"):
        dK[int(Kval)] = gk.reset_index(drop=True)
    masks_by_targetK[tgt] = dK

# ---- Load best params per target ----
bp = pd.read_excel(BEST_XLSX)
hp_cols = ["max_depth","learning_rate","min_child_weight","subsample","colsample_bytree","reg_lambda","reg_alpha","best_iteration"]
# One dict of hyper-parameters per target; a later row for the same target overwrites an earlier one
best_params_by_target = {
    row["target"]: {h: row[h] for h in hp_cols} for _, row in bp.iterrows()
}

# ---- Builders ----
col_index = {s:i for i,s in enumerate(subject_cols)}
n_base = len(subject_cols)
def build_features_from_mask(std_row: np.ndarray, kept: list[str], add_missing=True):
    """Blank out every subject outside ``kept`` and return the feature vector.

    The result holds the row with hidden/non-finite entries run through
    ``np.nan_to_num`` (NaN becomes 0.0); with ``add_missing`` it is followed by
    an equally long 0/1 indicator of which entries were hidden or non-finite.
    Names in ``kept`` that are unknown to ``col_index`` are skipped.
    """
    shown = np.zeros_like(std_row, dtype=bool)
    shown[[col_index[name] for name in kept if name in col_index]] = True

    vals = std_row.copy()
    vals[~shown] = np.nan

    # indicator first: nan_to_num below erases the NaNs it is derived from
    miss = (~np.isfinite(vals)).astype(float) if add_missing else None
    vals = np.nan_to_num(vals, nan=0.0)
    if add_missing:
        return np.concatenate([vals, miss], axis=0)
    return vals

def build_samples_for_targetK(target: str, K: int, n_samples: int, X_std: pd.DataFrame):
    """Build (X, y) for one (target, K) by pairing random rows with random masks.

    Args:
        target: subject to predict; key of ``masks_by_targetK`` and ``col_index``.
        K: mask size; key of ``masks_by_targetK[target]``.
        n_samples: number of (row, mask) pairs drawn with replacement.
        X_std: standardized scores, one column per subject.

    Returns:
        (X, y): stacked feature vectors and the standardized target values of
        the sampled rows.

    Uses the module-level ``rng``; rows are drawn before masks.
    """
    mdf = masks_by_targetK[target][K]
    ri  = rng.integers(0, len(X_std), size=n_samples)
    mi  = rng.integers(0, len(mdf),   size=n_samples)

    # Convert once: per-sample DataFrame.iloc / .loc lookups dominate the runtime
    pool = X_std.to_numpy(dtype=float)
    kept_lists = mdf["kept_list"].tolist()  # mdf has a 0..n-1 index, so position == label

    X_list = [
        build_features_from_mask(pool[r], kept_lists[m], ADD_MISSING_INDICATORS)
        for r, m in zip(ri, mi)
    ]
    y = pool[ri, col_index[target]]
    return np.vstack(X_list), y

def make_xgb_from_params(p: dict) -> XGBRegressor:
    """Create an XGBRegressor from one row of tuned hyper-parameters.

    ``p["best_iteration"]`` is a 0-based iteration index (the tuning step
    stores either the model's ``best_iteration`` or ``n_estimators - 1``), so
    the number of trees to train is ``best_iteration + 1``. A missing, NaN,
    non-numeric or negative value falls back to 800 trees.

    Args:
        p: mapping with the keys max_depth, learning_rate, min_child_weight,
           subsample, colsample_bytree, reg_lambda, reg_alpha and optionally
           best_iteration.
    """
    default_trees = 800
    try:
        best_iteration = int(p.get("best_iteration"))
    except (TypeError, ValueError, OverflowError):
        # None, NaN (empty Excel cell), inf or text
        best_iteration = None

    if best_iteration is None or best_iteration < 0:
        n_estimators = default_trees
    else:
        # index -> count; index 0 legitimately means a single tree
        n_estimators = best_iteration + 1

    return XGBRegressor(
        n_estimators=n_estimators,
        objective="reg:squarederror",
        tree_method="hist",
        random_state=RNG_SEED,
        eval_metric="mae",
        max_depth=int(p["max_depth"]),
        learning_rate=float(p["learning_rate"]),
        min_child_weight=float(p["min_child_weight"]),
        subsample=float(p["subsample"]),
        colsample_bytree=float(p["colsample_bytree"]),
        reg_lambda=float(p["reg_lambda"]),
        reg_alpha=float(p["reg_alpha"]),
    )

# ---- Train & export: one model per (target, K) ----
index_rows = []
for target in subject_cols:
    p = best_params_by_target.get(target)
    masks_for_target = masks_by_targetK.get(target)
    # Tuning skips targets that failed, and the masks file may not cover every
    # subject; skip those here instead of aborting the export with a KeyError.
    if p is None or not masks_for_target:
        print(f"  ⚠️ Skip {target}: missing best params or masks")
        continue
    for K in sorted(masks_for_target.keys()):
        Xtr, ytr = build_samples_for_targetK(target, K, TRAINVAL_SAMPLES_PER_TARGET_K, X_trv_std)
        model = make_xgb_from_params(p)
        model.fit(Xtr, ytr, verbose=False)
        # File name pattern: xgb_model_{safe_name}.joblib, where 'target__k{K}'
        # goes through safe_name so every K variant gets its own file
        fname = f"xgb_model_{safe_name(f'{target}__k{K}')}.joblib"
        fpath = OUTPUT_MODELS_DIR / fname
        joblib.dump(model, fpath)
        print(f"  ✅ Đã lưu model: {fpath}")
        index_rows.append({"target": target, "K": int(K), "model_path": str(fpath.as_posix())})

# Write the index.csv lookup table (inference finds the model by target, K).
# Explicit columns keep sort_values working even when no model was trained.
index_df = pd.DataFrame(index_rows, columns=["target", "K", "model_path"])
index_df.sort_values(["target","K"]).to_csv(OUTPUT_MODELS_DIR.parent / "index.csv", index=False, encoding="utf-8-sig")
print("\n📇 index.csv đã sẵn sàng:", (OUTPUT_MODELS_DIR.parent / "index.csv").resolve())
print("🧪 scaler:", SCALER_PATH.resolve())
print("📜 subjects:", Path(SUBJECTS_JSON).resolve())


  ✅ Đã lưu model: models_streamlit_xgb\xgb_model_giải_tích_ii__k5.joblib
  ✅ Đã lưu model: models_streamlit_xgb\xgb_model_giải_tích_ii__k6.joblib
  ✅ Đã lưu model: models_streamlit_xgb\xgb_model_giải_tích_ii__k7.joblib
  ✅ Đã lưu model: models_streamlit_xgb\xgb_model_giải_tích_ii__k8.joblib
  ✅ Đã lưu model: models_streamlit_xgb\xgb_model_giải_tích_ii__k9.joblib
  ✅ Đã lưu model: models_streamlit_xgb\xgb_model_giải_tích_ii__k10.joblib
  ✅ Đã lưu model: models_streamlit_xgb\xgb_model_giải_tích_i__k5.joblib
  ✅ Đã lưu model: models_streamlit_xgb\xgb_model_giải_tích_i__k6.joblib
  ✅ Đã lưu model: models_streamlit_xgb\xgb_model_giải_tích_i__k7.joblib
  ✅ Đã lưu model: models_streamlit_xgb\xgb_model_giải_tích_i__k8.joblib
  ✅ Đã lưu model: models_streamlit_xgb\xgb_model_giải_tích_i__k9.joblib
  ✅ Đã lưu model: models_streamlit_xgb\xgb_model_giải_tích_i__k10.joblib
  ✅ Đã lưu model: models_streamlit_xgb\xgb_model_phương_pháp_tính__k5.joblib
  ✅ Đã lưu model: models_streamlit_xgb\xgb_model_ph

In [4]:
# ==== TRAIN MF (ALS-biased) & EXPORT ====
from pathlib import Path
import json, joblib
import numpy as np
import pandas as pd

# ---- Config ----
DATA_XLSX     = Path("Data_clean/Data_subject_complete.xlsx")  # has a 'split' column (unused here; all rows are read)
SCALER_PATH   = Path("2/scaler.joblib")                       # produced by the XGB step
SUBJECTS_JSON = Path("3/subjects.json")                       # produced by the XGB step
OUTPUT_MF     = Path("models_streamlit_mf/find-subject-score.joblib")                           # where the MF artifacts are saved

MF_K       = 20        # number of latent factors (10–50 is usually fine)
MF_LAMBDA  = 0.10      # regularization
MF_EPOCHS  = 10        # number of ALS sweeps (6–12 is usually enough)
MF_SEED    = 2025      # random seed

# ---- Load data & metadata ----
df        = pd.read_excel(DATA_XLSX)
subjects  = json.loads(Path(SUBJECTS_JSON).read_text(encoding="utf-8"))
scaler    = joblib.load(SCALER_PATH)
means     = pd.Series(scaler["means"])
stds      = pd.Series(scaler["stds"]).replace(0, 1.0)

# ---- Standardize the whole score matrix with the TRAIN scaler ----
X = df[subjects].copy()
X_std = (X - means) / stds
R = X_std.values.astype(float)            # N_users x N_items
mask = np.isfinite(R)                     # True where an original value is present
R[~mask] = 0.0                            # NaN -> 0 on the standardized scale (mean ~ 0)
N_users, N_items = R.shape

# ---- Initialize MF parameters ----
# U is drawn before V from the same generator; swapping them changes the seeded result
rng = np.random.default_rng(MF_SEED)
k   = MF_K
lam = MF_LAMBDA
U   = 0.01 * rng.standard_normal((N_users, k))   # user factors
V   = 0.01 * rng.standard_normal((N_items, k))   # item factors
b_u = np.zeros(N_users)                          # user bias
b_i = np.zeros(N_items)                          # item bias
mu  = 0.0                                        # global bias (data is standardized, so ~ 0)

# ---- ALS updates ----
def solve_user(u_idx):
    """Ridge update of one user's factor vector and bias with items held fixed.

    Returns the current ``(U[u_idx], b_u[u_idx])`` unchanged when the user has
    no observed item. Otherwise solves the regularized least-squares problem
    on the observed items and sets the bias to the mean remaining residual.
    """
    observed = mask[u_idx]
    if not observed.any():
        return U[u_idx], b_u[u_idx]

    item_factors = V[observed]                                # [n_obs, k]
    # observed standardized ratings minus global and item bias
    centred = R[u_idx, observed] - mu - b_i[observed]

    gram = item_factors.T @ item_factors + lam * np.eye(k)
    u_new = np.linalg.solve(gram, item_factors.T @ centred)

    residual = centred - item_factors @ u_new
    if residual.size > 0:
        bu_new = residual.mean()
    else:
        bu_new = 0.0
    return u_new, bu_new

def solve_item(i_idx):
    """Ridge update of one item's factor vector and bias with users held fixed.

    Returns the current ``(V[i_idx], b_i[i_idx])`` unchanged when no user has
    an observed value for the item. Otherwise solves the regularized
    least-squares problem on the observing users and sets the bias to the mean
    remaining residual.
    """
    observed = mask[:, i_idx]
    if not observed.any():
        return V[i_idx], b_i[i_idx]

    user_factors = U[observed]                                # [n_obs, k]
    # observed standardized ratings minus global and user bias
    centred = R[observed, i_idx] - mu - b_u[observed]

    gram = user_factors.T @ user_factors + lam * np.eye(k)
    v_new = np.linalg.solve(gram, user_factors.T @ centred)

    residual = centred - user_factors @ v_new
    if residual.size > 0:
        bi_new = residual.mean()
    else:
        bi_new = 0.0
    return v_new, bi_new

for epoch in range(1, MF_EPOCHS + 1):
    # Update U, b_u with the item side held fixed
    for u in range(N_users):
        U[u], b_u[u] = solve_user(u)
    # Update V, b_i with the user side held fixed
    for i in range(N_items):
        V[i], b_i[i] = solve_item(i)
    # Quick monitor: RMSE over observed entries only.
    # R (zero-filled) is used instead of X_std.values: the latter still holds
    # NaN at missing cells and NaN * False is NaN, which would poison the sum.
    pred = (U @ V.T) + mu + b_u[:, None] + b_i[None, :]
    err  = np.where(mask, R - pred, 0.0)
    n_observed = int(mask.sum())
    rmse = np.sqrt((err**2).sum() / n_observed) if n_observed > 0 else float("nan")
    print(f"MF epoch {epoch}/{MF_EPOCHS} | RMSE_std ≈ {rmse:.4f}")

# ---- Save artifacts (enough for inference) ----
OUTPUT_MF.parent.mkdir(parents=True, exist_ok=True)
mf_artifacts = {
    "V": V,                      # item factors (subjects)
    "b_item": b_i,               # item bias
    "mu": float(mu),             # global bias (≈0)
    "k": int(k),
    "lambda": float(lam),
    "subjects": subjects,        # for column mapping
    "train_means": means.to_dict(),
    "train_stds": stds.to_dict(),
    # U / b_u are deliberately not saved to keep the file small; per the original
    # author's note, the user factors are meant to be fitted at inference time
}
joblib.dump(mf_artifacts, OUTPUT_MF)
print(f"\n🎯 Saved MF to: {OUTPUT_MF.resolve()}")


MF epoch 1/10 | RMSE_std ≈ 0.4882
MF epoch 2/10 | RMSE_std ≈ 0.4478
MF epoch 3/10 | RMSE_std ≈ 0.4292
MF epoch 4/10 | RMSE_std ≈ 0.4197
MF epoch 5/10 | RMSE_std ≈ 0.4147
MF epoch 6/10 | RMSE_std ≈ 0.4121
MF epoch 7/10 | RMSE_std ≈ 0.4106
MF epoch 8/10 | RMSE_std ≈ 0.4096
MF epoch 9/10 | RMSE_std ≈ 0.4089
MF epoch 10/10 | RMSE_std ≈ 0.4084

🎯 Saved MF to: C:\Users\vuman\Desktop\AI_Project\Final in HUST\Project\training-find-score-et\2 option\models_streamlit_mf\find-subject-score.joblib


In [1]:
# Cell A — Train GGM (Ledoit-Wolf / GraphicalLassoCV) & export
from pathlib import Path
import numpy as np
import pandas as pd
import joblib, json

from sklearn.covariance import LedoitWolf  # nhanh, ổn định
# from sklearn.covariance import GraphicalLassoCV  # nếu muốn sparse graph

DATA_XLSX  = Path("Data_clean/Data_subject_complete.xlsx")   # has a 'split' column
SCALER_P   = Path("2/scaler.joblib")                         # already produced by the XGB step
SUBJECTS_P = Path("3/subjects.json")                         # already produced by the XGB step
OUT_GGM    = Path("models_streamlit_ggm/ggm.joblib")

# Load data + artifacts
df        = pd.read_excel(DATA_XLSX)
subjects  = json.loads(Path(SUBJECTS_P).read_text(encoding="utf-8"))
scaler    = joblib.load(SCALER_P)
means     = pd.Series(scaler["means"])
stds      = pd.Series(scaler["stds"]).replace(0, 1.0)

# Take the TRAIN rows and z-score them
df_tr = df[df["split"] == "train"].reset_index(drop=True)
X_tr  = df_tr[subjects].copy()
X_std = (X_tr - means) / stds
X_std = X_std.fillna(0.0).values  # fill missing with 0, the mean after z-scoring

# Estimate the covariance
# Option 1: LedoitWolf (recommended, fast)
est = LedoitWolf().fit(X_std)
cov = est.covariance_

# (Optional) Option 2: GraphicalLassoCV (slower, yields a sparse precision matrix)
# est = GraphicalLassoCV().fit(X_std)
# cov = est.covariance_
# precision = est.precision_

# Save artifacts
OUT_GGM.parent.mkdir(parents=True, exist_ok=True)
ggm_art = {
    "cov": cov,                    # covariance of the z-scored TRAIN data; sufficient for conditional prediction
    # "precision": precision,      # if GraphicalLassoCV is used
    "subjects": subjects,
    "train_means": means.to_dict(),
    "train_stds": stds.to_dict(),
}
joblib.dump(ggm_art, OUT_GGM)
print("✅ Saved GGM to:", OUT_GGM.resolve())


✅ Saved GGM to: C:\Users\vuman\Desktop\AI_Project\Final in HUST\Project\training-find-score-et\2 option\models_streamlit_ggm\ggm.joblib


In [2]:
# Cell B — Quick GGM predict (conditional mean)
import numpy as np
import joblib, json
from pathlib import Path
import pandas as pd

GGM_P      = Path("models_streamlit_ggm/ggm.joblib")
SCALER_P   = Path("2/scaler.joblib")
SUBJECTS_P = Path("3/subjects.json")

# Subject order and covariance come from the saved GGM artifact
ggm   = joblib.load(GGM_P)
subs  = ggm["subjects"]
cov   = np.asarray(ggm["cov"])

# Means/stds are read from the scaler file; a zero std is replaced by 1.0
scaler = joblib.load(SCALER_P)
means  = pd.Series(scaler["means"])
stds   = pd.Series(scaler["stds"]).replace(0, 1.0)
idx    = {s:i for i,s in enumerate(subs)}  # subject name -> row/column position in cov

def predict_ggm(user_numeric: dict, target: str):
    """Predict ``target`` as the Gaussian conditional mean given known scores.

    Args:
        user_numeric: {subject -> score on the original scale}. Subjects that
            are not in ``subs`` are ignored; NaN/None values count as missing.
        target: subject to predict.

    Returns:
        (prediction, variance). The prediction is mapped back to the original
        scale; the conditional variance is computed from ``cov`` directly and
        is NOT rescaled by ``stds[target]``, so it stays in z-score units
        (floored at 1e-9). Returns (nan, None) when the target is unknown or
        no observed subject other than the target is available.
    """
    z_scores = []
    observed = []
    for name in subs:
        raw = user_numeric.get(name, np.nan)
        if pd.isna(raw):
            z_scores.append(np.nan)
            continue
        z_scores.append((float(raw) - means[name]) / stds[name])  # z-score
        observed.append(idx[name])

    if target not in idx or not observed:
        return np.nan, None

    t_pos = idx[target]
    # the target's own score, if supplied, must not be used as a predictor
    cond = np.array([pos for pos in observed if pos != t_pos])
    if cond.size == 0:
        return np.nan, None

    cross = cov[t_pos, cond].reshape(1, -1)
    block = cov[np.ix_(cond, cond)]
    x_cond = np.array([z_scores[pos] for pos in cond])

    try:
        block_inv = np.linalg.inv(block)
    except np.linalg.LinAlgError:
        block_inv = np.linalg.pinv(block)

    # the mean is 0 after z-scoring, so the conditional mean is weights @ x
    weights = cross @ block_inv
    y_std = (weights @ x_cond).item()
    cond_var = float(cov[t_pos, t_pos] - (weights @ cross.T).item())

    prediction = y_std * stds[target] + means[target]
    return float(prediction), max(cond_var, 1e-9)

# Example usage: keys that are not in `subs` are ignored by predict_ggm
user = {"Giải tích I": 3.5, "Đại số": 3.0, "Xác suất thống kê": 2.5}
print(predict_ggm(user, "Giải tích II"))


(2.464157843546682, 0.8260493408560979)
