In [None]:
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# ===== Optional SVD (strongly recommended) =====
# scipy / scikit-learn are optional: if the import fails, HAS_SVD is False and
# fit_svd() returns None, so the SVD features fall back to zeros.
try:
    from scipy.sparse import coo_matrix
    from sklearn.decomposition import TruncatedSVD
    HAS_SVD = True
except Exception:
    HAS_SVD = False

# Seed shared by the NumPy generator below, the user split, TruncatedSVD and LightGBM.
SEED = 42
rng = np.random.default_rng(SEED)

# =========================
# CONFIG
# =========================
TOPK_CAND = 20          # how many leading books to take from candidates.book_id_list
HOLDOUT_K = 5           # last K events per user = "future" for the pseudo-test
MIN_HISTORY = 10        # minimum history length before the holdout
NEG_PER_USER = 15       # number of cold negatives per user (as in the task)
NEG_POOL = 600          # candidate pool size for hard-negative selection
NEG_OVERSAMPLE = 30     # oversampling factor, leaves room for filtering out already-used books

# LGBM
N_EST = 6000
LR = 0.03
NUM_LEAVES = 255

# Rating-aware SVD
USE_SVD = True
SVD_DIM = 128
SVD_PLAN_W = 0.25       # matrix weight for has_read=0
# has_read=1 weight = 1 + rating/10

BAYES_M = 20            # Bayesian prior strength for a book's mean rating


# =========================
# LOAD (same as in your setup)
# =========================
def load_public(base="public/"):
    """Load the train and candidates tables from the CSV files under *base*.

    All ``.csv`` files in the directory are sorted and then picked by
    position: index 3 is read as the candidates table and index 7 as the
    train table, so the directory must contain at least eight CSV files in
    the expected order.

    Returns:
        A ``(train, candidates)`` tuple of DataFrames.
    """
    csv_names = sorted(name for name in os.listdir(base) if name.endswith(".csv"))
    csv_paths = [os.path.join(base, name) for name in csv_names]

    # Positional lookup: relies on the dataset's fixed file ordering.
    candidates = pd.read_csv(csv_paths[3])
    train = pd.read_csv(csv_paths[7])
    return train, candidates


def explode_candidates(candidates: pd.DataFrame) -> pd.DataFrame:
    """Turn per-user comma-separated candidate lists into one row per (user, book).

    Only the first ``TOPK_CAND`` entries of each ``book_id_list`` are kept.
    Empty entries are dropped afterwards, and the remaining books get their
    0-based position within the user (``orig_pos``) plus two transforms of it.

    Returns:
        DataFrame with columns ``user_id``, ``book_id``, ``orig_pos``,
        ``orig_pos_inv`` (``1 / (1 + orig_pos)``) and ``orig_pos_log``
        (``log1p(orig_pos)``).
    """
    lists = candidates[["user_id", "book_id_list"]].copy()
    lists["user_id"] = lists["user_id"].astype(int)

    # Truncate each list to its first TOPK_CAND entries before exploding.
    lists["book_id"] = lists["book_id_list"].astype(str).map(
        lambda raw: raw.split(",")[:TOPK_CAND]
    )
    pairs = lists.explode("book_id")

    pairs["book_id"] = pairs["book_id"].astype(str).str.strip()
    pairs = pairs.loc[pairs["book_id"] != ""].copy()
    pairs["book_id"] = pairs["book_id"].astype(int)

    # Position is counted after empty entries were removed.
    position = pairs.groupby("user_id").cumcount().astype(int)
    pairs["orig_pos"] = position
    pairs["orig_pos_inv"] = 1.0 / (1.0 + pairs["orig_pos"])
    pairs["orig_pos_log"] = np.log1p(pairs["orig_pos"])

    out_cols = ["user_id", "book_id", "orig_pos", "orig_pos_inv", "orig_pos_log"]
    return pairs[out_cols]


# =========================
# PSEUDO-FUTURE SPLIT
# =========================
def split_history_future(train: pd.DataFrame):
    """Split events into per-user history and a held-out "future" tail.

    Rows with unparseable timestamps are dropped, the id/flag/rating columns
    are cast to int, and events are ordered by ``(user_id, timestamp)``.
    Only users with at least ``MIN_HISTORY + HOLDOUT_K`` events are kept; for
    each of them the last ``HOLDOUT_K`` events form the future part.

    Returns:
        ``(history, future)`` DataFrames. Both carry the helper columns
        ``pos_in_user``, ``n_in_user`` and ``is_future``.
    """
    ev = train.copy()
    ev["timestamp"] = pd.to_datetime(ev["timestamp"], errors="coerce")
    ev = ev.dropna(subset=["timestamp"]).copy()

    for col in ("user_id", "book_id", "has_read", "rating"):
        ev[col] = ev[col].astype(int)

    ev = ev.sort_values(["user_id", "timestamp"]).reset_index(drop=True)

    # Keep only users with enough events for both history and holdout.
    min_events = MIN_HISTORY + HOLDOUT_K
    events_per_user = ev.groupby("user_id").size()
    keep_users = events_per_user[events_per_user >= min_events].index
    ev = ev[ev["user_id"].isin(keep_users)].copy()

    by_user = ev.groupby("user_id")
    position = by_user.cumcount()
    total = by_user["book_id"].transform("size")
    ev["pos_in_user"] = position
    ev["n_in_user"] = total
    ev["is_future"] = ev["pos_in_user"] >= (ev["n_in_user"] - HOLDOUT_K)

    held_out = ev["is_future"]
    history = ev[~held_out].copy()
    future = ev[held_out].copy()
    return history, future


# =========================
# FEATURE TABLES (rating-aware)
# =========================
def build_feature_tables(events: pd.DataFrame):
    """Aggregate events into user, book and user-book feature tables.

    Rows with unparseable timestamps are dropped. "Days since last" features
    are measured against the latest timestamp in *events*. Rating statistics
    use only rows with ``has_read == 1``; a rating of 8 or more counts as
    "high".

    Returns:
        ``(u, b, ui)``: the per-user table (keyed by ``user_id``), the
        per-book table (keyed by ``book_id``) and the per-pair table (keyed by
        ``user_id`` and ``book_id``). Each still contains its ``*_last_ts``
        timestamp column.
    """
    ev = events.copy()
    ev["timestamp"] = pd.to_datetime(ev["timestamp"], errors="coerce")
    ev = ev.dropna(subset=["timestamp"]).copy()
    ev = ev.sort_values(["user_id", "timestamp"]).reset_index(drop=True)

    latest = ev["timestamp"].max()
    read = ev[ev["has_read"] == 1].copy()
    # Fall back to the mean over all events when nothing has been read.
    rating_source = read if len(read) else ev
    global_mean = float(rating_source["rating"].mean())
    read_hi = read.assign(hi=(read["rating"] >= 8).astype(int))

    def days_since(last_ts):
        # Age in days relative to the latest event; gaps get the median age.
        age = (latest - last_ts).dt.total_seconds() / 86400.0
        return age.fillna(age.median())

    # ---- user table ----
    u = ev.groupby("user_id").agg(
        u_cnt=("book_id", "count"),
        u_read=("has_read", "sum"),
        u_last_ts=("timestamp", "max"),
    ).reset_index()
    u["u_plan"] = u["u_cnt"] - u["u_read"]
    u["u_read_ratio"] = (u["u_read"] + 1) / (u["u_cnt"] + 2)
    u["u_days_since_last"] = days_since(u["u_last_ts"])

    user_rating = read.groupby("user_id")["rating"].agg(
        u_read_mean="mean", u_read_std="std", u_read_cnt="count"
    ).reset_index()
    user_hi = read_hi.groupby("user_id")["hi"].mean().rename("u_hi_frac").reset_index()
    u = u.merge(user_rating, on="user_id", how="left")
    u = u.merge(user_hi, on="user_id", how="left")
    user_fill = ["u_read_mean", "u_read_std", "u_read_cnt", "u_hi_frac"]
    u[user_fill] = u[user_fill].fillna(0)

    # ---- book table ----
    b = ev.groupby("book_id").agg(
        b_cnt=("user_id", "count"),
        b_read=("has_read", "sum"),
        b_last_ts=("timestamp", "max"),
    ).reset_index()
    b["b_plan"] = b["b_cnt"] - b["b_read"]
    b["b_read_ratio"] = (b["b_read"] + 1) / (b["b_cnt"] + 2)
    b["b_pop_log"] = np.log1p(b["b_cnt"])
    b["b_days_since_last"] = days_since(b["b_last_ts"])

    book_rating = read.groupby("book_id")["rating"].agg(
        b_read_mean="mean", b_read_cnt="count"
    ).reset_index()
    # Shrink the book mean towards the global mean with BAYES_M pseudo-ratings.
    weighted_sum = book_rating["b_read_mean"] * book_rating["b_read_cnt"] + global_mean * BAYES_M
    book_rating["b_bayes_mean"] = weighted_sum / (book_rating["b_read_cnt"] + BAYES_M)

    book_hi = read_hi.groupby("book_id")["hi"].mean().rename("b_hi_frac").reset_index()
    b = b.merge(book_rating, on="book_id", how="left")
    b = b.merge(book_hi, on="book_id", how="left")
    book_fill = ["b_read_mean", "b_read_cnt", "b_bayes_mean", "b_hi_frac"]
    b[book_fill] = b[book_fill].fillna(0)

    # ---- user-book table (only "seen before" + recency, no leakage) ----
    pair_key = ["user_id", "book_id"]
    ui = ev.groupby(pair_key).agg(
        ui_cnt=("timestamp", "count"),
        ui_prev_read=("has_read", "max"),
        ui_last_ts=("timestamp", "max"),
    ).reset_index()
    ui["ui_seen"] = 1
    ui["ui_days_since_last"] = days_since(ui["ui_last_ts"])

    pair_rating = read.groupby(pair_key)["rating"].agg(
        ui_read_mean="mean", ui_read_cnt="count"
    ).reset_index()
    ui = ui.merge(pair_rating, on=pair_key, how="left")
    pair_fill = ["ui_read_mean", "ui_read_cnt"]
    ui[pair_fill] = ui[pair_fill].fillna(0)

    return u, b, ui


# =========================
# Rating-aware SVD (MF-signal)
# =========================
def fit_svd(events: pd.DataFrame):
    """Fit a truncated SVD on a rating-aware user x book interaction matrix.

    Each event contributes ``1 + rating / 10`` when ``has_read == 1`` and
    ``SVD_PLAN_W`` otherwise.

    Args:
        events: DataFrame with ``user_id``, ``book_id``, ``has_read`` and
            ``rating`` columns (all castable to int).

    Returns:
        ``None`` when SVD is disabled, unavailable, or *events* is empty;
        otherwise ``(uid2i, bid2i, U, V)`` where the dicts map raw ids to row
        indices, ``U`` is the float32 output of ``fit_transform`` (one row per
        user) and ``V`` is the transposed float32 ``components_`` (one row per
        book).
    """
    if not (USE_SVD and HAS_SVD):
        return None
    if events.empty:
        # Nothing to factorize; callers treat None as "no embeddings".
        return None

    e = events[["user_id", "book_id", "has_read", "rating"]].astype(int)

    uid = e["user_id"].unique()
    bid = e["book_id"].unique()
    uid2i = {u: i for i, u in enumerate(uid)}
    bid2i = {b: i for i, b in enumerate(bid)}

    u_idx = e["user_id"].map(uid2i).to_numpy()
    b_idx = e["book_id"].map(bid2i).to_numpy()

    val = np.where(
        e["has_read"].to_numpy() == 1,
        1.0 + e["rating"].to_numpy() / 10.0,
        SVD_PLAN_W,
    ).astype(np.float32)

    # NOTE(review): COO -> CSR conversion sums duplicate (user, book) entries,
    # so repeated events for the same pair add up - confirm this is intended.
    mat = coo_matrix((val, (u_idx, b_idx)), shape=(len(uid), len(bid))).tocsr()

    k = int(min(SVD_DIM, max(8, min(mat.shape) - 1)))
    # The floor of 8 above can exceed the number of book columns on tiny
    # matrices, which TruncatedSVD rejects; never request more components
    # than there are columns.
    k = max(1, min(k, mat.shape[1]))

    svd = TruncatedSVD(n_components=k, random_state=SEED)
    U = svd.fit_transform(mat).astype(np.float32)
    V = svd.components_.T.astype(np.float32)
    return uid2i, bid2i, U, V


def add_svd_features(df_pairs: pd.DataFrame, emb):
    """Return a copy of *df_pairs* with ``svd_score`` and ``svd_tanh`` columns.

    Args:
        df_pairs: DataFrame with ``user_id`` and ``book_id`` columns.
        emb: ``None`` or a ``(uid2i, bid2i, U, V)`` tuple as returned by
            ``fit_svd``.

    ``svd_score`` is the dot product of the user's row in ``U`` and the book's
    row in ``V``. Pairs where either id is missing from the index maps score
    0, and when *emb* is ``None`` both columns are 0.0 for every row.
    """
    out = df_pairs.copy()
    if emb is None:
        out["svd_score"] = 0.0
        out["svd_tanh"] = 0.0
        return out

    user_index, book_index, user_vecs, book_vecs = emb

    # Unknown ids map to NaN and become the sentinel -1.
    rows = out["user_id"].map(user_index).fillna(-1).astype(int).to_numpy()
    cols = out["book_id"].map(book_index).fillna(-1).astype(int).to_numpy()
    known = (rows >= 0) & (cols >= 0)

    dots = np.zeros(len(out), dtype=np.float32)
    if known.any():
        dots[known] = np.sum(user_vecs[rows[known]] * book_vecs[cols[known]], axis=1)

    out["svd_score"] = dots
    out["svd_tanh"] = np.tanh(dots).astype(np.float32)
    return out


# =========================
# Hard negatives (top svd_score in a pool)
# =========================
def make_user_used(train_full: pd.DataFrame):
    """Map each ``user_id`` to the set of ``book_id`` values (as ints) it appears with."""
    books_by_user = train_full.groupby("user_id")["book_id"]
    return {user: {int(book) for book in books.tolist()} for user, books in books_by_user}

def sample_hard_negs(uid, used_set, book_pool, p, emb, need=NEG_PER_USER):
    """Pick up to *need* hard negative books for user *uid*.

    Draws ``NEG_POOL * NEG_OVERSAMPLE`` books (with replacement) from
    *book_pool* using probabilities *p*, keeps the first ``NEG_POOL`` distinct
    books that are not in *used_set*, scores them with ``add_svd_features``
    and returns the *need* highest-scoring ones.

    Returns:
        Integer array of book ids; empty when no drawn book survives filtering.
    """
    draws = rng.choice(book_pool, size=NEG_POOL * NEG_OVERSAMPLE, replace=True, p=p)

    # dict.fromkeys de-duplicates while preserving first-seen order.
    distinct = dict.fromkeys(int(book) for book in draws)
    pool = [book for book in distinct if book not in used_set][:NEG_POOL]
    if len(pool) == 0:
        return np.empty(0, dtype=int)

    scored = add_svd_features(pd.DataFrame({"user_id": int(uid), "book_id": pool}), emb)
    hardest = scored.sort_values("svd_score", ascending=False).head(need)
    return hardest["book_id"].to_numpy(dtype=int)


# =========================
# Build LTR dataset
# =========================
def build_rank_dataset(history, future, train_full):
    """Build the labelled (user, book) pairs used to train the ranker.

    Positives come from *future*: label 2 when the pair was read at least
    once, otherwise 1. Read positives are weighted ``1 + rating / 10`` (using
    the maximum future rating); everything else has weight 1. Negatives
    (label 0) are hard negatives sampled per user from the books in *history*
    with probability proportional to the square root of their event count,
    excluding any book the user has in *train_full*.

    Returns:
        ``(rank_df, emb)``: the concatenated positives and negatives with
        columns ``user_id``, ``book_id``, ``label``, ``weight``, and the SVD
        embeddings fitted on *history* (or ``None``).
    """
    out_cols = ["user_id", "book_id", "label", "weight"]

    # Positives: one row per future (user, book) pair.
    fut = future.groupby(["user_id", "book_id"]).agg(
        fut_read=("has_read", "max"),
        fut_rating=("rating", "max"),
    ).reset_index()
    was_read = fut["fut_read"] == 1
    fut["label"] = np.where(was_read, 2, 1).astype(int)
    # The rating is not part of the label, but a read-10 is weighted above a read-1.
    fut["weight"] = np.where(was_read, 1.0 + fut["fut_rating"].astype(float) / 10.0, 1.0)
    pos = fut[out_cols].copy()

    # Negative sampling pool: books from history, sqrt-popularity weighted.
    popularity = history.groupby("book_id").size()
    book_pool = popularity.index.to_numpy()
    draw_p = np.sqrt(popularity.to_numpy(dtype=float))
    draw_p = draw_p / draw_p.sum()

    used_all = make_user_used(train_full)
    emb = fit_svd(history)

    neg_parts = []
    for user in pos["user_id"].unique():
        already_used = used_all.get(int(user), set())
        picked = sample_hard_negs(user, already_used, book_pool, draw_p, emb, need=NEG_PER_USER)
        if not picked.size:
            continue
        neg_parts.append(pd.DataFrame({
            "user_id": int(user),
            "book_id": picked,
            "label": 0,
            "weight": 1.0,
        }))

    if neg_parts:
        neg = pd.concat(neg_parts, ignore_index=True)
    else:
        neg = pd.DataFrame(columns=["user_id", "book_id", "label", "weight"])

    rank_df = pd.concat([pos, neg], ignore_index=True)
    return rank_df, emb


# =========================
# Merge features + train/eval or train_full
# =========================
def make_feature_matrix(pairs_df, u, b, ui, emb, *, use_baseline_pos=True):
    """Join the feature tables onto (user, book) pairs and add SVD/position features.

    Args:
        pairs_df: DataFrame with ``user_id`` and ``book_id`` columns.
        u, b, ui: tables produced by ``build_feature_tables``.
        emb: embeddings from ``fit_svd`` or ``None``.
        use_baseline_pos: when true, ``base_pos`` is the 0-based rank of
            ``svd_score`` within each user (highest score first); otherwise
            the three ``base_pos*`` columns are set to constants.

    Returns:
        ``(df, feature_cols)``: the enriched DataFrame and the ordered list of
        model feature column names. Missing values in features are 0.
    """
    user_cols = ["u_cnt", "u_read", "u_plan", "u_read_ratio", "u_days_since_last",
                 "u_read_mean", "u_read_std", "u_read_cnt", "u_hi_frac"]
    book_cols = ["b_cnt", "b_read", "b_plan", "b_read_ratio", "b_pop_log", "b_days_since_last",
                 "b_read_mean", "b_read_cnt", "b_bayes_mean", "b_hi_frac"]
    pair_cols = ["ui_cnt", "ui_prev_read", "ui_days_since_last", "ui_read_mean", "ui_read_cnt"]

    df = pairs_df.merge(u.drop(columns=["u_last_ts"]), on="user_id", how="left")
    df = df.merge(b.drop(columns=["b_last_ts"]), on="book_id", how="left")
    df = df.merge(ui.drop(columns=["ui_last_ts"]), on=["user_id", "book_id"], how="left")

    # Pairs absent from the feature tables get zeros.
    df["ui_seen"] = df["ui_seen"].fillna(0).astype(int)
    df[pair_cols] = df[pair_cols].fillna(0)
    df[user_cols + book_cols] = df[user_cols + book_cols].fillna(0)

    df = add_svd_features(df, emb)

    if not use_baseline_pos:
        df["base_pos"] = 0
        df["base_pos_inv"] = 1.0
        df["base_pos_log"] = 0.0
    else:
        # Surrogate "baseline order" inside training lists = rank by svd_score.
        svd_rank = df.groupby("user_id")["svd_score"].rank(method="first", ascending=False)
        df["base_pos"] = svd_rank.astype(int) - 1
        df["base_pos_inv"] = 1.0 / (1.0 + df["base_pos"])
        df["base_pos_log"] = np.log1p(df["base_pos"])

    feature_cols = (
        ["base_pos", "base_pos_inv", "base_pos_log", "svd_score", "svd_tanh"]
        + user_cols
        + book_cols
        + ["ui_seen"]
        + pair_cols
    )
    df[feature_cols] = df[feature_cols].fillna(0)
    return df, feature_cols


def train_and_score(rank_df, history):
    """Train a LambdaRank model with a user-level validation split.

    Features are built from *history*, users are split 80/20 into train and
    validation, and an ``LGBMRanker`` is fitted with early stopping
    (200 rounds) on validation NDCG@20. The best iteration and score are
    printed.

    Returns:
        ``(ranker, best_iter, feature_cols)``.
    """
    u, b, ui = build_feature_tables(history)
    emb = fit_svd(history)
    df, feature_cols = make_feature_matrix(rank_df, u, b, ui, emb, use_baseline_pos=True)

    # Split by users so a user's whole list lands on one side.
    train_u, val_u = train_test_split(df["user_id"].unique(), test_size=0.2, random_state=SEED)

    def ranking_arrays(user_ids):
        # Rows are sorted by user so group sizes line up with row order.
        part = (df[df["user_id"].isin(user_ids)].copy()
                .sort_values("user_id", kind="mergesort")
                .reset_index(drop=True))
        groups = part.groupby("user_id").size().to_numpy()
        return part[feature_cols], part["label"].to_numpy(), part["weight"].to_numpy(), groups

    X_train, y_train, w_train, g_train = ranking_arrays(train_u)
    X_val, y_val, w_val, g_val = ranking_arrays(val_u)

    ranker_params = dict(
        objective="lambdarank",
        metric="ndcg",
        ndcg_eval_at=[20],
        n_estimators=N_EST,
        learning_rate=LR,
        num_leaves=NUM_LEAVES,
        min_data_in_leaf=50,
        feature_fraction=0.85,
        bagging_fraction=0.8,
        bagging_freq=1,
        lambda_l2=1.0,
        random_state=SEED,
        n_jobs=-1,
        label_gain=[0, 1, 8],   # read >> plan >> cold
    )
    ranker = lgb.LGBMRanker(**ranker_params)

    ranker.fit(
        X_train,
        y_train,
        group=g_train,
        sample_weight=w_train,
        eval_set=[(X_val, y_val)],
        eval_group=[g_val],
        eval_sample_weight=[w_val],
        callbacks=[lgb.early_stopping(200, verbose=True)],
    )

    best_iter = N_EST if ranker.best_iteration_ is None else ranker.best_iteration_
    best_score = ranker.best_score_.get("valid_0", {}).get("ndcg@20", None)

    print(f"\nVAL best_iteration = {best_iter}")
    print(f"VAL best ndcg@20   = {best_score}\n")

    return ranker, best_iter, feature_cols


def train_full(rank_df, history, best_iter, feature_cols):
    """Refit the ranker on all pairs in *rank_df* for ``best_iter`` trees.

    Features are rebuilt from *history* exactly as in ``train_and_score``,
    but no validation split is held out.

    Returns:
        The fitted ``LGBMRanker``.
    """
    u, b, ui = build_feature_tables(history)
    emb = fit_svd(history)

    full_df, _ = make_feature_matrix(rank_df, u, b, ui, emb, use_baseline_pos=True)
    # Rows are sorted by user so group sizes line up with row order.
    full_df = full_df.sort_values("user_id", kind="mergesort").reset_index(drop=True)

    groups = full_df.groupby("user_id").size().to_numpy()
    labels = full_df["label"].to_numpy()
    weights = full_df["weight"].to_numpy()
    features = full_df[feature_cols]

    ranker_params = dict(
        objective="lambdarank",
        metric="ndcg",
        ndcg_eval_at=[20],
        n_estimators=int(best_iter),
        learning_rate=LR,
        num_leaves=NUM_LEAVES,
        min_data_in_leaf=50,
        feature_fraction=0.85,
        bagging_fraction=0.8,
        bagging_freq=1,
        lambda_l2=1.0,
        random_state=SEED,
        n_jobs=-1,
        label_gain=[0, 1, 8],
        # NOTE(review): LightGBM documents `verbose` as an alias of `verbosity`
        # (a log level), not a logging period - confirm 500 is intended.
        verbose=500,
    )
    ranker = lgb.LGBMRanker(**ranker_params)

    ranker.fit(features, labels, group=groups, sample_weight=weights)
    return ranker


# =========================
# Predict candidates
# =========================
def predict_candidates(model, feature_cols, train_full, candidates):
    """Score every candidate (user, book) pair and return re-ranked lists.

    Feature tables and SVD embeddings are built from the full train set
    (that is what is actually available at test time). Feature assembly is
    delegated to ``make_feature_matrix`` so training and inference share one
    code path; only the ``base_pos*`` features are overridden with the
    candidate list's own ordering.

    Args:
        model: fitted model exposing ``predict``.
        feature_cols: ordered feature names the model was trained on.
        train_full: full training events.
        candidates: table with ``user_id`` and ``book_id_list`` columns.

    Returns:
        DataFrame with ``user_id`` and ``book_id_list`` (comma-separated book
        ids ordered by descending score).
    """
    uF, bF, uiF = build_feature_tables(train_full)
    embF = fit_svd(train_full)

    cand = explode_candidates(candidates)

    # use_baseline_pos=False: the SVD-rank surrogate is not needed here
    # because the real baseline position is available (see below).
    cand, _ = make_feature_matrix(cand, uF, bF, uiF, embF, use_baseline_pos=False)

    # IMPORTANT: at test time the baseline position exists, so use it as base_pos.
    cand["base_pos"] = cand["orig_pos"]
    cand["base_pos_inv"] = cand["orig_pos_inv"]
    cand["base_pos_log"] = cand["orig_pos_log"]

    cand[feature_cols] = cand[feature_cols].fillna(0)
    cand["score"] = model.predict(cand[feature_cols])

    sub = (cand.sort_values(["user_id", "score"], ascending=[True, False])
               .groupby("user_id")["book_id"]
               .apply(lambda s: ",".join(map(str, s.to_list())))
               .reset_index()
               .rename(columns={"book_id": "book_id_list"}))
    return sub


def main():
    """Run the full pipeline: load, split, train, refit and write the submission.

    Reads the CSVs under ``public/``, builds the pseudo-future ranking
    dataset, finds the best iteration on a validation split, refits on all
    pairs and writes the re-ranked candidates to ``submission_validated.csv``.
    """
    train_df, candidates = load_public("public/")

    history, future = split_history_future(train_df)
    rank_df, _ = build_rank_dataset(history, future, train_df)

    # 1) train + validation => best iteration
    _, best_iter, feature_cols = train_and_score(rank_df, history)

    # 2) fit on the full rank_df with best_iter
    final_model = train_full(rank_df, history, best_iter, feature_cols)

    # 3) predict
    sub = predict_candidates(final_model, feature_cols, train_df, candidates)
    out_path = "submission_validated.csv"
    sub.to_csv(out_path, index=False)
    # Report the file that was actually written (the message used to say submission.csv).
    print(f"Saved {out_path}:", sub.shape)

In [None]:
main()