# 04 — Content-Based Recommender (Experiments)
Content-based hybrids for Steam games using BM25-weighted metadata (genres/tags/categories/developers), dense features, and popularity blending. Includes feature-kNN, hybrid alphas, and heavier LightFM sweeps. Shared evaluation (HitRate/Recall/NDCG) matches teammate CF metrics.


## Approach
- Interaction signal: purchases (`playerid`, `appid`).
- Item representation: BM25 for text fields (`tags`/`categories`/`developers`), bucketed price/owners, base dense features; optional TruncatedSVD.
- Models: popularity baseline; content+pop hybrids (alpha sweep); feature-kNN (precomputed neighbors); LightFM hybrid sweep (factors/epochs/loss).
- Evaluation: HitRate/Recall/NDCG via `src.evaluation.evaluate_model`, excluding seen items.


In [1]:
import os

# Switch the working directory to the project root before the `src` imports below
# (presumably so that `from src import ...` and relative paths resolve -- confirm).
# Set the RS_PROJECT_ROOT environment variable to run this notebook on another
# machine; the original author's path is kept as the default.
os.chdir(os.environ.get("RS_PROJECT_ROOT", '/home/alyx/Documents/RS/Project'))

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix, hstack
from tqdm import tqdm

from src import config
from src.evaluation import build_ground_truth, evaluate_model
from src.models.popularity import PopularityRecommender
from src.models.content_based import ContentHybridRecommender


In [3]:
# Experiment params
# Number of eligible users to sample; a falsy value (None/0) keeps every eligible user.
SAMPLE_USERS = 2000
# Minimum number of training interactions a user needs to be eligible.
MIN_INTERACTIONS = 10
# Values passed as `alpha` to ContentHybridRecommender in the hybrid sweep.
ALPHAS = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
# Neighbor counts evaluated by the feature-kNN sweep; neighbors are precomputed for the largest.
KNN_NEIGHBORS = [50, 100, 150]
# CountVectorizer `max_features` caps for the tags / categories / developers text blocks.
VOCAB_TAGS = 4000
VOCAB_CATEGORIES = 2000
VOCAB_DEVELOPERS = 1000
# When True the sweep builds one feature store per entry of SVD_COMPONENTS (TruncatedSVD);
# when False a single feature store without SVD is built.
USE_SVD = True
SVD_COMPONENTS = [128, 256]
# Default BM25 parameters (k1, b) used by bm25_block.
BM25_K1 = 1.6
BM25_B = 0.75
# LightFM heavy sweep (optional)
# Grid over no_components, epochs and loss for the LightFM cell.
LIGHTFM_FACTORS = [32, 64, 128]
LIGHTFM_EPOCHS = [5, 10]
LIGHTFM_LOSSES = ["warp", "bpr"]
# Seed shared by user sampling, TruncatedSVD and LightFM.
RANDOM_STATE = 42


## Load processed data


In [4]:
# Column names come from the shared project config.
USER_COL, ITEM_COL = config.USER_COL, config.ITEM_COL

# All processed artefacts are read from the same directory.
processed_dir = config.PROCESSED_DATA_DIR

train_df = pd.read_parquet(processed_dir / "train_interactions.parquet")
test_df = pd.read_parquet(processed_dir / "test_interactions.parquet")

# Missing item-feature values are replaced with 0.
item_features = pd.read_parquet(processed_dir / "item_features.parquet")
item_features = item_features.fillna(0)

games_meta = pd.read_parquet(processed_dir / "games_metadata.parquet")

print("Raw shapes:", train_df.shape, test_df.shape, item_features.shape, games_meta.shape)


  item_features = pd.read_parquet(config.PROCESSED_DATA_DIR / "item_features.parquet").fillna(0)


Raw shapes: (9117646, 2) (44021, 2) (89618, 4239) (89618, 47)


## Optional sampling for fast iteration


In [5]:
# Users with at least MIN_INTERACTIONS training interactions are eligible.
user_counts = train_df[USER_COL].value_counts()
eligible_users = user_counts[user_counts >= MIN_INTERACTIONS].index

# Decide which users to keep: a seeded random subset, or every eligible user.
use_sample = bool(SAMPLE_USERS)
if use_sample:
    rng = np.random.default_rng(RANDOM_STATE)
    sample_size = min(SAMPLE_USERS, len(eligible_users))
    sampled_users = rng.choice(eligible_users, size=sample_size, replace=False)
    kept_users = sampled_users
else:
    kept_users = eligible_users

# Apply the same user filter to both splits.
train_df = train_df[train_df[USER_COL].isin(kept_users)].copy()
test_df = test_df[test_df[USER_COL].isin(kept_users)].copy()

if use_sample:
    print(f"Sampled {sample_size} users -> train {train_df.shape}, test {test_df.shape}")
else:
    print(f"Using all eligible users -> train {train_df.shape}, test {test_df.shape}")


Sampled 2000 users -> train (454394, 2), test (2000, 2)


## Feature helpers (BM25 + dense + optional SVD)


In [6]:
def bm25_block(series: pd.Series, prefix: str, max_features: int, k1: float = BM25_K1, b: float = BM25_B):
    """Build a BM25-weighted sparse term matrix for one text column.

    Each entry of ``series`` is treated as one document. Missing values become
    empty strings, everything is cast to ``str`` and tokenised by a default
    ``CountVectorizer`` limited to ``max_features`` terms.

    For a stored count ``tf`` of term ``t`` in document ``d`` the weight is::

        tf * (k1 + 1) / (tf + k1 * (1 - b + b * len(d) / avg_len)) * idf(t)
        idf(t) = log((N - df(t) + 0.5) / (df(t) + 0.5)) + 1

    Args:
        series: Text column, one row per item.
        prefix: String prepended to every vocabulary term in the returned names.
        max_features: Vocabulary cap passed to ``CountVectorizer``.
        k1: BM25 term-frequency saturation parameter.
        b: BM25 length-normalisation parameter.

    Returns:
        ``(matrix, names)`` where ``matrix`` is a CSR matrix of shape
        ``(len(series), n_terms)`` and ``names`` the prefixed term names.
    """
    texts = series.fillna("").astype(str).tolist()
    vec = CountVectorizer(max_features=max_features)
    counts = vec.fit_transform(texts).tocsr()
    n_docs = counts.shape[0]

    # Document length = total token count; epsilon avoids division by zero.
    dl = np.asarray(counts.sum(axis=1)).ravel()
    avg_dl = dl.mean() + 1e-8

    # Document frequency per term, computed once.
    doc_freq = np.asarray(counts.astype(bool).sum(axis=0)).ravel()
    idf = np.log((n_docs - doc_freq + 0.5) / (doc_freq + 0.5)) + 1

    # Work on the stored entries only. Adding a dense column vector to the sparse
    # count matrix would materialise a dense (n_docs x n_terms) array.
    length_norm = k1 * (1 - b + b * (dl / avg_dl))
    row_of_entry = np.repeat(np.arange(n_docs), np.diff(counts.indptr))
    tf_vals = counts.data.astype(np.float64)
    weights = tf_vals * (k1 + 1) * (1.0 / (tf_vals + length_norm[row_of_entry])) * idf[counts.indices]

    bm25 = csr_matrix((weights, counts.indices, counts.indptr), shape=counts.shape)
    names = [f"{prefix}{t}" for t in vec.get_feature_names_out()]
    return bm25, names


def build_feature_matrix(base_feats: pd.DataFrame, meta: pd.DataFrame, use_svd: bool, svd_components: Optional[int]):
    """Assemble the L2-normalised item feature matrix.

    Feature blocks are stacked horizontally in this order: the numeric columns
    of ``base_feats``, one-hot price and owner buckets, then BM25 blocks for
    the ``categories``, ``developers`` and ``tags`` metadata columns (each only
    if the column exists). When ``use_svd`` and ``svd_components`` are both
    truthy the stacked matrix is reduced with ``TruncatedSVD`` first.

    Args:
        base_feats: One row per item; must contain ``ITEM_COL``.
        meta: Item metadata keyed by ``ITEM_COL``; reordered to match ``base_feats``.
        use_svd: Whether to apply ``TruncatedSVD``.
        svd_components: Number of SVD components (ignored when falsy).

    Returns:
        ``(items, item_to_idx, matrix)``: item ids in row order, the id -> row
        lookup, and the row-normalised CSR feature matrix.
    """
    item_series = base_feats[ITEM_COL]
    items = item_series.astype(int).tolist()
    # Put metadata rows in the same order as base_feats.
    meta_aligned = meta.set_index(ITEM_COL).reindex(item_series).reset_index()

    def _binned_dummies(values, edges, labels):
        # One-hot encode `values` after cutting them into the given edges.
        return pd.get_dummies(pd.cut(values, bins=edges, labels=labels, include_lowest=True))

    def _owner_lower_bound(val):
        # "low - high" strings yield `low`; other values are cast directly; failures give NaN.
        try:
            if isinstance(val, str) and "-" in val:
                return float(val.split("-")[0].replace(",", "").strip())
            return float(val)
        except Exception:
            return np.nan

    blocks = [csr_matrix(base_feats.drop(columns=[ITEM_COL]).to_numpy(dtype=np.float32))]

    # Price buckets
    price_col = config.PRICE_COL
    if price_col not in meta_aligned.columns:
        price_bins = pd.DataFrame(index=meta_aligned.index)
    else:
        price_edges = [0, 1, 5, 10, 20, 50, 100, np.inf]
        price_labels = [f"price_bin_{i}" for i in range(len(price_edges)-1)]
        prices = pd.to_numeric(meta_aligned[price_col], errors="coerce").fillna(0)
        price_bins = _binned_dummies(prices, price_edges, price_labels)

    # Owner buckets
    if "estimated_owners" not in meta_aligned.columns:
        owner_bins = pd.DataFrame(index=meta_aligned.index)
    else:
        owner_edges = [0, 1e3, 1e4, 1e5, 1e6, 1e7, np.inf]
        owner_labels = [f"owners_bin_{i}" for i in range(len(owner_edges)-1)]
        owners_num = meta_aligned["estimated_owners"].fillna("").apply(_owner_lower_bound)
        owner_bins = _binned_dummies(owners_num, owner_edges, owner_labels)

    extra_dense = pd.concat([price_bins, owner_bins], axis=1).fillna(0)
    blocks.append(csr_matrix(extra_dense.to_numpy(dtype=np.float32)))

    # BM25 text blocks, appended in a fixed order.
    text_fields = (
        ("categories", "cat::", VOCAB_CATEGORIES),
        ("developers", "dev::", VOCAB_DEVELOPERS),
        ("tags", "tag::", VOCAB_TAGS),
    )
    for column, term_prefix, vocab_size in text_fields:
        if column in meta_aligned.columns:
            weighted, _ = bm25_block(meta_aligned[column], prefix=term_prefix, max_features=vocab_size)
            blocks.append(weighted)

    matrix = hstack(blocks).tocsr()

    if use_svd and svd_components:
        reducer = TruncatedSVD(n_components=svd_components, random_state=RANDOM_STATE)
        reduced = normalize(reducer.fit_transform(matrix))
        matrix = csr_matrix(reduced)

    matrix = normalize(matrix, norm="l2", axis=1)
    item_to_idx = dict(zip(items, range(len(items))))
    return items, item_to_idx, matrix


## Prepare features (with SVD options)


In [7]:
# Keep only features/metadata for items that occur in the current train or test split.
items_in_split = set(train_df[ITEM_COL]).union(set(test_df[ITEM_COL]))

feature_mask = item_features[ITEM_COL].isin(items_in_split)
base_feats = item_features.loc[feature_mask].copy().reset_index(drop=True)

meta_mask = games_meta[ITEM_COL].isin(items_in_split)
meta_filtered = games_meta.loc[meta_mask].copy().reset_index(drop=True)


## Evaluation setup


In [8]:
# Ground truth is built from the test split; its keys are the users to evaluate.
ground_truth = build_ground_truth(test_df, user_col=USER_COL, item_col=ITEM_COL)
users_eval = [*ground_truth.keys()]

# Training items per user, handed to evaluate_model as `known_items`.
train_items_by_user = train_df.groupby(USER_COL)[ITEM_COL].apply(list)
known_items_map = train_items_by_user.to_dict()

print(f"Users for eval: {len(users_eval)}")


Users for eval: 2000


## Feature-kNN helper (precomputed neighbors)


In [9]:
class PrecomputedFeatureKNN:
    """Feature-based item kNN recommender with neighbours computed once up front.

    Cosine nearest neighbours of every item are cached at construction time.
    ``recommend`` sums, for each item the user already has, the similarities of
    that item's first ``default_n_neighbors`` neighbours and returns the
    highest-scoring items the user does not have yet.

    Attributes set by ``__init__``:
        max_neighbors: Number of cached neighbours per item, clamped to
            ``[0, n_items - 1]``.
        neighbors / sims: ``(n_items, max_neighbors)`` arrays of neighbour row
            indices and cosine similarities (``1 - cosine distance``).
        default_n_neighbors: How many cached neighbours ``recommend`` uses;
            callers may lower it to sweep neighbourhood sizes without refitting.
    """

    def __init__(self, item_matrix: csr_matrix, item_ids: List[int], item_to_idx: Dict[int, int], max_neighbors: int = 200):
        self.item_matrix = item_matrix
        self.item_ids = item_ids
        self.item_to_idx = item_to_idx
        n_items = item_matrix.shape[0]
        # An item has at most n_items - 1 neighbours other than itself.
        self.max_neighbors = max(0, min(max_neighbors, n_items - 1))
        if self.max_neighbors > 0:
            knn = NearestNeighbors(metric="cosine", n_neighbors=self.max_neighbors)
            knn.fit(item_matrix)
            # Query without X: scikit-learn then returns neighbours of the indexed
            # points and does not count a point as its own neighbour. Passing
            # item_matrix explicitly would spend one slot per item on the item itself.
            distances, neighbors = knn.kneighbors(n_neighbors=self.max_neighbors)
        else:
            # Fewer than two items: there are no neighbours to cache.
            distances = np.zeros((n_items, 0), dtype=np.float64)
            neighbors = np.zeros((n_items, 0), dtype=np.int64)
        self.neighbors = neighbors
        self.sims = 1 - distances
        self.default_n_neighbors = self.max_neighbors

    def recommend(self, user_id: int, known_items: List[int], k: int) -> List[int]:
        """Return up to ``k`` item ids ranked by summed neighbour similarity.

        Args:
            user_id: Unused; present for the shared recommender interface.
            known_items: Items the user already has. Ids missing from
                ``item_to_idx`` are ignored.
            k: Maximum number of recommendations.

        Returns:
            Item ids in descending score order, never including a known item.
            Empty when ``k <= 0`` or no known item is in the catalogue. Fewer
            than ``k`` ids are returned when fewer candidates exist.
        """
        if not known_items:
            return []
        known_idx = [self.item_to_idx[i] for i in known_items if i in self.item_to_idx]
        if not known_idx:
            return []
        scores = np.zeros(self.item_matrix.shape[0], dtype=np.float32)
        n_use = self.default_n_neighbors
        for idx in known_idx:
            neigh = self.neighbors[idx, :n_use]
            sim = self.sims[idx, :n_use]
            scores[neigh] += sim
        # Known items get -inf so they rank below every candidate.
        for idx in known_idx:
            scores[idx] = -np.inf
        # Clamp k to the number of unseen items. This keeps -inf entries out of
        # the result, avoids argpartition's out-of-bounds error for large k, and
        # avoids the `[-0:]` full-slice pitfall for k == 0.
        n_candidates = scores.shape[0] - len(set(known_idx))
        k_eff = min(k, n_candidates)
        if k_eff <= 0:
            return []
        top_idx = np.argpartition(scores, -k_eff)[-k_eff:]
        top_idx = top_idx[np.argsort(scores[top_idx])[::-1]]
        return [self.item_ids[i] for i in top_idx]


## Sweeps (hybrids, kNN, SVD)


In [10]:
all_results = []
svd_grid = SVD_COMPONENTS if USE_SVD else [None]
eval_ks = [5, 10, 20]

# Popularity statistics and the popularity baseline depend only on train_df,
# not on the feature store, so they are computed once instead of per SVD setting.
pop_counts = train_df[ITEM_COL].value_counts()
pop_ranking = pop_counts.index.tolist()
max_pop = pop_counts.max()

pop_model = PopularityRecommender(item_col=ITEM_COL)
pop_model.fit(train_df)
metrics_pop_base = evaluate_model(pop_model, ground_truth, users_eval, ks=eval_ks, known_items=known_items_map)

for svd_comp in tqdm(svd_grid, desc="SVD settings"):
    print(f"=== Feature store (SVD={svd_comp}) ===")
    item_ids, item_to_idx, item_matrix = build_feature_matrix(
        base_feats=base_feats,
        meta=meta_filtered,
        use_svd=USE_SVD,
        svd_components=svd_comp,
    )

    # Popularity score per feature-matrix row, scaled to [0, 1]; items without a row are skipped.
    pop_scores = np.zeros(len(item_ids), dtype=np.float32)
    for iid, count in pop_counts.items():
        idx = item_to_idx.get(iid)
        if idx is not None:
            pop_scores[idx] = count / max_pop

    # Popularity: same metrics for every SVD setting, recorded once per setting
    # so each (model, svd) group in the pivot has its baseline row.
    metrics_pop = metrics_pop_base.copy()
    metrics_pop["model"] = "popularity"
    metrics_pop["svd"] = svd_comp
    all_results.append(metrics_pop)

    # Hybrids
    for alpha in tqdm(ALPHAS, desc=f"Alphas (SVD={svd_comp})"):
        model = ContentHybridRecommender(
            item_ids=item_ids,
            item_to_idx=item_to_idx,
            item_matrix=item_matrix,
            pop_scores=pop_scores,
            pop_ranking=pop_ranking,
            user_col=USER_COL,
            item_col=ITEM_COL,
            alpha=alpha,
        )
        model.fit(train_df)
        metrics = evaluate_model(model, ground_truth, users_eval, ks=eval_ks, known_items=known_items_map)
        metrics["model"] = f"hybrid_alpha_{alpha}"
        metrics["svd"] = svd_comp
        all_results.append(metrics)

    # Feature-kNN: neighbours are precomputed for the largest setting and reused
    # for the smaller ones by lowering default_n_neighbors.
    max_k = max(KNN_NEIGHBORS)
    knn_cache = PrecomputedFeatureKNN(item_matrix=item_matrix, item_ids=item_ids, item_to_idx=item_to_idx, max_neighbors=max_k)
    for n_nb in tqdm(KNN_NEIGHBORS, desc=f"kNN (SVD={svd_comp})"):
        knn_cache.default_n_neighbors = min(n_nb, knn_cache.max_neighbors)
        metrics_knn = evaluate_model(knn_cache, ground_truth, users_eval, ks=eval_ks, known_items=known_items_map)
        metrics_knn["model"] = f"feature_knn_{n_nb}"
        metrics_knn["svd"] = svd_comp
        all_results.append(metrics_knn)

all_results_df = pd.concat(all_results)

# pivot_table groups on `svd` and drops rows whose key is missing, so a None
# label (USE_SVD=False) would leave the pivot empty. Show "no SVD" as 0 in the
# pivot only; all_results_df keeps the original labels.
pivot_source = all_results_df.assign(svd=all_results_df["svd"].map(lambda v: 0 if pd.isna(v) else v))
all_results_pivot = pivot_source.pivot_table(index=["model", "svd"], columns="k", values=["hit_rate", "recall", "ndcg"])
display(all_results_df.head(), all_results_pivot)


SyntaxError: unterminated f-string literal (detected at line 5) (1261904693.py, line 5)

## LightFM heavy sweep (optional, skip if unavailable)


In [None]:
# Optional LightFM sweep. Any failure (including LightFM not being installed) is
# reported and the notebook continues; results gathered before a failure stay
# in all_results_df.
try:
    from lightfm import LightFM
    from lightfm.data import Dataset as LFMDataset

    lfm_ds = LFMDataset()
    lfm_ds.fit(users=train_df[USER_COL].unique(), items=train_df[ITEM_COL].unique())
    interactions, _ = lfm_ds.build_interactions(train_df[[USER_COL, ITEM_COL]].itertuples(index=False, name=None))

    # Use first SVD setting (or None)
    lfm_svd = SVD_COMPONENTS[0] if USE_SVD else None
    item_ids_lfm, item_to_idx_lfm, item_matrix_lfm = build_feature_matrix(
        base_feats=base_feats,
        meta=meta_filtered,
        use_svd=USE_SVD,
        svd_components=lfm_svd,
    )

    user_id_map, user_feature_map, item_id_map, _ = lfm_ds.mapping()
    inv_item_map = {v: k for k, v in item_id_map.items()}
    n_items_lfm = len(item_id_map)

    # LightFM looks up item features by its own internal item index, whereas
    # build_feature_matrix orders rows like base_feats. Re-order the rows so row i
    # holds the features of internal item i; items without features get a zero row.
    source_rows = np.array(
        [item_to_idx_lfm.get(inv_item_map[i], -1) for i in range(n_items_lfm)],
        dtype=np.int64,
    )
    has_features = source_rows >= 0
    row_selector = csr_matrix(
        (
            np.ones(int(has_features.sum()), dtype=np.float32),
            (np.flatnonzero(has_features), source_rows[has_features]),
        ),
        shape=(n_items_lfm, item_matrix_lfm.shape[0]),
    )
    lfm_item_features = (row_selector @ csr_matrix(item_matrix_lfm)).tocsr()
    all_item_idx = np.arange(n_items_lfm)

    class LightFMWrapper:
        """Adapter exposing a fitted LightFM model through ``recommend``."""

        def __init__(self, model):
            self.model = model

        def recommend(self, user_id: int, known_items: List[int], k: int) -> List[int]:
            """Return up to ``k`` unseen item ids ranked by predicted score.

            Users that were not part of the LightFM dataset get an empty list.
            """
            if user_id not in user_id_map:
                return []
            uid = user_id_map[user_id]
            scores = self.model.predict(uid, all_item_idx, item_features=lfm_item_features)
            ranked = np.argsort(-scores)
            recs = []
            known_set = set(known_items)
            for idx in ranked:
                itm = inv_item_map[idx]
                if itm in known_set:
                    continue
                recs.append(itm)
                if len(recs) >= k:
                    break
            return recs

    # Drop LightFM rows left over from an earlier run of this cell so that
    # re-running it does not duplicate results.
    all_results_df = all_results_df[~all_results_df["model"].astype(str).str.startswith("lightfm_")]

    for loss in LIGHTFM_LOSSES:
        for factors in LIGHTFM_FACTORS:
            for epochs in LIGHTFM_EPOCHS:
                print(f"LightFM loss={loss}, factors={factors}, epochs={epochs}")
                model_lfm = LightFM(loss=loss, no_components=factors, random_state=RANDOM_STATE)
                model_lfm.fit(interactions, item_features=lfm_item_features, epochs=epochs, num_threads=4)

                lfm_wrapper = LightFMWrapper(model_lfm)
                metrics_lfm = evaluate_model(lfm_wrapper, ground_truth, users_eval, ks=[5, 10, 20], known_items=known_items_map)
                metrics_lfm["model"] = f"lightfm_{loss}_f{factors}_e{epochs}"
                metrics_lfm["svd"] = lfm_svd
                all_results_df = pd.concat([all_results_df, metrics_lfm])
                # pivot_table drops rows whose `svd` key is missing; show "no SVD" as 0 in the pivot only.
                pivot_source = all_results_df.assign(svd=all_results_df["svd"].map(lambda v: 0 if pd.isna(v) else v))
                all_results_pivot = pivot_source.pivot_table(index=["model", "svd"], columns="k", values=["hit_rate", "recall", "ndcg"])
                display(metrics_lfm)
except Exception as e:
    print("LightFM not available or failed:", e)


## Top models summary


In [None]:
# Rank every evaluated model by one metric at one cutoff and show the best ones.
primary_k = 10
metric = "ndcg"
top_n = 5
summary = (
    all_results_df[all_results_df["k"] == primary_k]
    .sort_values(by=metric, ascending=False)
    .reset_index(drop=True)
)
# The header is derived from the settings above so it cannot go stale when they change.
print(f"Top {top_n} models by {metric.upper()}@{primary_k}:")
display(summary.head(top_n))


## Notes
- For full data, set `SAMPLE_USERS=None` and trim grids if needed.
- BM25 + SVD + hybrid alphas are generally the strongest content baselines; kNN and LightFM are additional comparisons.
- Use the HPC script for large runs; notebook is for experiments/reporting.
