# 04 — Content-Based Recommender Template
Starter template for teammate 1. It builds user profiles from item features and reuses the shared evaluation utilities from `src.evaluation`.


In [None]:
from typing import List

import numpy as np
import pandas as pd

from src import config
from src.evaluation import build_ground_truth, evaluate_topk

# Column names are defined once in the project config.
USER_COL, ITEM_COL = config.USER_COL, config.ITEM_COL

# Load the train/test interaction tables and the item feature table.
processed_dir = config.PROCESSED_DATA_DIR
train_df, test_df, item_features = (
    pd.read_parquet(processed_dir / parquet_name)
    for parquet_name in (
        "train_interactions.parquet",
        "test_interactions.parquet",
        "item_features.parquet",
    )
)

# Items each user interacted with in the training split
# (user id -> list of item ids).
user_to_items_train = (
    train_df.groupby(USER_COL)[ITEM_COL]
    .apply(list)
    .to_dict()
)

# Ground truth is derived from the test split; the users to evaluate are the
# keys of that mapping.
ground_truth = build_ground_truth(test_df, user_col=USER_COL, item_col=ITEM_COL)
users = [*ground_truth.keys()]


In [None]:
class ContentBasedRecommender:
    """Content-based top-k recommender built on item feature vectors.

    A user profile is the mean feature vector of the items the user
    interacted with. Candidate items are scored by the dot product of their
    feature vector with that profile. Users without a profile (cold start)
    are scored against the mean feature vector of the whole catalogue.

    Attributes:
        user_col: Name of the user id column in interaction frames.
        item_col: Name of the item id column.
        item_features_df: Item features, indexed by item id.
        user_profiles: Mapping from user id to the profile ``pd.Series``.
    """

    def __init__(self, item_features_df: pd.DataFrame, user_col: str, item_col: str):
        """Index the feature table by item id and start with no profiles.

        Args:
            item_features_df: Frame holding ``item_col`` plus the feature
                columns used for scoring.
            user_col: Name of the user id column in interaction frames.
            item_col: Name of the item id column.
        """
        self.user_col = user_col
        self.item_col = item_col
        self.item_features_df = item_features_df.set_index(item_col)
        self.user_profiles = {}

    def fit(self, train_interactions: pd.DataFrame) -> None:
        """Build one profile per user from the training interactions.

        Interactions with items that have no feature row are ignored, and
        users left without any usable item get no profile (they are served
        by the cold-start path in ``recommend``).

        Args:
            train_interactions: Frame with ``user_col`` and ``item_col``.
        """
        item_index = self.item_features_df.index
        for user, user_rows in train_interactions.groupby(self.user_col):
            available = [itm for itm in user_rows[self.item_col] if itm in item_index]
            if not available:
                continue
            # Repeated interactions appear repeatedly in `available`, so they
            # weigh proportionally more in the mean.
            self.user_profiles[user] = self.item_features_df.loc[available].mean(axis=0)

    def recommend(self, user_id: int, known_items: List[int], k: int) -> List[int]:
        """Return up to ``k`` item ids for ``user_id``, best score first.

        Items listed in ``known_items`` are never returned. If fewer than
        ``k`` unseen items exist, the result is shorter than ``k``.

        Args:
            user_id: User to recommend for.
            known_items: Item ids to exclude from the result.
            k: Maximum number of recommendations; non-positive values yield
                an empty list.

        Returns:
            Item ids taken from the feature table index.
        """
        if k <= 0:
            return []

        profile = self.user_profiles.get(user_id)
        if profile is None:
            # Cold start: score against the catalogue-wide mean feature vector.
            query = self.item_features_df.mean(axis=0).to_numpy()
        else:
            query = profile.to_numpy()

        scores = self.item_features_df.to_numpy() @ query
        return self._top_k_unseen(scores, known_items, k)

    def _top_k_unseen(self, scores: np.ndarray, known_items: List[int], k: int) -> List[int]:
        """Rank items by descending score and keep the first ``k`` unseen ones."""
        item_ids = self.item_features_df.index
        ranked = scores.argsort()[::-1]
        # Drop known items by id rather than by a sentinel score, so they can
        # never leak into the result when k exceeds the number of unseen items.
        seen = item_ids.isin(list(known_items))
        ranked = ranked[~seen[ranked]]
        return [item_ids[i] for i in ranked[:k]]


In [None]:
# Build the recommender on the item feature table, then learn user profiles
# from the training interactions.
content_model = ContentBasedRecommender(
    item_features_df=item_features,
    user_col=USER_COL,
    item_col=ITEM_COL,
)
content_model.fit(train_interactions=train_df)

def content_recommend(user_id, k):
    """Return the content model's top-``k`` items for ``user_id``.

    Items the user interacted with in the training split are passed as known
    items; users absent from the training lookup get an empty known list.
    """
    return content_model.recommend(
        user_id,
        user_to_items_train.get(user_id, []),
        k,
    )

# Run the shared top-k evaluation, passing the ground truth, the recommend
# callable defined above, and the list of users.
# NOTE(review): the metrics and cutoff used are decided inside
# `src.evaluation.evaluate_topk` — confirm there.
content_results = evaluate_topk(ground_truth, content_recommend, users)
# Bare expression so the notebook cell displays the result.
content_results


TODO: refine item features, add normalization/regularization, and experiment with different similarity measures.
