In [1]:
import numpy as np
import pandas as pd
import joblib

from scipy.sparse import csr_matrix


In [2]:
# Load trained ALS model (from Notebook 05)
als_model = joblib.load("../models/als_tuned_model.pkl")

# Load interaction data
interactions = pd.read_csv("../data/processed/interactions.csv")

# Load item-user matrix (USER × ITEM)
item_user_train = joblib.load("../data/processed/item_user_train.pkl")

# Load encoders
movie_encoder = joblib.load("../models/movie_encoder.pkl")
user_encoder = joblib.load("../models/user_encoder.pkl")


In [15]:
# One held-out interaction per user
test_interactions = (
    interactions
    .groupby("user_idx", group_keys=False)
    .apply(lambda x: x.sample(1, random_state=42))
    .reset_index(drop=True)
)

# Keep only users present in ALS matrix
n_users_model = item_user_train.shape[0]
test_interactions = test_interactions[
    test_interactions["user_idx"] < n_users_model
].copy()

# Filter very weak users (important)
min_interactions = 5
user_counts = interactions["user_idx"].value_counts()
valid_users = user_counts[user_counts >= min_interactions].index

test_interactions = test_interactions[
    test_interactions["user_idx"].isin(valid_users)
]

print("Evaluation samples:", len(test_interactions))


Evaluation samples: 6034


In [16]:
movies = pd.read_csv(
    "../data/raw/movies.dat",
    sep="::",
    engine="python",
    encoding="latin-1",
    names=["movie_id", "title", "genres"]
)

movies["genres"] = movies["genres"].str.split("|")

# Map movie_id → ALS index
movie_id_to_idx = {
    movie_id: idx
    for idx, movie_id in enumerate(movie_encoder.classes_)
}

movies = movies[movies["movie_id"].isin(movie_id_to_idx)]
movies["movie_idx"] = movies["movie_id"].map(movie_id_to_idx)



In [17]:
all_genres = sorted({g for gs in movies["genres"] for g in gs})
genre_to_idx = {g: i for i, g in enumerate(all_genres)}

n_items = item_user_train.shape[1]
genre_matrix = np.zeros((n_items, len(all_genres)), dtype=np.float32)

for _, row in movies.iterrows():
    item_idx = row["movie_idx"]
    for g in row["genres"]:
        genre_matrix[item_idx, genre_to_idx[g]] = 1.0



In [18]:
def recommend_als_rerank_with_content(
    user_idx,
    als_model,
    user_item_matrix,
    genre_matrix,
    alpha=0.7,
    k=50
):
    n_users = user_item_matrix.shape[0]
    if user_idx < 0 or user_idx >= n_users:
        return []

    user_items = user_item_matrix[user_idx]

    # ALS candidates
    item_ids, als_scores = als_model.recommend(
        userid=user_idx,
        user_items=user_items,
        N=k,
        filter_already_liked_items=True
    )

    if len(item_ids) == 0:
        return []

    # User genre profile (SUM, not mean)
    user_item_indices = user_items.indices
    if len(user_item_indices) == 0:
        return item_ids.tolist()

    user_genre_profile = genre_matrix[user_item_indices].sum(axis=0)

    if np.all(user_genre_profile == 0):
        return item_ids.tolist()

    # Normalize
    user_genre_profile /= (np.linalg.norm(user_genre_profile) + 1e-8)

    # Genre similarity
    genre_scores = genre_matrix[item_ids] @ user_genre_profile

    # Normalize both scores
    als_scores = (als_scores - als_scores.min()) / (als_scores.ptp() + 1e-8)
    genre_scores = (genre_scores - genre_scores.min()) / (genre_scores.ptp() + 1e-8)

    final_scores = alpha * als_scores + (1 - alpha) * genre_scores
    ranked_items = item_ids[np.argsort(-final_scores)]

    return ranked_items.tolist()


In [19]:
def evaluate_als_rerank_with_content(
    als_model,
    user_item_matrix,
    genre_matrix,
    test_interactions,
    alpha=0.7,
    k=50
):
    hits = 0
    ap_sum = 0

    for _, row in test_interactions.iterrows():
        user = row["user_idx"]
        true_item = row["movie_idx"]

        recs = recommend_als_rerank_with_content(
            user,
            als_model,
            user_item_matrix,
            genre_matrix,
            alpha=alpha,
            k=k
        )

        if true_item in recs:
            hits += 1
            rank = recs.index(true_item) + 1
            ap_sum += 1 / rank

    hit_rate = hits / len(test_interactions)
    map_k = ap_sum / len(test_interactions)

    return {
        "hit@k": hit_rate,
        "map@k": map_k
    }


In [21]:
hybrid_metrics = evaluate_als_rerank_with_content(
    als_model,
    item_user_train,
    genre_matrix,
    test_interactions,
    alpha=0.7,
    k=50
)

hybrid_metrics


{'hit@k': 0.0009943652635067948, 'map@k': 0.00010357313845878645}

In [22]:
for alpha in [0.3, 0.5, 0.7]:
    metrics = evaluate_als_rerank_with_content(
        als_model,
        item_user_train,
        genre_matrix,
        test_interactions,
        alpha=alpha,
        k=50
    )
    print(f"alpha={alpha} → {metrics}")


alpha=0.3 → {'hit@k': 0.0009943652635067948, 'map@k': 0.00010357313845878645}
alpha=0.5 → {'hit@k': 0.0009943652635067948, 'map@k': 0.00010357313845878645}
alpha=0.7 → {'hit@k': 0.0009943652635067948, 'map@k': 0.00010357313845878645}


In [23]:
sample_user = 0

rec_item_indices = recommend_als_rerank_with_content(
    sample_user,
    als_model,
    item_user_train,
    genre_matrix,
    alpha=0.7,
    k=10
)

rec_movie_ids = movie_encoder.inverse_transform(rec_item_indices)

movies_df = pd.read_csv(
    "../data/raw/movies.dat",
    sep="::",
    engine="python",
    encoding="latin-1",
    names=["movie_id", "title", "genres"]
)

movies_df[movies_df["movie_id"].isin(rec_movie_ids)][
    ["movie_id", "title", "genres"]
]


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller
