In [1]:
import numpy as np
import pandas as pd
import joblib

from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load artifacts created in Notebook 02
interaction_matrix = joblib.load("../data/processed/interaction_matrix.pkl")
interactions = pd.read_csv("../data/processed/interactions.csv")

interaction_matrix.shape


(3125, 6034)

In [3]:
test_interactions = (
    interactions
    .groupby("user_idx", group_keys=False)
    .apply(lambda x: x.sample(1, random_state=42))
)

train_interactions = interactions.drop(test_interactions.index)

len(train_interactions), len(test_interactions)


(568342, 6034)

In [4]:
train_matrix = csr_matrix(
    (
        np.ones(len(train_interactions)),
        (train_interactions["movie_idx"], train_interactions["user_idx"])
    ),
    shape=interaction_matrix.shape
)

train_matrix


<3125x6034 sparse matrix of type '<class 'numpy.float64'>'
	with 568342 stored elements in Compressed Sparse Row format>

In [5]:
item_similarity = cosine_similarity(
    train_matrix,
    dense_output=False
)

item_similarity.shape


(3125, 3125)

In [6]:
def recommend_movies_item_based(
    user_idx,
    train_matrix,
    item_similarity,
    top_k=10
):
    # User interaction vector: (movies × 1)
    user_vector = train_matrix[:, user_idx]

    # Score all movies
    scores = item_similarity.dot(user_vector).toarray().ravel()

    # Remove already interacted movies
    interacted_items = user_vector.nonzero()[0]
    scores[interacted_items] = -1

    # Top-K movie indices
    return np.argsort(scores)[::-1][:top_k]


In [7]:
def hit_rate_at_k(train_matrix, test_interactions, item_similarity, k=10):
    hits = 0

    for _, row in test_interactions.iterrows():
        user = row["user_idx"]
        true_item = row["movie_idx"]

        recs = recommend_movies_item_based(
            user, train_matrix, item_similarity, top_k=k
        )

        if true_item in recs:
            hits += 1

    return hits / len(test_interactions)


In [8]:
def recall_at_k(train_matrix, test_interactions, item_similarity, k=10):
    recall_sum = 0

    for _, row in test_interactions.iterrows():
        user = row["user_idx"]
        true_item = row["movie_idx"]

        recs = recommend_movies_item_based(
            user, train_matrix, item_similarity, top_k=k
        )

        recall_sum += int(true_item in recs)

    return recall_sum / len(test_interactions)


In [9]:
def map_at_k(train_matrix, test_interactions, item_similarity, k=10):
    ap_sum = 0

    for _, row in test_interactions.iterrows():
        user = row["user_idx"]
        true_item = row["movie_idx"]

        recs = recommend_movies_item_based(
            user, train_matrix, item_similarity, top_k=k
        )

        if true_item in recs:
            rank = np.where(recs == true_item)[0][0] + 1
            ap_sum += 1 / rank

    return ap_sum / len(test_interactions)


In [10]:
hit_10 = hit_rate_at_k(train_matrix, test_interactions, item_similarity, k=10)
recall_10 = recall_at_k(train_matrix, test_interactions, item_similarity, k=10)
map_10 = map_at_k(train_matrix, test_interactions, item_similarity, k=10)

hit_10, recall_10, map_10


(0.16506463374212793, 0.16506463374212793, 0.07333285982606498)

In [11]:
movie_encoder = joblib.load("../models/movie_encoder.pkl")

sample_user = 0

recommended_item_indices = recommend_movies_item_based(
    sample_user,
    train_matrix,
    item_similarity,
    top_k=10
)

recommended_movie_ids = movie_encoder.inverse_transform(
    recommended_item_indices
)


In [12]:
movies = pd.read_csv(
    "../data/raw/movies.dat",
    sep="::",
    engine="python",
    encoding="latin-1",
    names=["movie_id", "title", "genres"]
)

movies[movies["movie_id"].isin(recommended_movie_ids)][
    ["movie_id", "title", "genres"]
]


Unnamed: 0,movie_id,title,genres
315,318,"Shawshank Redemption, The (1994)",Drama
360,364,"Lion King, The (1994)",Animation|Children's|Musical
589,593,"Silence of the Lambs, The (1991)",Drama|Thriller
1178,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
1179,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance
1180,1198,Raiders of the Lost Ark (1981),Action|Adventure
1192,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
1245,1265,Groundhog Day (1993),Comedy|Romance
2502,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
2647,2716,Ghostbusters (1984),Comedy|Horror


In [13]:
joblib.dump(item_similarity, "../models/item_similarity.pkl")


['../models/item_similarity.pkl']

#  Baseline Limitations

No latent factors

No user embeddings

Poor cold-start handling

O(n²) item similarity cost

Used only as baseline reference