In [1]:
import numpy as np
import pandas as pd
import joblib
from itertools import product

from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares


In [2]:
interaction_matrix = joblib.load("../data/processed/interaction_matrix.pkl")
interactions = pd.read_csv("../data/processed/interactions.csv")


In [3]:
interaction_matrix.shape


(3125, 6034)

In [4]:
test_interactions = (
    interactions
    .groupby("user_idx", group_keys=False)
    .apply(lambda x: x.sample(min(10, len(x)), random_state=42))
    .reset_index(drop=True)
)

train_interactions = interactions.drop(test_interactions.index)


In [5]:
n_items = interactions["movie_idx"].max() + 1
n_users = interactions["user_idx"].max() + 1

train_matrix = csr_matrix(
    (
        np.ones(len(train_interactions)),
        (train_interactions["movie_idx"], train_interactions["user_idx"])
    ),
    shape=(n_items, n_users)
)

user_item_train = train_matrix.T.tocsr()


In [6]:
n_users_model = user_item_train.shape[0]

test_eval = test_interactions[
    test_interactions["user_idx"] < n_users_model
].copy()


In [7]:
def recommend_als(user_idx, model, user_item_matrix, k=50):
    n_users = user_item_matrix.shape[0]

    if user_idx < 0 or user_idx >= n_users:
        return []

    user_items = user_item_matrix[user_idx]

    item_ids, scores = model.recommend(
        userid=user_idx,
        user_items=user_items,
        N=k,
        filter_already_liked_items=True
    )

    return item_ids.tolist()


In [8]:
def hit_rate_at_k(model, user_item_matrix, test_data, k=50):
    hits = 0
    for _, row in test_data.iterrows():
        recs = recommend_als(row["user_idx"], model, user_item_matrix, k)
        hits += int(row["movie_idx"] in recs)
    return hits / len(test_data)


def recall_at_k(model, user_item_matrix, test_data, k=100):
    return hit_rate_at_k(model, user_item_matrix, test_data, k)


def map_at_k(model, user_item_matrix, test_data, k=50):
    ap_sum = 0
    for _, row in test_data.iterrows():
        recs = recommend_als(row["user_idx"], model, user_item_matrix, k)
        if row["movie_idx"] in recs:
            rank = recs.index(row["movie_idx"]) + 1
            ap_sum += 1 / rank
    return ap_sum / len(test_data)


In [9]:
param_grid = {
    "factors": [64, 128],
    "regularization": [0.01, 0.05],
    "alpha": [20, 40],
    "iterations": [20]
}


In [10]:
param_combinations = list(product(
    param_grid["factors"],
    param_grid["regularization"],
    param_grid["alpha"],
    param_grid["iterations"]
))


In [11]:
results = []

for factors, reg, alpha, iters in param_combinations:
    print(f"Training ALS | factors={factors}, reg={reg}, alpha={alpha}")

    model = AlternatingLeastSquares(
        factors=factors,
        regularization=reg,
        iterations=iters,
        random_state=42
    )

    model.fit(user_item_train * alpha)

    hit = hit_rate_at_k(model, user_item_train, test_eval, k=50)
    recall = recall_at_k(model, user_item_train, test_eval, k=100)
    mapk = map_at_k(model, user_item_train, test_eval, k=50)

    results.append({
        "factors": factors,
        "regularization": reg,
        "alpha": alpha,
        "hit@50": hit,
        "recall@100": recall,
        "map@50": mapk
    })


Training ALS | factors=64, reg=0.01, alpha=20


  check_blas_config()


  0%|          | 0/20 [00:00<?, ?it/s]

Training ALS | factors=64, reg=0.01, alpha=40


  0%|          | 0/20 [00:00<?, ?it/s]

Training ALS | factors=64, reg=0.05, alpha=20


  0%|          | 0/20 [00:00<?, ?it/s]

Training ALS | factors=64, reg=0.05, alpha=40


  0%|          | 0/20 [00:00<?, ?it/s]

Training ALS | factors=128, reg=0.01, alpha=20


  0%|          | 0/20 [00:00<?, ?it/s]

Training ALS | factors=128, reg=0.01, alpha=40


  0%|          | 0/20 [00:00<?, ?it/s]

Training ALS | factors=128, reg=0.05, alpha=20


  0%|          | 0/20 [00:00<?, ?it/s]

Training ALS | factors=128, reg=0.05, alpha=40


  0%|          | 0/20 [00:00<?, ?it/s]

In [12]:
results_df = pd.DataFrame(results)
results_df.sort_values("map@50", ascending=False)


Unnamed: 0,factors,regularization,alpha,hit@50,recall@100,map@50
5,128,0.01,40,0.002926,0.004456,0.000361
4,128,0.01,20,0.002926,0.004456,0.000361
6,128,0.05,20,0.002926,0.004456,0.000361
7,128,0.05,40,0.002926,0.004456,0.000361
2,64,0.05,20,0.002926,0.004439,0.000359
0,64,0.01,20,0.002926,0.004439,0.000359
3,64,0.05,40,0.002926,0.004439,0.000358
1,64,0.01,40,0.002926,0.004439,0.000358


In [13]:
best_params = results_df.sort_values("map@50", ascending=False).iloc[0]
best_params


factors           128.000000
regularization      0.010000
alpha              40.000000
hit@50              0.002926
recall@100          0.004456
map@50              0.000361
Name: 5, dtype: float64

In [14]:
best_model = AlternatingLeastSquares(
    factors=int(best_params["factors"]),
    regularization=best_params["regularization"],
    iterations=20,
    random_state=42
)

best_model.fit(user_item_train * best_params["alpha"])


  0%|          | 0/20 [00:00<?, ?it/s]

In [15]:
joblib.dump(best_model, "../models/als_tuned_model.pkl")


['../models/als_tuned_model.pkl']