# FM sample

In [None]:
%pip install lightfm #una sola vez

In [1]:
import os
import sys
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation
from lightfm.evaluation import precision_at_k as lightfm_prec_at_k
from lightfm.evaluation import recall_at_k as lightfm_recall_at_k

print("System version: {}".format(sys.version))
print("LightFM version: {}".format(lightfm.__version__))

System version: 3.8.10 (default, Jul 29 2024, 17:02:10) 
[GCC 9.4.0]
LightFM version: 1.17


# LOAD csv

In [6]:
df_train = pd.read_csv("dataset/train.csv")
df_validation = pd.read_csv("dataset/validation.csv")
df_genres = pd.read_csv("dataset/anime_genres.csv")

# Ignore the -1's ratings
df_train_implicit = df_train[df_train['rating'] != -1].copy()

# Sample
df_train_sample = df_train_implicit.sample(5000, random_state=42)

# Ignore the -1's inside the validation df
df_validation = df_validation[df_validation['rating'] != -1]


In [8]:
from lightfm.data import Dataset

# Dataset with features, anime genres = item features
dataset = Dataset()
dataset.fit(df_train_sample['user_id'], df_train_sample['anime_id'])
dataset.fit_partial(items=df_genres['anime_id'], item_features=df_genres.columns[1:].tolist())


In [9]:
# Implicit interactions
(interactions, _) = dataset.build_interactions([
    (row['user_id'], row['anime_id']) for _, row in df_train_sample.iterrows()
])

# anime features
anime_features = []
for _, row in df_genres.iterrows():
    genres = [genre for genre in df_genres.columns[1:] if row[genre] == 1]
    anime_features.append((row['anime_id'], genres))

item_features = dataset.build_item_features(anime_features)


In [10]:
from lightfm import LightFM

# training with warp as the loss function and arbitrary first parameters
model = LightFM(loss='warp', no_components=30, random_state=42)
model.fit(interactions, item_features=item_features, epochs=10, num_threads=4)


<lightfm.lightfm.LightFM at 0x7f049449dc70>

In [11]:
from lightfm.evaluation import precision_at_k

precision = precision_at_k(model, interactions, item_features=item_features, k=10).mean()
print(f"Precision@10 (train sample): {precision:.4f}")


Precision@10 (train sample): 0.0389


# Evaluation on df_validation

In [12]:
# valid users and items inside the sample
valid_users = df_validation['user_id'].isin(df_train_sample['user_id'])
valid_items = df_validation['anime_id'].isin(df_train_sample['anime_id'])
df_val_filtered = df_validation[valid_users & valid_items]

# interactions
(interactions_val, _) = dataset.build_interactions([
    (row['user_id'], row['anime_id']) for _, row in df_val_filtered.iterrows()
])


In [15]:
import numpy as np
from tqdm import tqdm
from sklearn.metrics import ndcg_score, average_precision_score

def get_recommendations(model, user_id, known_items, item_features, N=10):
    n_users, n_items = interactions.shape
    scores = model.predict(user_id, np.arange(n_items), item_features=item_features)
    scores[known_items] = -np.inf  # do not recommend already watched
    top_items = np.argsort(-scores)[:N]
    return top_items

def get_user_index_map():
    user_map, _, _ = dataset.mapping()[0]
    return {v: k for k, v in user_map.items()}

def get_item_index_map():
    item_map, _, _ = dataset.mapping()[2]
    return {v: k for k, v in item_map.items()}

# structures 
user_id_map = dataset.mapping()[0]
item_id_map = dataset.mapping()[2]
reverse_user_map = {v: k for k, v in user_id_map.items()}
reverse_item_map = {v: k for k, v in item_id_map.items()}

# ground truth per user
from collections import defaultdict

val_truth = defaultdict(set)
for _, row in df_val_filtered.iterrows():
    val_truth[user_id_map[row['user_id']]].add(item_id_map[row['anime_id']])

# metrics
recall_list, map_list, ndcg_list = [], [], []
k = 10

for user in tqdm(val_truth):
    relevant = np.zeros(len(item_id_map))
    for item in val_truth[user]:
        relevant[item] = 1

    # get recommendations
    known_items = interactions.tocsr()[user].nonzero()[1]
    preds = model.predict(user, np.arange(len(item_id_map)), item_features=item_features)
    preds[known_items] = -np.inf
    top_k = np.argsort(-preds)[:k]

    # binary ground truth for top_k
    hits = [1 if i in val_truth[user] else 0 for i in top_k]
    recall_list.append(np.sum(hits) / len(val_truth[user]))
    # Ensure a float number instead of minus infinite
    preds_safe = np.copy(preds)
    preds_safe[~np.isfinite(preds_safe)] = -1e9

    map_list.append(average_precision_score(relevant, preds_safe))
    ndcg_list.append(ndcg_score([relevant], [preds_safe]))


print(f"Recall@{k}: {np.mean(recall_list):.4f}")
print(f"MAP@{k}: {np.mean(map_list):.4f}")
print(f"nDCG@{k}: {np.mean(ndcg_list):.4f}")


100%|██████████| 401/401 [00:07<00:00, 56.13it/s]

Recall@10: 0.0075
MAP@10: 0.0050
nDCG@10: 0.0983





In [17]:
from math import log

# popularity (for novelty)
anime_counts = df_train_sample['anime_id'].value_counts().to_dict()
total_users = df_train_sample['user_id'].nunique()
anime_popularity = {
    item_id_map[aid]: count / total_users
    for aid, count in anime_counts.items()
    if aid in item_id_map
}

# calc
diversity_list = []
novelty_list = []

for user in tqdm(val_truth):
    known_items = interactions.tocsr()[user].nonzero()[1]
    preds = model.predict(user, np.arange(len(item_id_map)), item_features=item_features)
    preds[known_items] = -np.inf
    top_k = np.argsort(-preds)[:k]

    # diversity (unique genres)
    genres_reco = set()
    for item in top_k:
        anime_id = reverse_item_map[item]
        anime_row = df_genres[df_genres['anime_id'] == anime_id]
        if not anime_row.empty:
            for col in df_genres.columns[1:]:
                if anime_row.iloc[0][col] == 1:
                    genres_reco.add(col)
    diversity_list.append(len(genres_reco))

    # Novelty
    novelty = 0
    for item in top_k:
        pop = anime_popularity.get(item, 1e-6)
        novelty += log(1 / pop)
    novelty_list.append(novelty / k)

print(f"Diversity (prom.): {np.mean(diversity_list):.2f}")
print(f"Novelty (prom.): {np.mean(novelty_list):.4f}")


100%|██████████| 401/401 [00:07<00:00, 56.15it/s]

Diversity (prom.): 6.16
Novelty (prom.): 8.6253





In [18]:
# Complete dataset
df_full = df_train[df_train['rating'] != -1].copy()

# dataset modified to use in lightFM
dataset = Dataset()
dataset.fit(df_full['user_id'], df_full['anime_id'])
dataset.fit_partial(items=df_genres['anime_id'], item_features=df_genres.columns[1:].tolist())

# interactions
(interactions_full, _) = dataset.build_interactions([
    (row['user_id'], row['anime_id']) for _, row in df_full.iterrows()
])

# features
anime_features = []
for _, row in df_genres.iterrows():
    genres = [genre for genre in df_genres.columns[1:] if row[genre] == 1]
    anime_features.append((row['anime_id'], genres))

item_features_full = dataset.build_item_features(anime_features)


In [19]:
model_full = LightFM(loss='warp', no_components=30, random_state=42)
model_full.fit(interactions_full, item_features=item_features_full, epochs=10, num_threads=4)


<lightfm.lightfm.LightFM at 0x7f0491b7f490>