In [54]:
import pandas as pd
import numpy as np
import implicit
import scipy.sparse as sparse
from collections import Counter
import math

In [55]:
# Primero creamos el dataframe con los datos
df_train = pd.read_csv('dataset/train.csv',
                         sep=',',
                         names = ['user_id','item_id','rating'],
                         header=0)

# rating >= 5 , relevante (1) y rating menor a 5 es no relevante (0)
df_train.rating = [1 if x >=5 else 0 for x in df_train.rating ]

In [56]:
df_train.head()

Unnamed: 0,user_id,item_id,rating
0,40748,9926,0
1,35757,79,1
2,18266,51,0
3,31006,8795,1
4,68084,14837,1


In [57]:
# Cargamos el dataset con los items
df_items = pd.read_csv('dataset/anime.csv',
                        sep=',',
                        names = ['anime_id','name','genre','type','episodes','rating','members'],
                        header=0)

In [58]:
df_items.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [59]:
# Cargamos el dataset de testing
df_test = pd.read_csv('dataset/validation.csv',
                      sep=',',
                      names=['userid', 'itemid', 'rating'],
                      header=0)


# rating >= 5 es relevante (1) y rating menor a 5 es no relevante (0)
df_test.rating = [1 if x >=5 else 0 for x in df_test.rating ]


user_items_test = {}

for row in df_test.itertuples():
    if row[1] not in user_items_test:
        user_items_test[row[1]] = []

    user_items_test[row[1]].append(row[2])

In [60]:
df_test.head()

Unnamed: 0,userid,itemid,rating
0,25810,4789,1
1,39393,6275,1
2,2282,8675,1
3,18797,237,1
4,39248,2472,1


In [61]:
# Definicion de métricas (No editar)
# Inspirado parcialmente en https://gist.github.com/bwhite/3726239

def precision_at_k(r, k):
    assert 1 <= k <= r.size
    return (np.asarray(r)[:k] != 0).mean()

def average_precision_at_k(r, k):
    r = np.asarray(r)
    score = 0.
    for i in range(min(k, r.size)):
        score += precision_at_k(r, i + 1)
    return score / k

def dcg_at_k(r, k):
    r = np.asarray(r)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.


def ndcg_at_k(r, k):
    idcg = dcg_at_k(sorted(r, reverse=True), k)

    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg

def recall_at_k(r, k):
    return np.sum(np.asarray(r)[:k] != 0) / min(k, len(r))

def diversity(recommendations):
    all_recs = [item for recs in recommendations.values() for item in recs]
    unique_items = set(all_recs)
    total_recs = len(all_recs)
    return len(unique_items) / total_recs if total_recs > 0 else 0

def calculate_novelty(recommendations, train_df=df_train):
    """Calcula la novelty como el promedio de -log(popularidad) de los ítems recomendados."""
    item_counts = Counter(train_df['item_id'])
    total_interactions = sum(item_counts.values())

    novelty_scores = []
    for user, rec_items in recommendations.items():
        for item in rec_items:
            count = item_counts.get(item, 1)
            prob = count / total_interactions
            novelty = -math.log(prob)
            novelty_scores.append(novelty)

    return np.mean(novelty_scores) if novelty_scores else 0


In [62]:
# Preprocesamiento de los datos a formato sparse
# Change data to map index from 0 
unique_user_ids = df_train['user_id'].unique()
unique_item_ids = df_train['item_id'].unique()

user_id_map = {uid: idx for idx, uid in enumerate(unique_user_ids)}
item_id_map = {iid: idx for idx, iid in enumerate(unique_item_ids)}

rows = []
cols = []
data = []

for row in df_train.itertuples(index=False):
    user_idx = user_id_map[row.user_id]
    item_idx = item_id_map[row.item_id]

    rows.append(user_idx)
    cols.append(item_idx)
    data.append(1) 

user_item_matrix = sparse.csr_matrix((data, (rows, cols)), shape=(len(user_id_map), len(item_id_map)))

In [63]:
def evaluate_model(model, n):
    mean_ap = 0.  
    mean_ndcg = 0.
    mean_recall = 0.
    recommendations = {}  

    for u in user_items_test.keys():
        if u not in user_id_map:
            continue  

        u_idx = user_id_map[u] 
        rec = model.recommend(u_idx, user_item_matrix[u_idx], n)[0]
        rec_items = [item for item in rec]  
        recommendations[u] = rec_items
        
        rel_vector = [np.isin(user_items_test[u], rec_items, assume_unique=True).astype(int)]

        mean_ap += average_precision_at_k(rel_vector, n)
        mean_ndcg += ndcg_at_k(rel_vector, n)
        mean_recall += recall_at_k(rel_vector, n)

    mean_ap /= len(user_items_test)
    mean_ndcg /= len(user_items_test)
    mean_recall /= len(user_items_test)

    diversity_score = diversity(recommendations)
    novelty_score = calculate_novelty(recommendations)

    return mean_ap, mean_ndcg, mean_recall, diversity_score, novelty_score

def show_recommendations(model, user, n):
  recommendations = model.recommend(userid=user, user_items=user_item_matrix[user], N=n)[0]
  return df_items.loc[recommendations]['name']

def show_similar_movies(model, item, n=10):
  sim_items = model.similar_items(item, n)[0]
  return df_items.loc[sim_items]['name']

In [79]:
# Definimos y entrenamos el modelo de implicit feedback utilizando optimizacion BPR, probamos primero con una iteracion rapida
model_bpr = implicit.bpr.BayesianPersonalizedRanking(factors=100, iterations=10, use_gpu=False)
model_bpr.fit(user_item_matrix)

  0%|          | 0/10 [00:00<?, ?it/s]

In [80]:
show_recommendations(model_bpr, user=1, n=10)

127     Fate/stay night: Unlimited Blade Works 2nd Season
771                                        Romeo x Juliet
610                        Hidamari Sketch x ☆☆☆ Specials
103                                   Major: World Series
663                                     Master Keaton OVA
257          JoJo no Kimyou na Bouken: Stardust Crusaders
484               xxxHOLiC Movie: Manatsu no Yoru no Yume
275                                    Durarara!!x2 Ketsu
2641                           Fantastic Children Special
738                                   Soukou Kihei Votoms
Name: name, dtype: object

In [81]:
show_similar_movies(model_bpr, item=171, n=10)

171                            Gintama: Shiroyasha Koutan
123                                      Nodame Cantabile
127     Fate/stay night: Unlimited Blade Works 2nd Season
2641                           Fantastic Children Special
2232                                    Inazuma Eleven Go
2084                                       Mai-Otome Zwei
561                                      Lupin III (2015)
1414                                      Gunslinger Girl
257          JoJo no Kimyou na Bouken: Stardust Crusaders
1440                     Chouyaku Hyakuninisshu: Uta Koi.
Name: name, dtype: object

In [82]:
map_bpr, ndcg_bpr, recall_bpr, div_bpr, nov_bpr = evaluate_model(model_bpr, n=10)

print(f"MAP@10: {map_bpr:.8f}")
print(f"NDCG@10: {ndcg_bpr:.4f}")
print(f"Recall@10: {recall_bpr:.4f}")
print(f"Diversity: {div_bpr:.4f}")
print(f"Novelty: {nov_bpr:.4f}")

MAP@10: 0.00009857
NDCG@10: 0.0010
Recall@10: 0.0010
Diversity: 0.0005
Novelty: 9.9001


In [86]:
model_bpr = implicit.bpr.BayesianPersonalizedRanking(factors=500, iterations=5, use_gpu=False)
model_bpr.fit(user_item_matrix)

  0%|          | 0/5 [00:00<?, ?it/s]

In [87]:
map_bpr, ndcg_bpr, recall_bpr, div_bpr, nov_bpr = evaluate_model(model_bpr, n=10)

print(f"MAP@10: {map_bpr:.8f}")
print(f"NDCG@10: {ndcg_bpr:.5f}")
print(f"Recall@10: {recall_bpr:.5f}")
print(f"Diversity: {div_bpr:.5f}")
print(f"Novelty: {nov_bpr:.5f}")

MAP@10: 0.00014786
NDCG@10: 0.00148
Recall@10: 0.00148
Diversity: 0.00059
Novelty: 8.77784


In [88]:
model_bpr = implicit.bpr.BayesianPersonalizedRanking(factors=250, iterations=30, use_gpu=False)
model_bpr.fit(user_item_matrix)

  0%|          | 0/30 [00:00<?, ?it/s]

In [89]:
map_bpr, ndcg_bpr, recall_bpr, div_bpr, nov_bpr = evaluate_model(model_bpr, n=10)

print(f"MAP@10: {map_bpr:.8f}")
print(f"NDCG@10: {ndcg_bpr:.5f}")
print(f"Recall@10: {recall_bpr:.5f}")
print(f"Diversity: {div_bpr:.5f}")
print(f"Novelty: {nov_bpr:.5f}")

MAP@10: 0.00004929
NDCG@10: 0.00049
Recall@10: 0.00049
Diversity: 0.00054
Novelty: 9.79116
