In [40]:
# TODO: directly install req here?
# pip install -r requirements.txt

In [41]:
import pandas as pd
import numpy as np
import implicit
import scipy.sparse as sparse

In [42]:
# TODO: Get dataset from external url

In [None]:
# Primero creamos el dataframe con los datos
df_train = pd.read_csv('dataset/train.csv',
                         sep=',',
                         names = ['user_id','item_id','rating'],
                         header=None)

# rating >= 5 , relevante (1) y rating menor a 5 es no relevante (0)
df_train.rating = [1 if x >=5 else 0 for x in df_train.rating ]

In [44]:
df_train.head()

Unnamed: 0,user_id,item_id,rating
0,20881,1536,1
1,25996,1241,1
2,25409,8668,1
3,29728,6325,1
4,71886,1887,1


In [45]:
# Cargamos el dataset con los items
df_items = pd.read_csv('dataset/anime.csv',
                        sep=',',
                        names = ['anime_id','name','genre','type','episodes','rating','members'],
                        header=None,
                        encoding='latin-1')

In [46]:
df_items.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
1,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
2,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351
3,11061,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,148,9.13,425855
4,820,Ginga Eiyuu Densetsu,"Drama, Military, Sci-Fi, Space",OVA,110,9.11,80679


In [47]:
# Cargamos el dataset de testing
df_test = pd.read_csv('dataset/validation.csv',
                      sep=',',
                      names=['userid', 'itemid', 'rating'],
                      header=None)


# rating >= 5 es relevante (1) y rating menor a 5 es no relevante (0)
df_test.rating = [1 if x >=5 else 0 for x in df_test.rating ]


user_items_test = {}

for row in df_test.itertuples():
    if row[1] not in user_items_test:
        user_items_test[row[1]] = []

    user_items_test[row[1]].append(row[2])

In [48]:
df_test.head()

Unnamed: 0,userid,itemid,rating
0,18797,237,1
1,39248,2472,1
2,30019,1575,1
3,6970,11737,1
4,7081,21273,1


In [49]:
# Definicion de métricas (No editar)
# Inspirado parcialmente en https://gist.github.com/bwhite/3726239

def precision_at_k(r, k):
    assert 1 <= k <= r.size
    return (np.asarray(r)[:k] != 0).mean()

def average_precision_at_k(r, k):
    r = np.asarray(r)
    score = 0.
    for i in range(min(k, r.size)):
        score += precision_at_k(r, i + 1)
    return score / k

def dcg_at_k(r, k):
    r = np.asarray(r)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.


def ndcg_at_k(r, k):
    idcg = dcg_at_k(sorted(r, reverse=True), k)

    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg

In [50]:
# Preprocesamiento de los datos a formato sparse
# Change data to map index from 0 
unique_user_ids = df_train['user_id'].unique()
unique_item_ids = df_train['item_id'].unique()

user_id_map = {uid: idx for idx, uid in enumerate(unique_user_ids)}
item_id_map = {iid: idx for idx, iid in enumerate(unique_item_ids)}

rows = []
cols = []
data = []

for row in df_train.itertuples(index=False):
    user_idx = user_id_map[row.user_id]
    item_idx = item_id_map[row.item_id]

    rows.append(user_idx)
    cols.append(item_idx)
    data.append(1) 

user_item_matrix = sparse.csr_matrix((data, (rows, cols)), shape=(len(user_id_map), len(item_id_map)))

In [51]:
def evaluate_model(model, n):
  mean_ap = 0. # o MAP
  mean_ndcg = 0.
  for u in user_items_test.keys():
      if u not in user_id_map:
          continue  # Skip users not in the training set

      u_idx = user_id_map[u]  # Convert raw user ID to matrix index

      rec = model.recommend(u_idx, user_item_matrix[u_idx], n)[0]
      rel_vector = [np.isin(user_items_test[u], rec, assume_unique=True).astype(int)]

      mean_ap += average_precision_at_k(rel_vector, n)
      mean_ndcg += ndcg_at_k(rel_vector, n)

  mean_ap /= len(user_items_test)
  mean_ndcg /= len(user_items_test)

  return mean_ap, mean_ndcg

def show_recommendations(model, user, n):
  recommendations = model.recommend(userid=user, user_items=user_item_matrix[user], N=n)[0]
  return df_items.loc[recommendations]['name']

def show_similar_movies(model, item, n=10):
  sim_items = model.similar_items(item, n)[0]
  return df_items.loc[sim_items]['name']

In [52]:
# Definimos y entrenamos el modelo de implicit feedback utilizando optimizacion BPR
model_bpr = implicit.bpr.BayesianPersonalizedRanking(factors=100, iterations=10, use_gpu=False)
model_bpr.fit(user_item_matrix)

  0%|          | 0/10 [00:00<?, ?it/s]

In [53]:
show_recommendations(model_bpr, user=1, n=10)

527                                 Little Witch Academia
388                                      Durarara!!x2 Ten
839                                             New Game!
800                                           Shaman King
256                                 Non Non Biyori Repeat
2293                                    High School Fleet
658     Kino no Tabi: Nanika wo Suru Tame ni - Life Go...
487                                        Hanasaku Iroha
42                         Kara no Kyoukai 5: Mujun Rasen
899                               Rozen Maiden: TrÃ¤umend
Name: name, dtype: object

In [54]:
show_similar_movies(model_bpr, item=171, n=10)

171                             Kaguya-hime no Monogatari
1403    Code Geass: Boukoku no Akito 4 - Nikushimi no ...
1130    Detective Conan Magic File 2: Kudou Shinichi -...
256                                 Non Non Biyori Repeat
23                       Monogatari Series: Second Season
998                                            Mitsudomoe
34                        Kizumonogatari II: Nekketsu-hen
640                                        Ao no Exorcist
909                                         Fushigi Yuugi
1346                         Boku wa Tomodachi ga Sukunai
Name: name, dtype: object

In [None]:
maprec, ndcg = evaluate_model(model_bpr, n=10)
print('map: {}\nndcg: {}'.format(maprec, ndcg))

# pesimo map pero bueno xd ahi vemos fue iteracion rapida

map: 0.00018083182640144668
ndcg: 0.0018083182640144665
