In [25]:
import pandas as pd
import numpy as np
import implicit
import scipy.sparse as sparse

In [29]:
# Primero creamos el dataframe con los datos
df_train = pd.read_csv('dataset/train.csv',
                         sep=',',
                         names = ['user_id','item_id','rating'],
                         header=None)

# rating >= 5 , relevante (1) y rating menor a 5 es no relevante (0)
df_train.rating = [1 if x >=5 else 0 for x in df_train.rating ]

In [30]:
df_train.head()

Unnamed: 0,user_id,item_id,rating
0,35757,79,1
1,31006,8795,1
2,68084,14837,1
3,20881,1536,1
4,25996,1241,1


In [31]:
# Cargamos el dataset con los items
df_items = pd.read_csv('dataset/anime.csv',
                        sep=',',
                        names = ['anime_id','name','genre','type','episodes','rating','members'],
                        header=None,
                        encoding='latin-1')

In [32]:
df_items.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,GintamaÂ°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [33]:
# Cargamos el dataset de testing
df_test = pd.read_csv('dataset/validation.csv',
                      sep=',',
                      names=['userid', 'itemid', 'rating'],
                      header=None)


# rating >= 5 es relevante (1) y rating menor a 5 es no relevante (0)
df_test.rating = [1 if x >=5 else 0 for x in df_test.rating ]


user_items_test = {}

for row in df_test.itertuples():
    if row[1] not in user_items_test:
        user_items_test[row[1]] = []

    user_items_test[row[1]].append(row[2])

In [34]:
df_test.head()

Unnamed: 0,userid,itemid,rating
0,25810,4789,1
1,39393,6275,1
2,2282,8675,1
3,18797,237,1
4,39248,2472,1


In [35]:
# Definicion de métricas (No editar)
# Inspirado parcialmente en https://gist.github.com/bwhite/3726239

def precision_at_k(r, k):
    assert 1 <= k <= r.size
    return (np.asarray(r)[:k] != 0).mean()

def average_precision_at_k(r, k):
    r = np.asarray(r)
    score = 0.
    for i in range(min(k, r.size)):
        score += precision_at_k(r, i + 1)
    return score / k

def dcg_at_k(r, k):
    r = np.asarray(r)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.


def ndcg_at_k(r, k):
    idcg = dcg_at_k(sorted(r, reverse=True), k)

    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg

In [36]:
# Preprocesamiento de los datos a formato sparse
# Change data to map index from 0 
unique_user_ids = df_train['user_id'].unique()
unique_item_ids = df_train['item_id'].unique()

user_id_map = {uid: idx for idx, uid in enumerate(unique_user_ids)}
item_id_map = {iid: idx for idx, iid in enumerate(unique_item_ids)}

rows = []
cols = []
data = []

for row in df_train.itertuples(index=False):
    user_idx = user_id_map[row.user_id]
    item_idx = item_id_map[row.item_id]

    rows.append(user_idx)
    cols.append(item_idx)
    data.append(1) 

user_item_matrix = sparse.csr_matrix((data, (rows, cols)), shape=(len(user_id_map), len(item_id_map)))

In [37]:
def evaluate_model(model, n):
  mean_ap = 0. # o MAP
  mean_ndcg = 0.
  for u in user_items_test.keys():
      if u not in user_id_map:
          continue  # Skip users not in the training set

      u_idx = user_id_map[u]  # Convert raw user ID to matrix index

      rec = model.recommend(u_idx, user_item_matrix[u_idx], n)[0]
      rel_vector = [np.isin(user_items_test[u], rec, assume_unique=True).astype(int)]

      mean_ap += average_precision_at_k(rel_vector, n)
      mean_ndcg += ndcg_at_k(rel_vector, n)

  mean_ap /= len(user_items_test)
  mean_ndcg /= len(user_items_test)

  return mean_ap, mean_ndcg

def show_recommendations(model, user, n):
  recommendations = model.recommend(userid=user, user_items=user_item_matrix[user], N=n)[0]
  return df_items.loc[recommendations]['name']

def show_similar_movies(model, item, n=10):
  sim_items = model.similar_items(item, n)[0]
  return df_items.loc[sim_items]['name']

In [38]:
# Definimos y entrenamos el modelo con optimización ALS
model_als = implicit.als.AlternatingLeastSquares(factors=100, iterations=10, use_gpu=False)
model_als.fit(user_item_matrix)

  0%|          | 0/10 [00:00<?, ?it/s]

In [41]:
show_recommendations(model_als, user=99, n=10)

33                   Mushishi Zoku Shou: Suzu no Shizuku
948              Detective Conan Movie 09: Promo Special
319                                       Kimi ni Todoke
21     Rurouni Kenshin: Meiji Kenkaku Romantan - Tsui...
977                             Initial D Battle Stage 2
981                                      Mousou Dairinin
921                               Tenchi Muyou! Ryououki
753    One Piece: Episode of Luffy - Hand Island no B...
861                                Angel Beats! Specials
55          Tengen Toppa Gurren Lagann Movie: Lagann-hen
Name: name, dtype: object

In [45]:
show_similar_movies(model_als, item=1, n=10)

1                        Fullmetal Alchemist: Brotherhood
810     Detective Conan: Conan vs. Kid - Shark &amp; J...
1491                          Future GPX Cyber Formula 11
1130                                         Aldnoah.Zero
2866                          30th Gundam Perfect Mission
840         Mobile Suit Gundam: Char&#039;s Counterattack
1310                                 Hikaru no Go Special
633                                         Paradise Kiss
3190                              Zettai Muteki Raijin-Oh
478                                   Durarara!! Specials
Name: name, dtype: object

In [47]:
maprec, ndcg = evaluate_model(model_als, n=10)
print('map: {}\nndcg: {}'.format(maprec, ndcg))

map: 0.00036101083032490973
ndcg: 0.0036101083032490976
