In [3]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy import sparse

In [4]:
anime_ratings_df = pd.read_csv("rating.csv")
anime_ratings_df.shape
print(anime_ratings_df.head())

   user_id  anime_id  rating
0        1        20      -1
1        1        24      -1
2        1        79      -1
3        1       226      -1
4        1       241      -1


In [5]:
anime_ratings = anime_ratings_df.loc[anime_ratings_df.rating != -1].reset_index()[['user_id','anime_id','rating']]
print(anime_ratings.shape)
anime_ratings.head()

(6337241, 3)


Unnamed: 0,user_id,anime_id,rating
0,1,8074,10
1,1,11617,10
2,1,11757,10
3,1,15451,10
4,2,11771,10


In [6]:
train_df, valid_df = train_test_split(anime_ratings, test_size=0.2)

In [7]:
#resetting indices to avoid indexing errors in the future
train_df = train_df.reset_index()[['user_id', 'anime_id', 'rating']]
valid_df = valid_df.reset_index()[['user_id', 'anime_id', 'rating']]

In [8]:
def col_encode(column):
	keys = column.unique()
	key_to_id = {key:idx for idx,key in enumerate(keys)}
	return key_to_id , np.array([key_to_id[x] for x in column]) , len(keys)

In [9]:
def df_encode(anime_df):
    anime_ids, anime_df['anime_id'], num_anime = col_encode(anime_df['anime_id'])
    user_ids, anime_df['user_id'], num_users = col_encode(anime_df['user_id'])
    return anime_df, num_users, num_anime, user_ids, anime_ids	

In [10]:
anime_df, num_users, num_anime, user_ids, anime_ids = df_encode(train_df)
print("Number of users :", num_users)
print("Number of anime :", num_anime)
anime_df.head()


Number of users : 68875
Number of anime : 9723


Unnamed: 0,user_id,anime_id,rating
0,0,0,6
1,1,1,7
2,2,2,8
3,3,3,9
4,4,4,8


In [11]:
def create_embeddings(n, K):
    """
    Creates a random numpy matrix of shape n, K with uniform values in (0, 11/K)
    n: number of items/users
    K: number of factors in the embedding 
    """
    return 11*np.random.random((n, K)) / K

In [12]:
def create_sparse_matrix(df, rows, cols, column_name="rating"):
    """ Returns a sparse utility matrix""" 
    return sparse.csc_matrix((df[column_name].values,(df['user_id'].values, df['anime_id'].values)),shape=(rows, cols))


In [13]:
anime_df, num_users, num_anime, user_ids, anime_ids = df_encode(train_df)
Y = create_sparse_matrix(anime_df, num_users, num_anime)

In [14]:
def predict(df, emb_user, emb_anime):
    """ This function computes df["prediction"] without doing (U*V^T).
    
    Computes df["prediction"] by using elementwise multiplication of the corresponding embeddings and then 
    sum to get the prediction u_i*v_j. This avoids creating the dense matrix U*V^T.
    """
    df['prediction'] = np.sum(np.multiply(emb_anime[df['anime_id']],emb_user[df['user_id']]), axis=1)
    return df

In [15]:
lmbda = 0.0002

def cost(df, emb_user, emb_anime):
    """ Computes mean square error"""
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_anime.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_anime), emb_user.shape[0], emb_anime.shape[0], 'prediction')
    return np.sum((Y-predicted).power(2))/df.shape[0] 

In [16]:
def gradient(df, emb_user, emb_anime):
    """ Computes the gradient for user and anime embeddings"""
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_anime.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_anime), emb_user.shape[0], emb_anime.shape[0], 'prediction')
    delta =(Y-predicted)
    grad_user = (-2/df.shape[0])*(delta*emb_anime) + 2*lmbda*emb_user
    grad_anime = (-2/df.shape[0])*(delta.T*emb_user) + 2*lmbda*emb_anime
    return grad_user, grad_anime

In [17]:
def gradient_descent(df, emb_user, emb_anime, iterations=2000, learning_rate=0.01, df_val=None):
    """ 
    Computes gradient descent with momentum (0.9) for given number of iterations.
    emb_user: the trained user embedding
    emb_anime: the trained anime embedding
    """
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_anime.shape[0])
    beta = 0.9
    grad_user, grad_anime = gradient(df, emb_user, emb_anime)
    v_user = grad_user
    v_anime = grad_anime
    for i in range(iterations):
        grad_user, grad_anime = gradient(df, emb_user, emb_anime)
        v_user = beta*v_user + (1-beta)*grad_user
        v_anime = beta*v_anime + (1-beta)*grad_anime
        emb_user = emb_user - learning_rate*v_user
        emb_anime = emb_anime - learning_rate*v_anime
        if(not (i+1)%50):
            print("\niteration", i+1, ":")
            print("train mse:",  cost(df, emb_user, emb_anime))
            if df_val is not None:
                print("validation mse:",  cost(df_val, emb_user, emb_anime))
    return emb_user, emb_anime

In [18]:
emb_user = create_embeddings(num_users, 3)
emb_anime = create_embeddings(num_anime, 3)
emb_user, emb_anime = gradient_descent(anime_df, emb_user, emb_anime, iterations=800, learning_rate=1)


iteration 50 :
train mse: 16.10204818884032

iteration 100 :
train mse: 12.336755412873643

iteration 150 :
train mse: 10.535865352143981

iteration 200 :
train mse: 9.443781790450906

iteration 250 :
train mse: 8.69905493140073

iteration 300 :
train mse: 8.153074465633503

iteration 350 :
train mse: 7.732698746373368

iteration 400 :
train mse: 7.397569639202399

iteration 450 :
train mse: 7.123558796923377

iteration 500 :
train mse: 6.895348985925025

iteration 550 :
train mse: 6.702748323027263

iteration 600 :
train mse: 6.538710619291066

iteration 650 :
train mse: 6.398204897737338

iteration 700 :
train mse: 6.277537099028845


KeyboardInterrupt: 

In [None]:
def encode_new_data(valid_df, user_ids, anime_ids):
    """ Encodes valid_df with the same encoding as train_df.
    """
    df_val_chosen = valid_df['anime_id'].isin(anime_ids.keys()) & valid_df['user_id'].isin(user_ids.keys())
    valid_df = valid_df[df_val_chosen]
    valid_df['anime_id'] =  np.array([anime_ids[x] for x in valid_df['anime_id']])
    valid_df['user_id'] = np.array([user_ids[x] for x in valid_df['user_id']])
    return valid_df

In [None]:
print("before encoding:", valid_df.shape)
valid_df = encode_new_data(valid_df, user_ids, anime_ids)
print("after encoding:", valid_df.shape)

before encoding: (751207, 4)
after encoding: (751207, 4)


In [None]:
train_mse = cost(train_df, emb_user, emb_anime)
val_mse = cost(valid_df, emb_user, emb_anime)
print(train_mse, val_mse)

6.079117082515941 12.274269579755632


In [None]:
valid_df[70:80].head()

Unnamed: 0,user_id,anime_id,rating,prediction
120,49369,899,7,9.305994
124,10858,2899,5,6.336065
125,39831,8424,8,2.694225
126,50888,101,9,11.457634
127,65836,5177,7,10.997614
