In [1]:
#Requirements: Numpy 1.12

import numpy as np
import pandas as pd

In [2]:
# Load the ratings table and count how many distinct users and movies it contains
df = pd.read_csv('./data/ml-small/ratings.csv')

# These counts become the dimensions of the user-item matrix built further down
n_users = len(df['userId'].unique())
n_items = len(df['movieId'].unique())
print("Number of unique users: %d" % n_users)
print("Number of unique movies: %d" % n_items)

Number of unique users: 671
Number of unique movies: 9066


In [3]:
# Map every distinct movieId to a dense column index (movie_dict) and keep the
# inverse lookup, column index -> movieId, as a list (movie_list)
movie_list = []
movie_dict = {}

for ind, item in enumerate(df['movieId'].unique()):
    movie_dict[item] = ind
    movie_list.append(item)

# Leave `ind` equal to the number of distinct movies, as a manual counter would
ind = len(movie_list)

In [57]:
# Create user-item ratings matrix (rows: users, columns: movies, 0 = no rating stored)
ratings = np.zeros((n_users, n_items))

# Map userIds to dense row indices in ascending id order. When the ids are
# exactly 1..n_users this is the same row as `userId - 1`, but it also stays
# correct when ids have gaps or do not start at 1 (where `userId - 1` would
# index out of bounds or leave empty rows).
user_dict = {user_id: row_idx
             for row_idx, user_id in enumerate(np.sort(df['userId'].unique()))}

# The rating value is read positionally from the third column of the frame;
# the user and movie ids are read by column name.
for user_id, movie_id, rating in zip(df['userId'], df['movieId'], df.iloc[:, 2]):
    ratings[user_dict[user_id], movie_dict[movie_id]] = rating

In [5]:
# Split data into training and test sets by removing 10 ratings per user from the training set and adding to test set 
def train_test_split(ratings, test_size=10, seed=None):
    """Split a user-item matrix into disjoint train and test matrices.

    For every user (row), `test_size` of that user's non-zero ratings are
    picked at random without replacement, zeroed out in the training copy and
    copied into the test matrix.

    Parameters
    ----------
    ratings : 2-D numpy array, users x items, where 0 means "no rating".
    test_size : int, number of ratings to hold out per user (default 10).
    seed : int or None. When given, a private RandomState seeded with it is
        used so the split is reproducible; when None the global numpy random
        state is used.

    Returns
    -------
    (train, test) : two arrays with the same shape as `ratings`.

    Raises
    ------
    ValueError
        If some user has fewer than `test_size` non-zero ratings.
    """
    # RandomState (rather than a newer generator API) keeps this usable on the
    # old numpy version this notebook targets.
    rng = np.random if seed is None else np.random.RandomState(seed)

    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        rated_items = ratings[user, :].nonzero()[0]
        if rated_items.size < test_size:
            # Fail with a message that names the offending row instead of the
            # generic sampling error raised by numpy.
            raise ValueError(
                "user row %d has only %d ratings; cannot hold out %d"
                % (user, rated_items.size, test_size))
        test_ratings = rng.choice(rated_items, size=test_size, replace=False)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings[user, test_ratings]

    # Test and training are truly disjoint
    assert(np.all((train * test) == 0))
    return train, test

# Hold out part of every user's ratings; `train` and `test` have the same shape as `ratings`
train, test = train_test_split(ratings)

In [29]:
# Method to compute cosine similarity between users / between items 
def similarity(ratings, kind='user', epsilon=1e-9):
    """Cosine similarity between the rows (users) or columns (items) of `ratings`.

    Parameters
    ----------
    ratings : 2-D numpy array, users x items.
    kind : 'user' for a users x users matrix, 'item' for an items x items matrix.
    epsilon : small number added to every dot product so that an all-zero
        row/column does not cause a divide-by-zero when normalising.

    Returns
    -------
    Square numpy array of pairwise similarities.

    Raises
    ------
    ValueError
        If `kind` is neither 'user' nor 'item'.
    """
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    else:
        # Without this branch `sim` would be unbound and the failure would
        # surface as a confusing UnboundLocalError two lines later.
        raise ValueError("kind must be 'user' or 'item', got %r" % (kind,))
    # The diagonal holds each vector's squared length; keep it as a 1 x n row
    # so that dividing by it and by its transpose normalises both axes.
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [14]:
# Inspect the top-left 5x5 corner of the ratings matrix plus a few environment facts
corner = ratings[:5, :5]
non_zero = np.nonzero(corner)
print(corner)
# Count of non-zero entries in each of the first five columns (first five rows only)
print(np.count_nonzero(corner, axis=0))
print(np.__version__)
print(ratings.shape)

[[-0.67857143 -0.70238095 -0.54545455 -1.3125     -0.26086957]
 [-0.         -0.         -0.         -0.         -0.        ]
 [-0.         -0.         -0.         -0.         -0.        ]
 [-0.         -0.         -0.         -0.         -0.        ]
 [-0.         -0.         -0.         -0.         -0.        ]]
[1 1 1 1 1]
1.12.1
(671, 9066)


In [17]:
# Methods to compute item-item centered cosine similarity matrix given the user-item ratings matrix
def centered_cosine_similarity(ratings, epsilon=1e-9):
    """Item-item centered cosine similarity, computed one column at a time.

    Each item's mean over its non-zero ratings is subtracted from those
    ratings; entries that were zero (no rating) stay zero. Cosine similarity
    is then taken between the centered item columns.

    Parameters
    ----------
    ratings : 2-D numpy array, users x items, where 0 means "no rating".
    epsilon : small number used in denominators and added to the dot products
        for numerical stability (e.g. an item with no ratings at all).

    Returns
    -------
    items x items numpy array of similarities.
    """
    centered_ratings = np.zeros(ratings.shape)
    for j in range(ratings.shape[1]):
        column = ratings[:, j]
        # Add epsilon in denominator for numerical stability (in case all ratings are zero for some item)
        avg = np.sum(column) / (np.count_nonzero(column) + epsilon)
        non_zero = (column != 0).astype(int)
        # Center first, then mask: multiplying the *centered* values by the
        # mask keeps unrated entries at zero without throwing the centering
        # away (masking the raw column instead would yield plain cosine).
        centered_ratings[:, j] = (column - avg) * non_zero

    sim = centered_ratings.T.dot(centered_ratings) + epsilon
    # Diagonal = squared length of each centered column; 1 x n row for broadcasting
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

def vec_centered_cosine_similarity(ratings, epsilon=1e-9):
    """Vectorized item-item centered cosine similarity.

    Subtracts each item's mean (taken over its non-zero ratings) from that
    item's column, resets entries that had no rating back to zero, and
    returns the cosine similarity between the resulting item columns.

    Parameters
    ----------
    ratings : 2-D numpy array, users x items, where 0 means "no rating".
    epsilon : small number used in the mean's denominator and added to the
        dot products for numerical stability.

    Returns
    -------
    items x items numpy array of similarities.
    """
    rated_mask = (ratings != 0).astype(int)

    # Per-item mean over rated entries only; epsilon guards items nobody rated
    rating_counts = np.count_nonzero(ratings, axis=0)
    item_means = np.sum(ratings, axis=0, dtype=np.float64) / (rating_counts + epsilon)

    # Broadcasting subtracts the per-item mean from every row; the mask then
    # restores zeros where there was no rating to begin with
    deviations = (ratings - item_means) * rated_mask

    gram = deviations.T.dot(deviations) + epsilon
    item_norms = np.sqrt(np.diagonal(gram))[np.newaxis, :]
    return gram / item_norms / item_norms.T

In [18]:
# Item-item similarity matrix from the loop-based implementation
item_sim = centered_cosine_similarity(ratings)

In [23]:
# Time the loop-based implementation on the full ratings matrix
%timeit centered_cosine_similarity(ratings)

1 loop, best of 3: 2.95 s per loop


In [19]:
# Item-item similarity matrix from the vectorized implementation
item_sim_vec = vec_centered_cosine_similarity(ratings)

In [24]:
# Time the vectorized implementation on the full ratings matrix
%timeit vec_centered_cosine_similarity(ratings)

1 loop, best of 3: 2.59 s per loop


In [22]:
# Check that the similarity matrices computed by the non-vectorized and vectorized methods are the same
import numpy.linalg as linalg
diff_mat = item_sim - item_sim_vec
# Norm of the difference, not difference of the norms: two different matrices
# can have identical norms, so only this quantity is zero exactly when they agree.
diff_norm = linalg.norm(diff_mat)
print(diff_mat[:10,:10])
print(diff_norm)

[[  0.00000000e+00   0.00000000e+00  -1.38777878e-17  -1.38777878e-17
   -3.12250226e-17  -4.16333634e-17  -2.77555756e-17  -2.77555756e-17
   -1.64798730e-17   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   2.77555756e-17
   -4.16333634e-17  -4.16333634e-17   4.16333634e-17   6.93889390e-18
    2.42861287e-17   6.93889390e-18]
 [ -2.08166817e-17   2.77555756e-17   0.00000000e+00   0.00000000e+00
    0.00000000e+00   2.08166817e-17   0.00000000e+00  -1.73472348e-17
    5.20417043e-18   6.93889390e-18]
 [ -1.38777878e-17   1.38777878e-17   0.00000000e+00   0.00000000e+00
    2.77555756e-17  -5.55111512e-17   0.00000000e+00  -5.55111512e-17
   -2.77555756e-17   0.00000000e+00]
 [ -2.77555756e-17  -4.16333634e-17   1.38777878e-17   2.77555756e-17
    0.00000000e+00   0.00000000e+00   0.00000000e+00  -3.46944695e-17
    3.46944695e-17   4.16333634e-17]
 [ -2.77555756e-17  -6.93889390e-17   2.77555756e-17  -5.55111512e-17
    0.00000000e+00  -1.11022302e-16   0.00000

In [34]:
# Column positions holding an exact zero (no rating stored) in ratings row 2
unrated_mask = ratings[2, :] == 0.
zero_indices = np.flatnonzero(unrated_mask)
print(zero_indices.shape)


(9017,)
0


In [50]:
# The 50 items with the highest similarity to item 0, ordered from least to most similar
item0_similarities = item_sim_vec[0, :]
top_k = np.argsort(item0_similarities)[-50:]
print(top_k)

# Indices of movies among top k similar movies for which the user in row 5 has provided ratings
rated_positions = np.nonzero(ratings[5][top_k])[0]
print(top_k[rated_positions])

[ 707 1538 4951 1070 6505 3264 2524 3678 2703 5598 7402 7155 7314 5888 4716
 2898 5571 6984 3190 5899 3748  390 1623 3012 4988 2848 1754 1046 3752 2431
 2339  338 3774 1722  657 2675 3709 3900 1494 5496 5685  273 4291  558 4031
 1602 1417 6300  422    0]
[390]


In [51]:
# Method to retrieve recommendations for a user given the userId

def get_recommendations(user_id, k=100, epsilon=1e-9, n_recommendations=20,
                        ratings_matrix=None, item_similarity=None):
    """Recommend unrated items for one user via item-based top-k collaborative filtering.

    For every item the user has not rated, the prediction is the
    similarity-weighted average of the user's ratings over those of the
    item's `k` most similar items that the user did rate.

    Parameters
    ----------
    user_id : int, row index of the user in the ratings matrix.
    k : int, neighbourhood size (number of most similar items considered).
    epsilon : small number added to the normaliser to avoid division by zero
        when the user rated none of the neighbours.
    n_recommendations : int, how many item indices to return (default 20).
    ratings_matrix : users x items array; defaults to the notebook-level
        `ratings`.
    item_similarity : items x items similarity array, expected to be
        symmetric; defaults to the notebook-level `item_sim_vec`.

    Returns
    -------
    1-D array of item column indices, ordered from lowest to highest predicted
    score (best recommendation last). Only items the user has not rated are
    returned, so the array may be shorter than `n_recommendations`.
    """
    if ratings_matrix is None:
        ratings_matrix = ratings
    if item_similarity is None:
        item_similarity = item_sim_vec

    user_ratings = ratings_matrix[user_id, :]
    zero_indices = np.where(user_ratings == 0.)[0]
    if n_recommendations <= 0:
        # A slice of [-0:] would return everything, so handle this explicitly
        return zero_indices[:0]

    # One score per *unrated* item; rated items are never candidates, so they
    # cannot outrank unrated items that happen to score below zero.
    pred = np.zeros(zero_indices.shape[0])
    for pos, j in enumerate(zero_indices):
        sims = item_similarity[j, :]
        # Retrieve indices of top-k similar movies to this movie
        top_k_items = np.argsort(sims)[-k:]
        # Only neighbours the user actually rated carry information
        rated_neighbours = top_k_items[np.nonzero(user_ratings[top_k_items])[0]]
        neighbour_sims = sims[rated_neighbours]

        # Normalise by the sum of *absolute* similarities: centered cosine can
        # be negative, and a signed sum can be ~0 or negative, which would
        # blow up the prediction or flip its sign.
        weight_total = np.sum(np.abs(neighbour_sims)) + epsilon
        pred[pos] = user_ratings[rated_neighbours].dot(neighbour_sims) / weight_total

    best = np.argsort(pred)[-n_recommendations:]
    return zero_indices[best]

# Recommended item column indices for the user stored in row 0
get_recommendations(0)
        

array([3064, 2335, 2911, 2878,  546, 2684, 3233,  293, 2630, 5137, 4438,
       3158, 3671, 4826, 5526, 5720, 3136, 3142, 5553, 5549])

In [58]:
# Add Global Baseline
def get_global_baseline(user_id, epsilon=1e-9):
    """Baseline rating estimates for every item the given user has not rated.

    Reads the notebook-level `ratings` matrix. The estimate for an unrated
    item is: global mean + (item mean - global mean) + (user mean - global
    mean), where every mean is taken over non-zero entries only.

    Parameters
    ----------
    user_id : int, row index of the user in `ratings`.
    epsilon : small number added to the per-user and per-item counts so that
        an empty row or column does not divide by zero.

    Returns
    -------
    1-D array with one entry per item: the baseline estimate where the user
    has no rating, 0 where the user already rated the item.
    """
    # Mean of all stored (non-zero) ratings
    global_avg = np.sum(ratings) / np.count_nonzero(ratings)

    # How far this user's average sits from the global average
    user_row = ratings[user_id, :]
    user_avg = np.sum(user_row) / (np.count_nonzero(user_row) + epsilon)
    user_bias = user_avg - global_avg

    # How far each item's average sits from the global average
    rated_per_item = np.count_nonzero(ratings, axis=0)
    movie_avg = np.sum(ratings, axis=0) / (rated_per_item + epsilon)
    movie_bias = movie_avg - global_avg

    pred = np.zeros((ratings.shape[1]))
    zero_indices = np.where(user_row == 0.)[0]
    # Fill all unrated positions at once; rated positions keep their 0
    pred[zero_indices] = global_avg + movie_bias[zero_indices] + user_bias

    return pred

# Baseline estimates for the user stored in row 0; show the first 100 entries
pred = get_global_baseline(0)
print(pred[:100])

# Adjust for bias of user 

3.54360825567
-0.993608255797
[-0.09278858  0.38081035  0.00639174  0.49121761  0.82703851  0.09364665
  0.14604692  0.40156718 -0.21668518  0.35889174 -0.76066252  0.28764174
 -0.09994628 -0.42732919 -0.44066708 -0.66860826 -0.79139587  0.38820993
  0.4196945  -0.37053133  0.3602379  -0.3897621  -0.14760826  0.17207802
  0.335424   -0.05730689  0.45639174 -0.41397863 -0.23451735  0.71256458]
[ 0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  2.45721142  2.93081035  2.55639174  3.04121761  3.37703851  2.64364665
  2.69604692  2.95156718  2.33331482  2.90889174  1.78933748  2.83764174
  2.45005372  2.12267081  2.10933292  1.88139174  1.75860413  2.93820993
  2.9696945   2.17946867  2.9102379   2.1602379   2.40239174  2.72207802
  2.885424    2.49269311  3.00639174  2.13602137  2.31548265  3.26256458
  2.75639174  2.9475