In [1]:
import numpy as np
import pandas as pd

In [3]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=header)

In [4]:
len(df)

100000

In [6]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))

Number of users = 943 | Number of movies = 1682


In [8]:
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(df, test_size=0.25)

In [9]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [25]:
train_data_matrix

array([[ 5.,  3.,  4., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  0., ...,  0.,  0.,  0.]])

In [26]:
test_data_matrix

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [11]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [35]:
user_similarity

array([[ 0.        ,  0.88058613,  0.97405218, ...,  0.86389296,
         0.8545065 ,  0.65384201],
       [ 0.88058613,  0.        ,  0.9171202 , ...,  0.85505724,
         0.84925412,  0.94471357],
       [ 0.97405218,  0.9171202 ,  0.        , ...,  0.97340416,
         0.88145429,  0.98503253],
       ..., 
       [ 0.86389296,  0.85505724,  0.97340416, ...,  0.        ,
         0.85885207,  0.92652662],
       [ 0.8545065 ,  0.84925412,  0.88145429, ...,  0.85885207,
         0.        ,  0.85297649],
       [ 0.65384201,  0.94471357,  0.98503253, ...,  0.92652662,
         0.85297649,  0.        ]])

In [36]:
item_similarity

array([[ 0.        ,  0.68173569,  0.71652296, ...,  1.        ,
         0.94745024,  1.        ],
       [ 0.68173569,  0.        ,  0.81520479, ...,  1.        ,
         0.90670444,  1.        ],
       [ 0.71652296,  0.81520479,  0.        , ...,  1.        ,
         1.        ,  1.        ],
       ..., 
       [ 1.        ,  1.        ,  1.        , ...,  0.        ,
         1.        ,  1.        ],
       [ 0.94745024,  0.90670444,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        , ...,  1.        ,
         1.        ,  0.        ]])

In [14]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [19]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [27]:
item_prediction

array([[ 0.34707668,  0.3668611 ,  0.3824056 , ...,  0.43069601,
         0.42136223,  0.43069601],
       [ 0.09171874,  0.10588227,  0.10139499, ...,  0.10707912,
         0.10700564,  0.10707912],
       [ 0.06462171,  0.06750617,  0.06474994, ...,  0.06603212,
         0.06589464,  0.06603212],
       ..., 
       [ 0.02264866,  0.0295766 ,  0.02894969, ...,  0.0333135 ,
         0.03280327,  0.0333135 ],
       [ 0.13787866,  0.14879434,  0.15712914, ...,  0.16418798,
         0.16103   ,  0.16418798],
       [ 0.20838937,  0.20836976,  0.22797989, ...,  0.26115407,
         0.25438912,  0.26115407]])

In [28]:
user_prediction

array([[ 1.65140747,  0.54103591,  0.46230324, ...,  0.27988143,
         0.28216935,  0.27988143],
       [ 1.44176212,  0.27564244,  0.14983777, ..., -0.0630468 ,
        -0.05973455, -0.0630468 ],
       [ 1.43947632,  0.22710632,  0.11306552, ..., -0.10388636,
        -0.10063947, -0.10388636],
       ..., 
       [ 1.2910051 ,  0.1835758 ,  0.07237968, ..., -0.13370266,
        -0.13059812, -0.13370266],
       [ 1.47416836,  0.30707497,  0.21389721, ...,  0.00306285,
         0.00575785,  0.00306285],
       [ 1.509677  ,  0.36769748,  0.29411923, ...,  0.10779041,
         0.11015477,  0.10779041]])

In [29]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [31]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 3.1247984289924826
Item-based CF RMSE: 3.451519229063875
