In [7]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('Recommend.csv',names=['user_id', 'movie_id', 'rating', 'timestamp'])
df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


In [5]:
from sklearn.model_selection import train_test_split
n_users = df.user_id.unique().shape[0] 
n_movies = df.movie_id.unique().shape[0]
train_data, test_data = train_test_split(df, test_size=0.25)

In [10]:
train_data_matrix = np.zeros((n_users, n_movies))
for line in train_data.itertuples():
    #[user_id index, movie_id index] = given rating.
    train_data_matrix[line[1]-1, line[2]-1] = line[3] 
train_data_matrix

array([[ 0.,  3.,  4., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  0., ...,  0.,  0.,  0.]])

In [11]:
test_data_matrix = np.zeros((n_users, n_movies))
for line in test_data.itertuples():
    #[user_id index, movie_id index] = given rating.
    test_data_matrix[line[1]-1, line[2]-1] = line[3]
test_data_matrix

array([[ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [13]:
from sklearn.metrics import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
movie_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
mean_user_rating = train_data_matrix.mean(axis=1)[:, np.newaxis] 
ratings_diff = (train_data_matrix - mean_user_rating) 
user_pred = mean_user_rating + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
user_pred

array([[ 1.61858377,  0.58870348,  0.50174352, ...,  0.3218083 ,
         0.32426292,  0.32396846],
       [ 1.32978389,  0.26645846,  0.13793518, ..., -0.0694414 ,
        -0.06621536, -0.06604007],
       [ 1.33629502,  0.23476666,  0.11778554, ..., -0.09720474,
        -0.09398516, -0.09379887],
       ..., 
       [ 1.19009035,  0.19072282,  0.07091501, ..., -0.12908035,
        -0.1261194 , -0.12591898],
       [ 1.35391011,  0.28592711,  0.18775035, ..., -0.02173392,
        -0.01873293, -0.01853964],
       [ 1.39417895,  0.35386711,  0.27846365, ...,  0.09434391,
         0.09671458,  0.0967313 ]])

In [14]:
movie_pred = train_data_matrix.dot(movie_similarity) / np.array([np.abs(movie_similarity).sum(axis=1)])
movie_pred

array([[ 0.39551165,  0.40839658,  0.42154373, ...,  0.47293278,
         0.46538032,  0.45963631],
       [ 0.084707  ,  0.09724487,  0.09329059, ...,  0.09994051,
         0.09943104,  0.10013536],
       [ 0.06985573,  0.07383708,  0.07184837, ...,  0.07257585,
         0.07231811,  0.07340915],
       ..., 
       [ 0.02630973,  0.03349602,  0.03188073, ...,  0.03688281,
         0.03607168,  0.03666389],
       [ 0.1192004 ,  0.12751924,  0.13497999, ...,  0.14039262,
         0.13866394,  0.14019245],
       [ 0.19502123,  0.19284066,  0.21511892, ...,  0.24687686,
         0.23919759,  0.23980365]])

In [19]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(pred, test):
    pred = pred[test.nonzero()].flatten() 
    test = test[test.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, test))

In [20]:
rmse(user_pred, test_data_matrix)

3.138384405490276

In [21]:
rmse(movie_pred, test_data_matrix)

3.4647957787706685