In [20]:
import numpy as np
import pandas as pd

In [21]:
df = pd.read_csv('u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])

In [22]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742


In [23]:
movie_titles = pd.read_csv("Movie_Id_Titles")
movie_titles.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [24]:
df = pd.merge(df,movie_titles,on='item_id')
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,290,50,5,880473582,Star Wars (1977)
2,79,50,4,891271545,Star Wars (1977)
3,2,50,5,888552084,Star Wars (1977)
4,8,50,5,879362124,Star Wars (1977)


In [25]:
n_users = df['user_id'].nunique()
n_items = df['item_id'].nunique()

print('Number of Users: '+ str(n_users))
print('Number of Movies: '+str(n_items))

Number of Users: 944
Number of Movies: 1682


In [26]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df, test_size=0.25)

In [27]:
train_data_matrix = np.zeros((n_users, n_items))


for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3] 
    
test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]


In [28]:
train_data_matrix.shape

(944, 1682)

In [29]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [30]:
print(user_similarity.shape)
print(item_similarity.shape)

(944, 944)
(1682, 1682)


<img class="aligncenter size-thumbnail img-responsive" src="https://latex.codecogs.com/gif.latex?\hat{x}_{k,m}&space;=&space;\frac{\sum\limits_{i_b}&space;sim_i(i_m,&space;i_b)&space;(x_{k,b})&space;}{\sum\limits_{i_b}|sim_i(i_m,&space;i_b)|}"/>



similarity of item m and item b dot product rating given by user k on item b

In [31]:
def PredictItem(rating,similarity):
    p=[]
    a=rating.dot(similarity)
    b=np.abs(item_similarity.sum(axis=1))
    for i in range(rating.shape[0]):
        for j in range(rating.shape[1]):
            p.append(a[i][j]/b[j])
    p=np.reshape(p,(944,1682))
    return(p)


In [32]:
def PredictUser(train_data_matrix,user_similarity):
    mean_user_rating = train_data_matrix.mean(axis=1)
    ratings_diff = (train_data_matrix - mean_user_rating[:, np.newaxis])
    num=user_similarity.dot(ratings_diff)
    denom=np.array([np.abs(user_similarity).sum(axis=1)]).T
    a=[]
    for i in range(num.shape[0]):
        for j in range(num.shape[1]):
                a.append(num[i][j]/denom[i][0])
    a=np.reshape(a,(944,1682))
    pred = mean_user_rating[:, np.newaxis] + a
    return pred

In [33]:
item_prediction = PredictItem(train_data_matrix, item_similarity)
user_prediction = PredictUser(train_data_matrix, user_similarity)

In [36]:
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    c=0
    for i in range(len(prediction)):
        d=(prediction[i]-ground_truth[i])**2
        c=c+d
    c=c/len(prediction)
    return sqrt(c)

In [37]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 3.1302865331536407
Item-based CF RMSE: 3.458814905046522


In [38]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds


u, s, vt = svds(train_data_matrix)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('SVD RMSE: ' + str(rmse(X_pred, test_data_matrix)))

SVD RMSE: 2.720353695243045
