# Importing Libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
!wc -l u.data

100003 u.data


In [3]:
names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('u.data', sep='\t', names=names)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742


In [4]:
movie_titles = pd.read_csv("Movie_Id_Titles")
movie_titles.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [5]:
df = pd.merge(df,movie_titles,on='item_id')
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,290,50,5,880473582,Star Wars (1977)
2,79,50,4,891271545,Star Wars (1977)
3,2,50,5,888552084,Star Wars (1977)
4,8,50,5,879362124,Star Wars (1977)


In [6]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print (str(n_users) + ' users')
print (str(n_items) + ' items')

944 users
1682 items


### Making user item matrix

In [7]:
ratings = np.zeros((n_users, n_items))
for row in df.itertuples():
    ratings[row[1]-1, row[2]-1] = row[3]
ratings

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [8]:
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1])
sparsity *= 100
print ('Sparsity: {:4.2f}%'.format(sparsity))

Sparsity: 6.30%


### Splitting into test set and training set

In [13]:
def train_test_split(ratings):
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        test_ratings = np.random.choice(ratings[user, :].nonzero()[0], 
                                        size=10, 
                                        replace=True)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings[user, test_ratings]
        
    # Test and training are truly disjoint
    assert(np.all((train * test) == 0)) 
    return train, test

In [14]:
train, test = train_test_split(ratings)

In [15]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train, metric='cosine')
item_similarity = pairwise_distances(train.T, metric='cosine')

In [16]:
print(user_similarity)
print (item_similarity)

[[0.         0.81662087 0.97067758 ... 0.83706396 0.62854848 1.        ]
 [0.81662087 0.         0.9178093  ... 0.84337563 0.88248319 1.        ]
 [0.97067758 0.9178093  0.         ... 0.94594245 0.96912414 1.        ]
 ...
 [0.83706396 0.84337563 0.94594245 ... 0.         0.85190893 1.        ]
 [0.62854848 0.88248319 0.96912414 ... 0.85190893 0.         1.        ]
 [1.         1.         1.         ... 1.         1.         0.        ]]
[[0.         0.58750554 0.69205478 ... 1.         0.9490931  0.9490931 ]
 [0.58750554 0.         0.74734059 ... 1.         0.91970684 0.91970684]
 [0.69205478 0.74734059 0.         ... 1.         1.         0.89776027]
 ...
 [1.         1.         1.         ... 0.         1.         1.        ]
 [0.9490931  0.91970684 1.         ... 1.         0.         1.        ]
 [0.9490931  0.91970684 0.89776027 ... 1.         1.         0.        ]]


### Prediting Values  

In [17]:
def predict(ratings, similarity, kind='user'):
    if kind == 'user':
        return similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'item':
        return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])

In [18]:
user_pred=predict(train, user_similarity, kind='user')
item_pred=predict(train, item_similarity, kind='item')

In [19]:
print(user_pred)

[[1.42105729e+00 3.26230819e-01 2.12207585e-01 ... 2.56338474e-03
  2.10987805e-03 1.89706277e-03]
 [1.56883514e+00 4.38774898e-01 2.53112535e-01 ... 1.92325622e-03
  3.19720570e-03 3.39021116e-03]
 [1.63532583e+00 4.34132130e-01 2.62468563e-01 ... 1.26689831e-03
  3.16917022e-03 3.37018646e-03]
 ...
 [1.56595686e+00 3.97436387e-01 2.55476700e-01 ... 2.00892156e-03
  2.75860312e-03 3.22411834e-03]
 [1.44599207e+00 3.18819451e-01 2.17218503e-01 ... 2.58880189e-03
  2.07260503e-03 2.27544345e-03]
 [1.59915164e+00 4.22057264e-01 2.59809120e-01 ... 2.12089077e-03
  3.18133616e-03 3.18133616e-03]]


In [20]:
print(item_pred)

[[0.43605271 0.45993968 0.48839865 ... 0.56532555 0.5508483  0.5452238 ]
 [0.09405238 0.1136001  0.10725901 ... 0.11549369 0.11684202 0.11748322]
 [0.06940067 0.07352766 0.07033525 ... 0.06698354 0.07174712 0.07281834]
 ...
 [0.140961   0.15230982 0.16315926 ... 0.17074351 0.16922383 0.17218836]
 [0.23706228 0.23098396 0.26822104 ... 0.32305118 0.30969883 0.31122361]
 [0.         0.         0.         ... 0.         0.         0.        ]]


### Calculating Mean Square Error

In [25]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def get_mse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

In [27]:
print ('User-based CF RMSE: ' + str(sqrt(get_mse(user_pred, test))))

User-based CF RMSE: 3.1849427002773663


In [28]:
print ('Item-based CF RMSE: ' + str(sqrt(get_mse(item_pred, test))))

Item-based CF RMSE: 3.5926258421502366


# Finished