In [3]:

import warnings

import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error

# NOTE(review): this silences every warning raised in the notebook.
warnings.filterwarnings('ignore')

In [4]:
# Single place to change when the data lives somewhere other than this Drive folder.
DATA_DIR = "/content/drive/MyDrive/5510_movies"

# Movie and user metadata, plus the train / test rating splits.
movies = pd.read_csv(f"{DATA_DIR}/movies.csv")
users = pd.read_csv(f"{DATA_DIR}/users.csv")
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")


In [5]:
# Preview the first ten rows of the movies table.
movies.head(10)

Unnamed: 0,mID,title,year,Doc,Com,Hor,Adv,Wes,Dra,Ani,...,Chi,Cri,Thr,Sci,Mys,Rom,Fil,Fan,Act,Mus
0,1,Toy Story,1995,0,1,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,3,Grumpier Old Men,1995,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II,1995,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,6,Heat,1995,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,1,0
6,7,Sabrina,1995,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,8,Tom and Huck,1995,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8,9,Sudden Death,1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,10,GoldenEye,1995,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [6]:
# Preview the first rows of the users table (head() defaults to five rows).
users.head()

Unnamed: 0,uID,gender,age,accupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [7]:
# Show the first rows of the train and test rating splits, one table after the other.
print(train.head(), test.head(), sep="\n")


    uID   mID  rating
0   744  1210       5
1  3040  1584       4
2  1451  1293       5
3  5455  3176       2
4  2507  3074       5
    uID   mID  rating
0  2233   440       4
1  4274   587       5
2  2498   454       3
3  2868  2336       5
4  1636  2686       5


**Let us use matrix factorization (NMF).**

In [8]:
# ID lists in table order; their lengths fix the shape of the rating matrix.
all_users = list(users['uID'])
all_movies = list(movies['mID'])

# Map each raw ID to its 0-based position in the movies / users table.
# That position is the column (movie) / row (user) index in the rating matrix.
mid2idx = {mid: idx for idx, mid in enumerate(movies.mID)}
uid2idx = {uid: idx for idx, uid in enumerate(users.uID)}

# Translate every training rating into (row, column, value) coordinates.
movie = [mid2idx[x] for x in train.mID]
user = [uid2idx[x] for x in train.uID]
rating_train = list(train.rating)

# Dense users x movies matrix; cells without a training rating stay 0.
# toarray() already returns an ndarray, so no extra np.array() copy is made.
# NOTE(review): coo_matrix sums duplicate (user, movie) entries, so this
# assumes each pair appears at most once in train -- TODO confirm.
Mr = coo_matrix((rating_train, (user, movie)), shape=(len(all_users), len(all_movies))).toarray()

In [9]:
# Sanity check: rows are users, columns are movies.
print(Mr.shape)

(6040, 3883)


In [10]:
# Density of the training matrix: fraction of cells holding a (non-zero) rating.
np.count_nonzero(Mr) / Mr.size

0.029852745794625237

In [11]:
# Factorize the users x movies matrix into 20 latent factors.
# random_state is fixed so the factorization, and the RMSE computed from it,
# is reproducible across kernel restarts.
model_nmf = NMF(n_components=20, random_state=0)
# W holds one row of latent-factor weights per user (rows of Mr).
W = model_nmf.fit_transform(Mr)
# H holds one column of latent-factor weights per movie (columns of Mr).
H = model_nmf.components_

In [12]:
# Translate every test rating into (row, column, value) coordinates using the
# same ID -> index maps as the training matrix, so both matrices line up.
movie_test = [mid2idx[m] for m in test.mID]
user_test = [uid2idx[u] for u in test.uID]
rating_test = list(test.rating)

# Dense users x movies matrix of held-out ratings; cells without a test rating stay 0.
# toarray() already returns an ndarray, so no extra np.array() copy is made.
Mr_test = coo_matrix((rating_test, (user_test, movie_test)), shape=(len(all_users), len(all_movies))).toarray()

# Density of the test matrix: fraction of cells holding a (non-zero) rating.
print(np.count_nonzero(Mr_test) / Mr_test.size)



0.012794052185362243


In [13]:
# Reconstruct the dense users x movies score matrix from the NMF factors.
# W @ H is the same product that H.T.dot(W.T).T spells out with extra transposes.
predictions = W @ H

# Score only the cells that hold a test rating; 0 marks "no rating" in Mr_test.
test_rows, test_cols = Mr_test.nonzero()
rmse = np.sqrt(mean_squared_error(Mr_test[test_rows, test_cols], predictions[test_rows, test_cols]))
print('rmse: ', rmse)

rmse:  2.854751147491333


# Part - 2



*   The RMSE of 2.85 above is poor compared with what we got in the previous assignment (around 1.2).

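*  NMF performed poorly because unknown ratings are stored as 0, and the factorization treats them as real ratings of 0. Only about 3% of the training matrix is filled, so the model is pulled towards predicting values near 0, while the true test ratings lie between 1 and 5; this produces a large RMSE. To reduce this error, we can fill all unknown ratings with the average rating (or 3) and then perform NMF on the filled matrix.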



Mounted at /content/drive
