## <span style="color:royalblue">Movie Classification</span>

In [1]:
import pandas as pd
import numpy as np
import os
from scipy.sparse import coo_matrix, csr_matrix
from collections import namedtuple
from sklearn.decomposition import NMF
from sklearn.metrics import accuracy_score, confusion_matrix

### <span style="color:royalblue">Step 1. Load the movie ratings data (as in the HW3-recommender-system) and use matrix factorization technique(s) and predict the missing ratings from the test data. Measure the RMSE. You should use sklearn library. </span>

In [2]:
# Read the four course CSVs in one pass; the tuple order matches the unpacking order.
movies, test, train, users = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/movies.csv',
        'data/test.csv',
        'data/train.csv',
        'data/users.csv',
    )
)

In [3]:
# Show the first two rows of every table to check the columns that were loaded.
display(movies.head(2), test.head(2), train.head(2), users.head(2))

Unnamed: 0,mID,title,year,Doc,Com,Hor,Adv,Wes,Dra,Ani,...,Chi,Cri,Thr,Sci,Mys,Rom,Fil,Fan,Act,Mus
0,1,Toy Story,1995,0,1,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0


Unnamed: 0,uID,mID,rating
0,2233,440,4
1,4274,587,5


Unnamed: 0,uID,mID,rating
0,744,1210,5
1,3040,1584,4


Unnamed: 0,uID,gender,age,accupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072


For this part of the assignment, I will use the same methodology as in the HW3 recommender-system assignment.

In [4]:
# Bundle the four frames into one record so they can be passed around together.
Data = namedtuple('Data', ('users', 'movies', 'train', 'test'))
data = Data(users=users, movies=movies, train=train, test=test)

In [5]:
class RecSys():
    """Baseline and similarity-based rating predictors over a dense user x movie matrix.

    `data` must expose four DataFrames as attributes: `users` (column `uID`),
    `movies` (columns `mID`, `title`, `year` plus one column per genre) and
    `train` / `test` (columns `uID`, `mID`, `rating`).
    """

    def __init__(self,data):
        """Index users and movies and build the training rating matrix."""
        self.data=data
        self.allusers = list(self.data.users['uID'])
        self.allmovies = list(self.data.movies['mID'])
        # Every movie column that is not an identifier/title/year is treated as a genre flag.
        self.genres = list(self.data.movies.columns.drop(['mID', 'title', 'year']))
        # Raw id -> row/column position in the rating matrix.
        self.mid2idx = dict(zip(self.data.movies.mID, range(len(self.data.movies))))
        self.uid2idx = dict(zip(self.data.users.uID, range(len(self.data.users))))
        self.Mr=self.rating_matrix()
        self.Mm=None
        # Movie-movie similarity matrix; all zeros until something else fills it in.
        self.sim=np.zeros((len(self.allmovies),len(self.allmovies)))

    def rating_matrix(self):
        """
        Convert the training ratings to a numpy array of shape (#allusers,#allmovies).
        User/movie pairs without a training rating are stored as 0.
        """
        ind_movie = [self.mid2idx[x] for x in self.data.train.mID]
        ind_user = [self.uid2idx[x] for x in self.data.train.uID]
        rating_train = list(self.data.train.rating)
        sparse = coo_matrix((rating_train, (ind_user, ind_movie)),
                            shape=(len(self.allusers), len(self.allmovies)))
        # toarray() already returns a dense ndarray, so no extra np.array() copy is needed.
        return sparse.toarray()

    def predict_everything_to_3(self):
        """
        Predict everything to 3 for the test data.
        Returns a float numpy array with one entry per test row.
        """
        return np.ones(len(self.data.test))*3

    def predict_to_user_average(self):
        """
        Predict each test row as the average training rating of its user.
        Returns a numpy array with one entry per test row; users with no
        training ratings get NaN (which `rmse` later imputes to 3).
        """
        rating_sum = self.Mr.sum(axis=1)
        rating_count = (self.Mr>0).sum(axis=1)
        # Divide only where the user has at least one rating; the rest stay NaN
        # instead of triggering a divide-by-zero warning.
        user_avg = np.divide(rating_sum, rating_count,
                             out=np.full(len(rating_sum), np.nan),
                             where=rating_count>0)
        return user_avg[[self.uid2idx[x] for x in self.data.test.uID]]

    def predict_from_sim(self,uid,mid):
        """
        Predict a user rating on a movie given userID and movieID.

        The prediction is the similarity-weighted average of the ratings the
        user has given, using the row of `self.sim` for `mid` as weights.
        Returns NaN when the weights over the user's rated movies sum to 0,
        so that `rmse` can impute it (a division would give NaN or +/-inf).
        """
        user_ratings = self.Mr[self.uid2idx[uid]]
        sim_scores = self.sim[self.mid2idx[mid]]
        # Only movies the user actually rated contribute to the normaliser.
        weight_total = np.dot(sim_scores,user_ratings>0)
        if weight_total == 0:
            return np.nan
        return np.dot(user_ratings,sim_scores)/weight_total

    def predict(self):
        """
        Predict ratings in the test data. Returns predicted rating in a numpy array of size (# of rows in testdata,)
        """
        test = self.data.test
        # Iterate the two id columns directly rather than building a Series per row with .iloc.
        return np.array([self.predict_from_sim(uid,mid)
                         for uid, mid in zip(test.uID, test.mID)])

    def rmse(self,yp):
        """
        Root-mean-squared error of predictions `yp` against the test ratings.
        NaN predictions are scored as 3. `yp` itself is left untouched.
        """
        yp = np.asarray(yp, dtype=float)
        # np.where builds a new array, so the caller's predictions keep their NaNs.
        yp = np.where(np.isnan(yp), 3, yp)
        yt=np.array(self.data.test.rating)
        return np.sqrt(((yt-yp)**2).mean())

In [6]:
# Build the recommender; its constructor also builds the dense training rating matrix.
rs = RecSys(data)
# User x movie rating matrix; pairs with no training rating are stored as 0.
mat = rs.Mr

In [7]:
from sklearn.decomposition import TruncatedSVD

In [8]:
# Truncated SVD of the rating matrix with 18 components, n_iter=7 and a fixed seed.
# NOTE: `mat` holds 0 for every user/movie pair without a training rating, so the
# factorization fits those zeros as if they were real ratings.
model = TruncatedSVD(n_components=18, n_iter=7, random_state=42)
# Reduced representation of each row (user) of `mat`.
transform = model.fit_transform(mat)
# Fitted components of the decomposition (not used again in the cells shown here).
components = model.components_

In [9]:
# Map the reduced representation back to the original movie space, giving a dense
# approximation of `mat` with a value for every user/movie pair.
user_data = model.inverse_transform(transform)
# Sanity check: should match the shape of the rating matrix.
user_data.shape

(6040, 3883)

In [10]:
# predictions (using the same formula as above):
predictions = []
for i in range(len(rs.data.test)):
    x = rs.data.test.iloc[i]
    mid=x.mID
    uid=x.uID
    predictions.append(user_data[rs.uid2idx[uid],rs.mid2idx[mid]])
predictions = np.array(predictions)

In [11]:
# Same scoring rule as RecSys.rmse: NaN predictions count as 3, then take the
# root of the mean squared difference against the true test ratings.
predictions = np.where(np.isnan(predictions), 3, predictions)
yt = rs.data.test.rating.to_numpy()
squared_error = np.square(yt - predictions)
rmse = np.sqrt(squared_error.mean())
print("RMSE:", rmse)

RMSE: 2.831671354161693


2. Discuss the results and why sklearn's non-negative matrix factorization library did not work well compared to simple baseline or similarity-based methods we’ve done in Module 3. Can you suggest a way(s) to fix it?

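The RMSE is very high (compared to the week 3 homework, where it was around or below 1). The approach did not work because the rating matrix is very sparse: every user/movie pair without a rating is stored as 0, so the factorization treats those zeros as real ratings and pulls the reconstructed values toward 0, away from the actual ratings.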

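One method that might fix this problem is PCA, which would reduce the dimension of the matrix and could result in a better-performing model. Another is to fill in the missing entries (for example, with each user's average rating) before factorizing, so that the zeros no longer dominate the fit.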