In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import sklearn
from scipy.sparse import coo_matrix, csr_matrix
from scipy.spatial.distance import jaccard, cosine 
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import NMF
from pytest import approx

<center><b><big><big><big>PART 1</big></big></big></b></center>

<em>We factorized the rating matrix with NMF, using the number of genres as the latent dimension. We then built the item-item similarity matrix from the resulting factors and used it to get the predictions.</em>

In [None]:
# Root folder of the course material; every CSV is read relative to it.
path = 'Downloads/unsupervised/Files/Module3/'

# Read the four tables in a fixed order: users, movies, train split, test split.
MV_users, MV_movies, train, test = [
    pd.read_csv(path + relative_csv)
    for relative_csv in (
        'data/users.csv',
        'data/movies.csv',
        'data/train.csv',
        'data/test.csv',
    )
]

In [None]:
from collections import namedtuple

# Lightweight record that bundles the four DataFrames the recommender reads.
Data = namedtuple('Data', ['users','movies','train','test'])

# Fields are passed by keyword so the mapping to each DataFrame is explicit.
data = Data(
    users=MV_users,
    movies=MV_movies,
    train=train,
    test=test,
)

In [65]:
class RecSys():
    """Rating predictor driven by an item-item similarity matrix.

    The similarity matrix is obtained by factorizing the (movies x users)
    training rating matrix with NMF, turning each movie's latent factors into a
    boolean vector (factor > that movie's mean factor), and taking the Jaccard
    similarity between those boolean vectors.

    Attributes:
        data: object exposing ``users``, ``movies``, ``train`` and ``test``
            DataFrames. ``users`` needs a ``uID`` column; ``movies`` needs
            ``mID``, ``title`` and ``year``; ``train`` and ``test`` need
            ``uID``, ``mID`` and ``rating``.
        allusers: list of all user IDs, in ``data.users`` order.
        allmovies: list of all movie IDs, in ``data.movies`` order.
        genres: names of the ``data.movies`` columns other than
            ``mID``/``title``/``year``.
        mid2idx: movie ID -> column index in ``Mr`` / row index in ``sim``.
        uid2idx: user ID -> row index in ``Mr``.
        Mr: dense (#users, #movies) training rating matrix, 0 where the
            training data has no rating.
        Mm: placeholder, initialised to ``None`` and not used by this class.
        sim: (#movies, #movies) similarity matrix; all zeros until
            ``calc_item_item_similarity`` is called.
    """

    def __init__(self, data):
        self.data = data
        self.allusers = list(self.data.users['uID'])
        self.allmovies = list(self.data.movies['mID'])
        self.genres = list(self.data.movies.columns.drop(['mID', 'title', 'year']))
        # Positional index of every ID, following the row order of the tables.
        self.mid2idx = {mid: idx for idx, mid in enumerate(self.data.movies.mID)}
        self.uid2idx = {uid: idx for idx, uid in enumerate(self.data.users.uID)}
        self.Mr = self.rating_matrix()
        self.Mm = None
        self.sim = np.zeros((len(self.allmovies), len(self.allmovies)))

    def rating_matrix(self):
        """
        Build the dense training rating matrix.

        Returns:
            numpy array of shape (#allusers, #allmovies); entry (u, m) holds the
            training rating of user u for movie m and 0 where none exists.

        Raises:
            KeyError: if the training data references a user or movie ID that
                is missing from ``data.users`` / ``data.movies``.
        """
        train = self.data.train
        ind_movie = [self.mid2idx[x] for x in train.mID]
        ind_user = [self.uid2idx[x] for x in train.uID]
        rating_train = list(train.rating)
        shape = (len(self.allusers), len(self.allmovies))
        # toarray() already returns a fresh ndarray, so no extra copy is needed.
        return coo_matrix((rating_train, (ind_user, ind_movie)), shape=shape).toarray()

    def predict_everything_to_3(self):
        """
        Baseline: predict a rating of 3 for every row of the test data.

        Returns:
            numpy array of shape (# of rows in test data,) filled with 3.0.
        """
        return 3*np.ones(len(self.data.test.rating))

    def predict_to_user_average(self):
        """
        Baseline: predict each test row as the average training rating of its user.

        The average is the sum of a user's ratings divided by the number of
        movies the user rated with a value > 0. Users without any such rating
        get NaN (``rmse`` imputes NaN predictions to 3).

        Returns:
            numpy array of shape (# of rows in test data,).
        """
        rating_sum = self.Mr.sum(axis=1)
        rating_count = (self.Mr > 0).sum(axis=1)
        # Divide only where a user has ratings; this avoids the 0/0
        # RuntimeWarning while still yielding NaN for users without ratings.
        user_avg = np.full(rating_sum.shape, np.nan, dtype=float)
        np.divide(rating_sum, rating_count, out=user_avg, where=rating_count > 0)
        test_rows = np.array([self.uid2idx[x] for x in self.data.test.uID], dtype=int)
        return user_avg[test_rows]

    def predict_from_sim(self,uid,mid):
        """
        Predict the rating of user ``uid`` for movie ``mid``.

        The prediction is the similarity-weighted sum of the user's training
        ratings, divided by the total similarity between ``mid`` and the movies
        the user rated (rating > 0).

        Returns:
            The predicted rating as a float.
        """
        user_ratings = self.Mr[self.uid2idx[uid]]
        movie_sims = self.sim[self.mid2idx[mid]]
        rated = user_ratings > 0
        # The 0.01 offset keeps the denominator away from zero when the
        # similarities to the rated movies sum to 0.
        return np.dot(user_ratings, movie_sims)/(movie_sims[rated].sum()+0.01)

    def predict(self):
        """
        Predict ratings in the test data. Returns predicted rating in a numpy array of size (# of rows in testdata,)
        """
        test = self.data.test
        return np.array([self.predict_from_sim(uid, mid) for (uid, mid) in zip(test.uID, test.mID)])

    def rmse(self,yp):
        """
        Root-mean-squared error of predictions ``yp`` against the test ratings.

        NaN predictions are treated as 3. The caller's array is left untouched
        (the imputation happens on a copy).

        Args:
            yp: array-like of predictions, one per row of the test data.

        Returns:
            The RMSE as a float.
        """
        predicted = np.asarray(yp, dtype=float)
        predicted = np.where(np.isnan(predicted), 3.0, predicted)
        actual = np.array(self.data.test.rating)
        return np.sqrt(((actual-predicted)**2).mean())

    def Nmf(self, random_state=None):
        """
        Factorize the (movies x users) rating matrix with NMF.

        Args:
            random_state: forwarded to ``sklearn.decomposition.NMF``; pass an
                int to make the factorization reproducible. The default
                ``None`` leaves the estimator unseeded.

        Returns:
            numpy array of shape (#allmovies, #genres) with the latent factors
            of every movie.
        """
        movie_by_user = csr_matrix(self.Mr.T)
        model = NMF(n_components=len(self.genres), solver='mu',
                    beta_loss="frobenius", alpha_W=0.00005, alpha_H=0.00005,
                    l1_ratio=0.5, random_state=random_state).fit(movie_by_user)
        # transform() re-solves for W with the fitted components held fixed.
        return model.transform(movie_by_user)

    def calc_item_item_similarity(self, random_state=None):
        """
        Create item-item similarity using Jaccard similarity on NMF factors.

        Each movie's latent factors are converted to a boolean vector that is
        True where the factor exceeds that movie's mean factor value; ``sim``
        is then set to 1 - Jaccard distance between those vectors, i.e.
        J(A, B) = |A∩B| / |A∪B|.

        Args:
            random_state: forwarded to ``Nmf``.
        """
        factors = self.Nmf(random_state=random_state)
        # Row-wise threshold: compare every factor with its own movie's mean.
        above_mean = factors > factors.mean(axis=1, keepdims=True)
        self.sim = 1 - pairwise_distances(above_mean, metric="jaccard")
    
    

In [66]:
# Build the recommender and fit its item-item similarity matrix.
sample_cb = RecSys(data)
sample_cb.calc_item_item_similarity()

# Score the test split and report the RMSE.
sample_yp = sample_cb.predict()
sample_rmse = sample_cb.rmse(sample_yp)
print(sample_rmse)

# Spot-check a single (user, movie) prediction; shown as the cell's output.
sample_cb.predict_from_sim(uid=2026, mid=2436)

0.9698748083035741


3.558746929352515

In [None]:
# Disabled helper: prints the prediction for paired user / movie IDs.
# for a, b in zip(sample_MV_users.uID, sample_MV_movies.mID):
#     print(a, b, sample_cb.predict_from_sim(a,b))

# Sample tests for predict_from_sim in RecSys class
# NOTE(review): the recorded run above returned 3.5587... for (2026, 2436),
# which is outside the 1e-2 tolerance of the value expected below — confirm
# that these reference values apply to the NMF-based similarity.
assert sample_cb.predict_from_sim(245, 276) == approx(
    2.5128205128205128, abs=1e-2
), "Check predict_from_sim. Look at how you predicted a user rating on a movie given UserID and movieID."
assert sample_cb.predict_from_sim(2026, 2436) == approx(
    2.785714285714286, abs=1e-2
), "Check predict_from_sim. Look at how you predicted a user rating on a movie given UserID and movieID."

<center><b><big><big><big>PART 2</big></big></big></b></center>

 <em>With NMF the overall RMSE is better than that of the baseline methods, although some individual predictions are not very accurate. RMSE is the square root of the mean of the squared errors between the predicted and the actual ratings over all test entries.</em>