In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from sklearn.metrics import precision_score, recall_score

In [2]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('/ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('/ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')

In [3]:
ratings_base

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
90565,943,1047,2,875502146
90566,943,1074,4,888640250
90567,943,1188,3,888640250
90568,943,1228,3,888640275


In [10]:
rate_train = ratings_base.values
rate_test = ratings_test.values

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

In [11]:
class MF(object):
    def __init__(self, Y_data, K, lam = 0.1, Xinit = None, Winit = None,
            learning_rate = 0.5, max_iter = 1000, print_every = 100, user_based = 1):
        self.Y_raw_data = Y_data
        self.K = K
        self.lam = lam
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.print_every = print_every
        self.user_based = user_based
        self.n_users = int(np.max(Y_data[:, 0])) + 1
        self.n_items = int(np.max(Y_data[:, 1])) + 1
        self.n_ratings = Y_data.shape[0]

        if Xinit is None: 
            self.X = np.random.randn(self.n_items, K)
        else: 
            self.X = Xinit

        if Winit is None:
            self.W = np.random.randn(K, self.n_users)
        else: 
            self.W = Winit

        self.Y_data_n = self.Y_raw_data.copy()


    def normalize_Y(self):
        user_col = 0
        item_col = 1
        n_objects = self.n_users
        users = self.Y_raw_data[:, user_col]
        self.mu = np.zeros((n_objects,))
        for n in range(n_objects):
            ids = np.where(users == n)[0].astype(np.int32)
            item_ids = self.Y_data_n[ids, item_col]
            ratings = self.Y_data_n[ids, 2]
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0
            self.mu[n] = m
            self.Y_data_n[ids, 2] = ratings - self.mu[n]
    def loss(self):
        L = 0
        for i in range(self.n_ratings):
            n, m, rate = int(self.Y_data_n[i, 0]), int(self.Y_data_n[i, 1]), self.Y_data_n[i, 2]
            L += 0.5*(rate - self.X[m, :].dot(self.W[:, n]))**2

        L /= self.n_ratings
        L += 0.5*self.lam*(np.linalg.norm(self.X, 'fro') + np.linalg.norm(self.W, 'fro'))
        return L
    def get_items_rated_by_user(self, user_id):
        ids = np.where(self.Y_data_n[:,0] == user_id)[0]
        item_ids = self.Y_data_n[ids, 1].astype(np.int32) 
        ratings = self.Y_data_n[ids, 2]
        return (item_ids, ratings)
    def updateW(self):
        for n in range(self.n_users):
            item_ids, ratings = self.get_items_rated_by_user(n)
            Xn = self.X[item_ids, :]
            grad_wn = -Xn.T.dot(ratings - Xn.dot(self.W[:, n]))/self.n_ratings + \
                        self.lam*self.W[:, n]
            self.W[:, n] -= self.learning_rate*grad_wn.reshape((self.K,))
    def fit(self):
        self.normalize_Y()
        for it in range(self.max_iter):
            self.updateW()
            if (it + 1) % self.print_every == 0:
                rmse_train = self.evaluate_RMSE(self.Y_raw_data)
                print( 'iter =', it + 1, ', loss =', self.loss(), ', RMSE train =', rmse_train)
    def pred(self, u, i):
        u = int(u)
        i = int(i)
        if self.user_based:
            bias = self.mu[u]
        else:
            bias = self.mu[i]
        pred = self.X[i, :].dot(self.W[:, u]) + bias
        if pred < 0:
            return 0
        if pred > 5:
            return 5
        return pred


    def pred_for_user(self, user_id):
        ids = np.where(self.Y_data_n[:, 0] == user_id)[0]
        items_rated_by_u = self.Y_data_n[ids, 1].tolist()

        y_pred = self.X.dot(self.W[:, user_id]) + self.mu[user_id]
        predicted_ratings= []
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                predicted_ratings.append((i, y_pred[i]))

        return predicted_ratings
    def evaluate_RMSE(self, rate_test):
        n_tests = rate_test.shape[0]
        SE = 0 # squared error
        for n in range(n_tests):
            pred = self.pred(rate_test[n, 0], rate_test[n, 1])
            SE += (pred - rate_test[n, 2])**2

        RMSE = np.sqrt(SE/n_tests)
        return RMSE
    def precision_recall(self, rate_test, threshold=4):
        n_tests = rate_test.shape[0]
        true_positives = 0
        false_positives = 0
        false_negatives = 0

        for n in range(n_tests):
            user_id, item_id, true_rating = int(rate_test[n, 0]), int(rate_test[n, 1]), rate_test[n, 2]
            pred_rating = self.pred(user_id, item_id)

            if true_rating >= threshold and pred_rating >= threshold:
                true_positives += 1
            elif true_rating >= threshold and pred_rating < threshold:
                false_negatives += 1
            elif true_rating < threshold and pred_rating >= threshold:
                false_positives += 1

        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

        return precision, recall

In [12]:
rs = MF(rate_train, K = 10, lam = .1, print_every = 10, learning_rate = 0.75, max_iter = 100, user_based = 1)
rs.fit()

iter = 10 , loss = 10.005548853893796 , RMSE train = 1.5949978835251288
iter = 20 , loss = 7.960475092488835 , RMSE train = 1.2085316650691824
iter = 30 , loss = 7.238816056974136 , RMSE train = 1.0718038311970455
iter = 40 , loss = 6.952800271395312 , RMSE train = 1.0391232228786005
iter = 50 , loss = 6.831279656609473 , RMSE train = 1.0321629922305138
iter = 60 , loss = 6.777795931562965 , RMSE train = 1.0306671103065526
iter = 70 , loss = 6.753956906578253 , RMSE train = 1.0303287408769441
iter = 80 , loss = 6.743478177857028 , RMSE train = 1.0302447706197468
iter = 90 , loss = 6.739234658773512 , RMSE train = 1.030220963579524
iter = 100 , loss = 6.737846230646459 , RMSE train = 1.0302131362944635


In [13]:
RMSE = rs.evaluate_RMSE(rate_test)
precision, recall = rs.precision_recall(rate_test, threshold=4)
print ('\nUser-based MF, RMSE =', RMSE)
print("Precision:", precision)
print("Recall:", recall)


User-based MF, RMSE = 1.043103677377425
Precision: 0.8095830740510268
Recall: 0.23788626805631743
