In [10]:
import numpy as np
import pandas as pd
import math
from helperFunctions import *
from __future__ import division
import random

# Column layout shared by every tab-separated ratings file read in this notebook.
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('u.data', sep='\t', names=header)

# Sorted distinct ids; they become the row / column labels of the template.
user_ids = np.sort(df["user_id"].unique())
item_ids = np.sort(df["item_id"].unique())

n_users = user_ids.shape[0]
n_items = item_ids.shape[0]
print ('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))

# All-zero users x items frame, labelled by id, that rating matrices are copied from.
template = pd.DataFrame(np.zeros((n_users, n_items)), index=user_ids, columns=item_ids)

Number of users = 943 | Number of movies = 1682


In [12]:
import numpy as np
import pandas as pd
import math

def rating_matrix(ui, template):
    """
    Build a dense rating matrix from a long-format ratings table.

    input:
        ui: DataFrame with "user_id", "item_id" and "rating" columns
            (e.g. one of the u1-u5 base/test splits)
        template: DataFrame whose index holds the user ids and whose columns
            hold the item ids; its values are the fill for unrated cells
            (all zeros in this notebook). It is not modified.
    output: np.array with the same shape as template (943 x 1682 here) where
        cell (user, item) holds that pair's rating
    raises: KeyError if ui contains a user_id / item_id that is not a label
        of template (the row-wise .loc version silently grew the matrix instead)
    """
    # If a (user, item) pair occurs more than once, keep the last occurrence,
    # which is what sequential row-by-row assignment would have produced.
    pairs = ui.drop_duplicates(subset=["user_id", "item_id"], keep="last")

    # Translate id labels to positional indices in one shot; -1 marks "not found".
    row_pos = template.index.get_indexer(pairs["user_id"])
    col_pos = template.columns.get_indexer(pairs["item_id"])
    unknown = (row_pos < 0) | (col_pos < 0)
    if unknown.any():
        raise KeyError(
            "{} rating(s) reference a user_id/item_id missing from the template".format(
                int(unknown.sum())
            )
        )

    # np.array(...) copies, so the caller's template stays untouched.
    result = np.array(template)
    result[row_pos, col_pos] = np.asarray(pairs["rating"])
    return result


def matrix_factorization(R, P, Q, K, steps=20, alpha=0.0002, beta=0.02):
    """
    Factorise R into P x Q.T by stochastic gradient descent over the
    observed (strictly positive) entries of R.

    input:
        R: ratings, n_users x n_items; entries <= 0 are treated as missing
        P: initial user factors, n_users x K
        Q: initial item factors, n_items x K
        K: number of leading factor columns to update (normally P.shape[1])
        steps: maximum number of passes over the observed entries
        alpha: learning rate
        beta: L2 regularisation weight
    output: P, Q such that R is approximated by P x Q.T

    The inputs are not modified: training runs on private float copies.
    """
    R = np.asarray(R)
    P = np.array(P, dtype=float)
    Qt = np.array(Q, dtype=float).T  # K x n_items, writable copy

    # Visit only the observed cells, in the same row-major order a full
    # nested scan of R would reach them.
    rows, cols = np.nonzero(R > 0)
    ratings = R[rows, cols]

    for _ in range(steps):
        for i, j, r in zip(rows, cols, ratings):
            eij = r - np.dot(P[i, :], Qt[:, j])
            # P is updated first and the Q update deliberately reads the
            # already-updated P row; this matches the per-factor scalar loop.
            P[i, :K] += alpha * (2 * eij * Qt[:K, j] - beta * P[i, :K])
            Qt[:K, j] += alpha * (2 * eij * P[i, :K] - beta * Qt[:K, j])

        # Regularised squared error over the observed cells (the penalty is
        # counted once per observed rating, not once per user/item).
        residual = ratings - np.sum(P[rows, :] * Qt[:, cols].T, axis=1)
        e = np.sum(residual ** 2) + (beta / 2) * (
            np.sum(P[rows, :K] ** 2) + np.sum(Qt[:K, cols] ** 2)
        )
        if e < 0.001:
            break
    return P, Qt.T

def content_based_score(R, P, Q, K, steps=20, alpha=0.0002, beta=0.02):
    """
    Fit only the user factors P, keeping the item factors Q fixed, so that
    R is approximated by P x Q.T on its observed (strictly positive) entries.

    input:
        R: ratings, n_users x n_items; entries <= 0 are treated as missing
        P: initial user factors, n_users x K
        Q: fixed item factors, n_items x K (read only)
        K: number of leading factor columns of P to update (normally P.shape[1])
        steps: maximum number of passes over the observed entries
        alpha: learning rate
        beta: L2 regularisation weight on P
    output: only P such that R is approximated by P x Q.T

    The inputs are not modified: P is trained on a private float copy.
    """
    R = np.asarray(R)
    P = np.array(P, dtype=float)
    Qt = np.asarray(Q, dtype=float).T  # K x n_items; never written to

    # Visit only the observed cells, in row-major order.
    rows, cols = np.nonzero(R > 0)
    ratings = R[rows, cols]

    for _ in range(steps):
        for i, j, r in zip(rows, cols, ratings):
            eij = r - np.dot(P[i, :], Qt[:, j])
            P[i, :K] += alpha * (2 * eij * Qt[:K, j] - beta * P[i, :K])

        # Regularised squared error over the observed cells; only P is
        # penalised because Q is not being learned here.
        residual = ratings - np.sum(P[rows, :] * Qt[:, cols].T, axis=1)
        e = np.sum(residual ** 2) + (beta / 2) * np.sum(P[rows, :K] ** 2)
        if e < 0.001:
            break
    return P

def user_based_score(R, P, Q, K, steps=20, alpha=0.0002, beta=0.02):
    """
    Fit only the item factors Q, keeping the user factors P fixed, so that
    R is approximated by P x Q.T on its observed (strictly positive) entries.

    input:
        R: ratings, n_users x n_items; entries <= 0 are treated as missing
        P: fixed user factors, n_users x K (read only)
        Q: initial item factors, n_items x K
        K: number of leading factor columns of Q to update (normally Q.shape[1])
        steps: maximum number of passes over the observed entries
        alpha: learning rate
        beta: L2 regularisation weight on Q
    output: Q (n_items x K) such that R is approximated by P x Q.T

    The inputs are not modified: Q is trained on a private float copy.
    """
    R = np.asarray(R)
    P = np.asarray(P, dtype=float)   # never written to
    Qt = np.array(Q, dtype=float).T  # K x n_items, writable copy

    # Visit only the observed cells, in row-major order.
    rows, cols = np.nonzero(R > 0)
    ratings = R[rows, cols]

    for _ in range(steps):
        for i, j, r in zip(rows, cols, ratings):
            eij = r - np.dot(P[i, :], Qt[:, j])
            Qt[:K, j] += alpha * (2 * eij * P[i, :K] - beta * Qt[:K, j])

        # Regularised squared error over the observed cells; only Q is
        # penalised because P is not being learned here.
        residual = ratings - np.sum(P[rows, :] * Qt[:, cols].T, axis=1)
        e = np.sum(residual ** 2) + (beta / 2) * np.sum(Qt[:K, cols] ** 2)
        if e < 0.001:
            break
    return Qt.T


def sse(R, Rhat):
    """
    Sum of squared errors between R and Rhat, counted only on the cells
    where R is strictly positive.

    input: R and Rhat are np.array
    """
    observed = R > 0
    residual = (R - Rhat)[observed]
    squared = residual ** 2
    return squared.sum()

In [15]:
# Evaluate matrix factorization on the five predefined u1..u5 base/test
# splits, accumulating the squared test error across all folds.
K = 2  # number of latent factors
total_sse = 0
for i in range(1, 6):
    ubase = pd.read_csv("u{}.base".format(i), sep='\t', names=header)
    utest = pd.read_csv("u{}.test".format(i), sep='\t', names=header)
    train = rating_matrix(ubase, template)
    test = rating_matrix(utest, template)
    print (ubase.shape)
    print (utest.shape)
    print (len(train[train > 0]))
    print (len(test[test > 0]))
    # P and Q are drawn from NumPy's global generator, so that is the one
    # that has to be seeded; seeding the stdlib `random` module does not
    # affect np.random.rand and left every run non-reproducible.
    np.random.seed(2018 + i)
    print ("Matrix factorization on train set {} started ......".format(i))
    P = np.random.rand(n_users, K)
    Q = np.random.rand(n_items, K)
    Phat, Qhat = matrix_factorization(R = train, P = P, Q = Q, K = K)
    rating_hat = np.dot(Phat, Qhat.T)
    total_sse += sse(test, rating_hat)
    print ("Matrix factorization on train set {} ended".format(i))

(80000, 4)
(20000, 4)
80000
20000
Matrix factorization on train set 1 started ......
Matrix factorization on train set 1 ended
(80000, 4)
(20000, 4)
80000
20000
Matrix factorization on train set 2 started ......
Matrix factorization on train set 2 ended
(80000, 4)
(20000, 4)
80000
20000
Matrix factorization on train set 3 started ......
Matrix factorization on train set 3 ended
(80000, 4)
(20000, 4)
80000
20000
Matrix factorization on train set 4 started ......
Matrix factorization on train set 4 ended
(80000, 4)
(20000, 4)
80000
20000
Matrix factorization on train set 5 started ......
Matrix factorization on train set 5 ended


In [16]:
# Pooled RMSE over all folds: total squared test error divided by the number
# of rows in df. NOTE(review): assumes the five test splits together cover
# each row of df exactly once - confirm against the split files.
math.sqrt(total_sse / len(df))

1.2214791611410365