In [18]:
import pandas as pd #load data into table formats(data frames)
datafile='BX-CSV-Dump\\BX-Book-Ratings.csv'
data= pd.read_csv(datafile,sep=";",encoding = "ISO-8859-1",header=0,names=["user","isbn","rating"])

In [19]:
bookfile='BX-CSV-Dump\\BX-Books.csv'
#error_bad_lines : ignores any rows that have error: in this case rows with more columns than expected
#index_col : col names should be taken from row 0 isbn col
books = pd.read_csv(bookfile,encoding = "ISO-8859-1",sep=";",header=0,error_bad_lines=False,usecols=[0,1,2],index_col=0,names=["isbn","title","author"])

In [20]:
def favBooks(user, N):
    #get all data related to current user
    userRatings=data[data["user"]==user]
    #sort the current users ratings in descending order and pick top N rated books
    sortedRatings = pd.DataFrame.sort_values(userRatings,['rating'], ascending=0)[:N]
    #add the title column to the N sorted highly rated books
    sortedRatings["title"]=sortedRatings["isbn"].apply(bookMeta)
    return sortedRatings

In [30]:
#Some isbns may be in the rating dataframe but not in the book metadata, so reduce data to only isbns also present in books metadata dataframe
data = data[data["isbn"].isin(books.index)]

# Create a rating matrix using scipy as easier to find empty ratings

In [35]:
from scipy.sparse import coo_matrix 
data['user']=data['user'].astype("category")
data['isbn']=data['isbn'].astype("category")

R = coo_matrix((data['rating'].astype(float),
                (data['user'].cat.codes.copy(),
                 data['user'].cat.codes.copy())))

In [36]:
R.shape

(92107, 92107)

In [38]:
#R.data gets the non null elements of the matrix - this no is far less than prod of rows and cols
len(R.data)

1031175

In [39]:
#first element(rating) of data array 
R.data[0]

0.0

In [40]:
R.row[0]

91363

In [41]:
R.col[0]

91363

# Initialize Factor matrics

In [42]:
M,N = R.shape
K=3

In [43]:
#Generate the P and Q matrics with random numbers
import numpy as np
P = np.random.rand(M, K)
Q = np.random.rand(K, N)

In [54]:
#Compute Error 
from numpy.linalg import norm

def error(R, P, Q, lamda=0.02):
    ratings = R.data
    rows = R.row
    cols = R.col
    e=0
    for ui in range(len(ratings)):
        rui=ratings[ui]
        u = rows[ui]
        i = cols[ui]
        if rui>0 :
            e=e+pow(rui-np.dot(P[u,:], Q[:,i]), 2)+\
                    lamda*(pow(norm(Q[:,i]), 2))
    return e

In [55]:
error(R,P,Q)

19516481.63548536

In [56]:
#commute the root mean squared error
rmse =  np.sqrt(error(R,P,Q)/len(R.data))
rmse

4.350453949892627

## Reduce this error using the stochastic Gradient Descent  

In [61]:
#gamma is step size, determine gamma lammda by trial and error
def SGD(R, K, lamda=0.02, steps=1, gamma=0.001):
    M,N=R.shape
    P = np.random.rand(M, K)
    Q = np.random.rand(K, N)
    rmse =  np.sqrt(error(R,P,Q, lamda)/len(R.data))
    print("Intial RMSE: "+str(rmse))
    for step in range(steps):
        ratings = R.data
        rows = R.row
        cols = R.col
        #iterate throough each rating and update values of Pu and Qi
        for ui in range(len(R.data)):
            rui=ratings[ui]
            u = rows[ui]
            i = cols[ui]
            if rui>0 :
                eui = rui-np.dot(P[u,:], Q[:,i])
                P[u,:]=P[u,:]+gamma*2*(eui*Q[:,i]-lamda*P[u,:])
                Q[:,i]=Q[:,i]+gamma*2*(eui*P[u,:]-lamda*Q[:,i])
        rmse =  np.sqrt(error(R,P,Q,lamda)/len(R.data))
        if rmse<0.5:
            break
    print("Final RMSE: "+str(rmse))
    return P,Q

In [62]:
(P,Q)=SGD(R,K=2, gamma=0.0007, lamda=0.01, steps=2)

Intial RMSE: 4.492189899990504
Final RMSE: 3.3487574751010785


In [63]:
(P,Q)=SGD(R,K=2, gamma=0.0007, lamda=0.01, steps=100)

Intial RMSE: 4.497370739034269
Final RMSE: 1.2862262919853902
