In [2]:
import os

In [6]:
os.chdir('/Users/admin/Documents/GIT/Book _Recommend_CF_KNN')

In [7]:
os.getcwd()

'/Users/admin/Documents/GIT/Book _Recommend_CF_KNN'

In [8]:
os.scandir

<function posix.scandir>

In [17]:
import pandas as pd
import numpy as np
import scipy as sp

In [11]:
data = pd.read_csv('BX-Book-Ratings.csv', header = 0, sep = ';', names = ['user', 'isbn', 'rating'],\
                   encoding = "latin1")

In [12]:
data.head(5)

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [13]:
books = pd.read_csv('BX-Books.csv', sep = ';', header=0, names = ['isbn', 'title', 'author'], encoding = "latin1", \
                    error_bad_lines=False, usecols = [0,1,2], index_col = 0)

In [14]:
books.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


## Shaping up the data to remove too much sparsity

In [24]:
userPerISBN = data.isbn.value_counts()

In [25]:
ISBNsPerUser = data.user.value_counts()

## To avoid a sparse matrix let's reduce the data to include only books with atleast 10 users
## & Users who have read atleast 10 books

In [26]:
data = data[data.user.isin(ISBNsPerUser[ISBNsPerUser > 10].index)]

In [27]:
data = data[data.isbn.isin(userPerISBN[userPerISBN >10].index)]

In [28]:
data.shape

(419407, 3)

## Creating a rating matrix using coo_matrix()
> spl way to store matrix data makes it easy to find values which are non null
>> coo_matrix((values, (rowsource, columnsoure)))...rowsource = user and columnsource = isbn, values = ratings

In [29]:
from scipy.sparse import coo_matrix
data['user'] = data['user'].astype('category')
data['isbn'] = data['isbn'].astype('category')

R = coo_matrix((data['rating'].astype('float'), (data['user'].cat.codes.copy(), data['isbn'].cat.codes.copy())))

In [30]:
R.shape

(105281, 339338)

In [34]:
# R.data is an array that contains only thr non-null value
len(R.data)

419407

In [32]:
R.data[0]

0.0

In [35]:
# Obtain coordinates for element [0] of the data array
R.row[0], R.col[0]

(104449, 56863)

## Initialize the factor matrices

### M = # of users, N = # of Products or ISBNs & K = Number of factors

In [38]:
M, N = R.shape
K = 3

In [40]:
P = np.random.rand(M,K)
Q = np.random.rand(K,N)

In [48]:
from numpy.linalg import norm

def error(R,P,Q,lamda=0.02):
    ratings = R.data
    rows = R.row
    cols = R.col
    e = 0 
    for ui in range(len(ratings)):
        rui=ratings[ui]
        u = rows[ui]
        i = cols[ui]
        if rui>0:
            e= e + pow(rui-np.dot(P[u,:],Q[:,i]),2)+\
                lamda*(pow(norm(P[u,:]),2)+pow(norm(Q[:,i]),2))
    return e

In [50]:
error(R,P,Q)

7407793.4461257961

In [51]:
rmse = np.sqrt(error(R,P,Q)/len(R.data))

In [52]:
rmse

4.2026826277131724

In [55]:
def SGD(R, K, lamda=0.02,steps=10, gamma=0.001):
    
    M,N = R.shape
    P = np.random.rand(M,K)
    Q = np.random.rand(K,N)
    
    rmse = np.sqrt(error(R,P,Q,lamda)/len(R.data))
    print("Initial RMSE: "+str(rmse))
    
    for step in range(steps):
        for ui in range(len(R.data)):
            rui=R.data[ui]
            u = R.row[ui]
            i = R.col[ui]
            if rui>0:
                eui=rui-np.dot(P[u,:],Q[:,i])
                P[u,:]=P[u,:]+gamma*2*(eui*Q[:,i]-lamda*P[u,:])
                Q[:,i]=Q[:,i]+gamma*2*(eui*P[u,:]-lamda*Q[:,i])
        rmse = np.sqrt(error(R,P,Q,lamda)/len(R.data))
        if rmse<0.5:
            break
    print("Final RMSE: "+str(rmse))
    return P,Q

In [56]:
(P,Q)=SGD(R,K=2,gamma=0.0007,lamda=0.01, steps=100)

Initial RMSE: 4.34754586938
Final RMSE: 0.795599938619
