In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy.spatial.distance import hamming
from numpy.linalg import norm

In [2]:
file='BookRatings.csv'
#f=file.data.decode('utf8')
rating_data=pd.read_csv(file,sep=";",encoding = "ISO-8859-1")
rating_data.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [3]:
books_data=pd.read_csv('Books.csv',sep=";",error_bad_lines=False,usecols=[0,1,2],index_col=[0],encoding = "ISO-8859-1")
books_data.head()

Unnamed: 0_level_0,Book-Title,Book-Author
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


In [4]:
def booksmetadata(isbn):
    return books_data[books_data.index==isbn]['Book-Title'],books_data[books_data.index==isbn]['Book-Author']
    

In [5]:
booksmetadata('034545104X')

(ISBN
 034545104X    Flesh Tones: A Novel
 Name: Book-Title, dtype: object, ISBN
 034545104X    M. J. Rose
 Name: Book-Author, dtype: object)

In [6]:
rating_data=rating_data[rating_data['ISBN'].isin(books_data.index)]

In [7]:
def favbooks(user,N):
    UserRating= rating_data[rating_data['User-ID']==user].sort_values('Book-Rating',ascending=False)[:N]
    #BookByUser=pd.merge(UserRating,books_data,on=UserRating['ISBN'])
    UserRating['Title']=UserRating['ISBN'].apply(booksmetadata)
    return UserRating

In [8]:
favbooks(204622,5)

Unnamed: 0,User-ID,ISBN,Book-Rating,Title
844954,204622,0967560500,10,"([Natural Hormonal Enhancement], [Rob Faigin])"
844934,204622,0671027360,10,"([Angels &amp; Demons], [Dan Brown])"
844925,204622,0385504209,10,"([The Da Vinci Code], [Dan Brown])"
844957,204622,097173660X,9,"([Life After School Explained], [Cap &amp; Com..."
844919,204622,0060935464,9,"([To Kill a Mockingbird], [Harper Lee])"


## Problem Statement- Based on the above top 5 books read by this user an we recommend some other books

In [9]:
rating_data.shape

(1031174, 3)

In [10]:
rating_data['ISBN'].value_counts().shape

(270170,)

In [11]:
rating_data['User-ID'].value_counts().shape

(92106,)

In [12]:
UsersISBN=rating_data['ISBN'].value_counts()
rating_data=rating_data[rating_data['ISBN'].isin(UsersISBN[UsersISBN>10].index)]

In [13]:
ISBNperUsers=rating_data['User-ID'].value_counts()
rating_data=rating_data[rating_data['User-ID'].isin(ISBNperUsers[ISBNperUsers>10].index)]
rating_data

Unnamed: 0,User-ID,ISBN,Book-Rating
172,276847,0446364193,0
174,276847,3379015180,0
176,276847,3404148576,8
178,276847,3423071516,10
185,276847,3442413508,10
188,276847,3442437717,7
191,276847,3442444020,8
192,276847,3442446414,10
193,276847,3442448530,7
209,276847,3551551677,10


In [14]:
Rating_matrix=pd.pivot_table(rating_data,index='User-ID',columns='ISBN',values='Book-Rating')

In [15]:
User1=204622
User2=255489
User1Rating=Rating_matrix.transpose()[User1]
User2Rating=Rating_matrix.transpose()[User2]

In [16]:
User1Rating[User1Rating.notnull()]

ISBN
006016848X     0.0
0060935464     9.0
0140042598     0.0
0140178724     0.0
0142004278     4.0
0380732238     0.0
0385504209    10.0
0425109720     0.0
0425152898     0.0
0440136482     0.0
0440241162     0.0
0451191145     0.0
0451197127     0.0
0553096060     6.0
0671027360    10.0
0671027387     0.0
0671666258     8.0
0688174574     0.0
0743225708     0.0
076790592X     0.0
0785264280     0.0
0786868716     7.0
0802131867     0.0
0802132952     0.0
0971880107     0.0
1853260045     0.0
1853260126     0.0
1853260207     0.0
185326041X     0.0
1878424114     9.0
Name: 204622, dtype: float64

In [17]:
hamming(User1Rating,User2Rating)

0.9999352373550936

In [18]:
def computedistance(User1,User2):
    try:
        User1Rating=Rating_matrix.T[User1]
        User2Rating=Rating_matrix.T[User2]
        distance=hamming(User1,User2)
    except:
        distance=np.NaN
    return distance

In [19]:
computedistance(204622,10118)

1.0

In [20]:
active_user=204622
all_users=pd.DataFrame(Rating_matrix.index)
all_users=all_users[all_users['User-ID']!=active_user]

In [21]:
all_users['Distance']=all_users['User-ID'].apply(lambda x: computedistance(active_user,x))

In [22]:
all_users.head()

Unnamed: 0,User-ID,Distance
0,243,1.0
1,254,1.0
2,383,1.0
3,388,1.0
4,424,1.0


In [23]:
K=10
KnearestUsers=all_users.sort_values('Distance',ascending=False)[:K]
KnearestUsers

Unnamed: 0,User-ID,Distance
0,243,1.0
4300,187256,1.0
4310,187624,1.0
4309,187613,1.0
4308,187606,1.0
4307,187598,1.0
4306,187574,1.0
4305,187520,1.0
4304,187517,1.0
4303,187425,1.0


In [24]:
def Knearest(activeuser,K):
    alluser=pd.DataFrame(Rating_matrix.index)
    alluser=alluser[alluser['User-ID']!=activeuser]
    alluser['distance']=alluser['User-ID'].apply(lambda x:computedistance(activeuser,x))
    kusers=alluser.sort_values('distance',ascending=False)['User-ID'][:K]
    return kusers

In [25]:
kusers=Knearest(204622,10)
kusers

0          243
4300    187256
4310    187624
4309    187613
4308    187606
4307    187598
4306    187574
4305    187520
4304    187517
4303    187425
Name: User-ID, dtype: int64

In [26]:
KnnRating=Rating_matrix[Rating_matrix.index.isin(kusers)]
KnnRating

ISBN,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
243,,,,,,,,,,,...,,,,,,,,,,
187256,,,,,,,,,,,...,,,,,,,,,,
187425,,,,,,,,,,,...,,,,,,,,,,
187517,,,,,,,,,,,...,,,,,,,,,,
187520,,,,,,,,,,,...,,,,,,,,,,
187574,,,,,,,,,,,...,,,,,,,,,,
187598,,,,,,,,,,,...,,,,,,,,,,
187606,,,,,,,,,,,...,,,,,,,,,,
187613,,,,,,,,,,,...,,,,,,,,,,
187624,,,,,,,,,,,...,,,,,,,,,,


In [27]:
avgrating=KnnRating.apply(np.nanmean).dropna()

  labels=labels)


In [28]:
avgrating

ISBN
0006550789     9.0
0007154615     0.0
0060011912     7.0
0060083948     5.0
0060096195     0.0
0060175400     0.0
0060176075     3.0
0060191929     0.0
006019491X     0.0
0060512377     7.0
0060557818     9.0
0060560754     7.0
0060808934    10.0
0060914653     0.0
0060915544    10.0
0060921145     8.5
0060924349     0.0
0060928336     9.0
006092988X     0.0
0060932759     7.0
0060938455     9.0
0060977493     7.0
0061000299     0.0
0061012882     0.0
006101351X     0.0
006101432X     0.0
0061020648     0.0
0061020680     0.0
0061031445     0.0
0061032476     5.0
              ... 
1551669234     9.0
1551669412     9.0
1551669625     0.0
1557732647     0.0
1558532854    10.0
155874262X     5.0
1558743669     0.0
1558745726     9.0
1558746161     0.0
1560252758     0.0
156402976X    10.0
1565075722     0.0
1565122968     8.0
156836010X     0.0
1579546463     0.0
1853260320     0.0
1853811262     0.0
1860492592     0.0
1860499627     9.0
202025462X    10.0
2070378411     8.0
2253049

In [29]:
BooksAlreadyRead=Rating_matrix.T[active_user].dropna().index
BooksAlreadyRead

Index(['006016848X', '0060935464', '0140042598', '0140178724', '0142004278',
       '0380732238', '0385504209', '0425109720', '0425152898', '0440136482',
       '0440241162', '0451191145', '0451197127', '0553096060', '0671027360',
       '0671027387', '0671666258', '0688174574', '0743225708', '076790592X',
       '0785264280', '0786868716', '0802131867', '0802132952', '0971880107',
       '1853260045', '1853260126', '1853260207', '185326041X', '1878424114'],
      dtype='object', name='ISBN')

In [30]:
avgrating=avgrating[~avgrating.index.isin(BooksAlreadyRead)]

In [31]:
N=3
topISBNS=avgrating.sort_values(ascending=False).index[:N]

In [32]:
topISBNS

Index(['0439136350', '0394580567', '034541389X'], dtype='object', name='ISBN')

In [33]:
RecommendedBooks=pd.Series(topISBNS).apply(booksmetadata)
RecommendedBooks

0    ([Harry Potter and the Prisoner of Azkaban (Bo...
1    ([It Was on Fire When I Lay Down on It], [Robe...
2            ([Flesh and Blood], [Jonathan Kellerman])
Name: ISBN, dtype: object

In [34]:
def toprecommended(active_user,K):
    kusers=Knearest(active_user,K)
    KnnRating=Rating_matrix[Rating_matrix.index.isin(kusers)]
    avgrating=KnnRating.apply(np.nanmean).dropna()
    BooksAlreadyRead=Rating_matrix.T[active_user].dropna().index
    avgrating=avgrating[~avgrating.index.isin(BooksAlreadyRead)]
    topISBNS=avgrating.sort_values(ascending=False).index[:N]
    RecommendedBooks=pd.Series(topISBNS).apply(booksmetadata)
    #RecommendedBooks=RecommendedBooks.append(avgrating[:N])
    return RecommendedBooks

In [35]:
favbooks(204813,10)

Unnamed: 0,User-ID,ISBN,Book-Rating,Title
845416,204813,399149848,10,"([Birthright], [Nora Roberts])"
845406,204813,385504209,10,"([The Da Vinci Code], [Dan Brown])"
845381,204813,373218036,10,"([Truly, Madly Manhattan], [Nora Roberts])"
845358,204813,142001805,10,"([The Eyre Affair: A Novel], [Jasper Fforde])"
845430,204813,446527793,10,"([The Guardian], [Nicholas Sparks])"
845415,204813,399149392,10,([Chesapeake Blue (Quinn Brothers (Hardcover))...
845431,204813,446531332,9,"([Nights in Rodanthe], [Nicholas Sparks])"
845433,204813,446606243,9,"([The Tenth Justice], [Brad Meltzer])"
845450,204813,671027360,9,"([Angels &amp; Demons], [Dan Brown])"
845432,204813,446532452,9,"([The Wedding], [Nicholas Sparks])"


In [36]:
toprecommended(204813,10)

  labels=labels)


0             ([Memoirs of a Geisha], [Arthur Golden])
1    ([The Autobiography of Foudini M. Cat], [Susan...
2                 ([The Color Purple], [Alice Walker])
Name: ISBN, dtype: object

## Recommending Products based on Latent Factor Model

In [37]:
rating_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 379317 entries, 172 to 1149745
Data columns (total 3 columns):
User-ID        379317 non-null int64
ISBN           379317 non-null object
Book-Rating    379317 non-null int64
dtypes: int64(2), object(1)
memory usage: 11.6+ MB


In [38]:
from scipy.sparse import coo_matrix
rating_data['User-ID']=rating_data['User-ID'].astype("category")
rating_data['ISBN']=rating_data['ISBN'].astype("category")
R=coo_matrix((rating_data['Book-Rating'].astype(float),(rating_data['User-ID'].cat.codes.copy(),rating_data['ISBN'].cat.codes.copy())))

In [39]:
R.shape

(6457, 15441)

In [40]:
R.data[0]

0.0

In [41]:
R.row[0]

6414

In [42]:
R.col[0]

6941

In [43]:
M,N=R.shape
K=3

In [44]:
P=np.random.rand(M,K)
Q=np.random.rand(K,N)

In [45]:
def error(R,P,Q,lamda=0.01):
    ratings=R.data
    col=R.col
    row=R.row
    e=0
    for ui in range(len(ratings)):
        rui=ratings[ui]
        u=row[ui]
        i=col[ui]
        if rui>0:
            e=e+pow(rui-np.dot(P[u,:],Q[:,i]),2)+\
            lamda*(pow(norm(P[u,:]),2)+pow(norm(Q[:,i]),2))
    return e

In [46]:
#error(R,P,Q)

In [48]:
#rmse=np.sqrt(error(R,P,Q)/len(R.data))

In [51]:
def SGD(R,K,lamda=0.02,gamma=0.001,steps=10):
    M,N=R.shape
    P=np.random.rand(M,K)
    Q=np.random.rand(K,N)
    rmse=np.sqrt(error(R,P,Q)/len(R.data))
    print('Initial RMSE'+str(rmse))
    for j in xrange(steps):
        for i in range(len(R.data)):
            rui=R.data[i]
            u=R.row[i]
            i=R.col[i]
            if rui>0:
                eui=rui-np.dot(P[u:],Q[:i])
                P[u,:]=P[u:]+gamma*2*(eui*Q[:,i]-lamda*P[u,:])
                Q[:,i]=Q[:,i]+gamma*2*(eui*P[u,:]-lamda*Q[:,i])
        rmse=np.sqrt(error(R,P,Q)/len(R.data))
        if rmse<0.5:
            break
        print("RMSE after optimization"+str(rmse))
        return P,Q

In [None]:
#,Q=SGD(R,K=2,gamma=0.001,lamda=0.01,steps=20)