In [1]:
import pandas as pd #load data into table formats(data frames)
datafile='BX-CSV-Dump\\BX-Book-Ratings.csv'
data= pd.read_csv(datafile,sep=";",encoding = "ISO-8859-1",header=0,names=["user","isbn","rating"])

In [2]:
data.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [3]:
bookfile='BX-CSV-Dump\\BX-Books.csv'
#error_bad_lines : ignores any rows that have error: in this case rows with more columns than expected
#index_col : col names should be taken from row 0 isbn col
books = pd.read_csv(bookfile,encoding = "ISO-8859-1",sep=";",header=0,error_bad_lines=False,usecols=[0,1,2],index_col=0,names=["isbn","title","author"])

In [4]:
books.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


# Func to get title and author for a given isbn

In [5]:
def bookMeta(isbn):
    title=books.at[isbn,"title"]
    author=books.at[isbn,"author"]
    return title, author

In [6]:
bookMeta("0060973129")

('Decision in Normandy', "Carlo D'Este")

# Func to get favorite books of a user 

In [7]:
def favBooks(user, N):
    #get all data related to current user
    userRatings=data[data["user"]==user]
    #sort the current users ratings in descending order and pick top N rated books
    sortedRatings = pd.DataFrame.sort_values(userRatings,['rating'], ascending=0)[:N]
    #add the title column to the N sorted highly rated books
    sortedRatings["title"]=sortedRatings["isbn"].apply(bookMeta)
    return sortedRatings

In [8]:
favBooks(276729, 10)

Unnamed: 0,user,isbn,rating,title
4,276729,0521795028,6,(The Amsterdam Connection : Level 4 (Cambridge...
3,276729,052165615X,3,"(Help!: Level 1, Philip Prowse)"


In [9]:
#Some isbns may be in the rating dataframe but not in the book metadata, so reduce data to only isbns also present in books metadata dataframe
data = data[data["isbn"].isin(books.index)]

In [10]:
favBooks(204622, 10)

Unnamed: 0,user,isbn,rating,title
844955,204622,0967560500,10,"(Natural Hormonal Enhancement, Rob Faigin)"
844935,204622,0671027360,10,"(Angels &amp; Demons, Dan Brown)"
844926,204622,0385504209,10,"(The Da Vinci Code, Dan Brown)"
844958,204622,097173660X,9,"(Life After School Explained, Cap &amp; Compass)"
844920,204622,0060935464,9,"(To Kill a Mockingbird, Harper Lee)"
844966,204622,1878424114,9,(The Seven Spiritual Laws of Success: A Practi...
844937,204622,0671666258,8,"(American Star, Jackie Collins)"
844945,204622,0786868716,7,"(The Five People You Meet in Heaven, Mitch Albom)"
844941,204622,0744001943,7,(Grand Theft Auto: Vice City Official Strategy...
844934,204622,0553096060,6,"(Sein Language, JERRY SEINFELD)"


# Creating a rating matrix

In [11]:
data.shape

(1031175, 3)

In [12]:
usersPerISBN =data.isbn.value_counts()
usersPerISBN.head(10)

0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
044023722X     647
0142001740     615
067976402X     614
0671027360     586
0446672211     585
Name: isbn, dtype: int64

In [13]:
#number of distinct ISBNs
usersPerISBN.shape

(270170,)

In [14]:
ISBMsPerUSer=data.user.value_counts()
ISBMsPerUSer.head(10)

11676     11144
198711     6456
153662     5814
98391      5779
35859      5646
212898     4290
278418     3996
76352      3329
110973     2971
235105     2943
Name: user, dtype: int64

In [15]:
#number of distinct users
ISBMsPerUSer.shape

(92107,)

In [16]:
#Rating matrix with all this data will be very sparse so we reduce this by only considering books that have been read by more than 10 users
data=data[data["isbn"].isin(usersPerISBN[usersPerISBN>100].index)]

In [17]:
#Rating matrix with all this data will be very sparse so we reduce this by only considering users that have read more than 10 books
data=data[data["user"].isin(ISBMsPerUSer[ISBMsPerUSer>100].index)]

In [18]:
data.shape

(63669, 3)

## convert the data frame into rating table using pandas pivot

In [19]:
userItemRatingMatrix=pd.pivot_table(data, values='rating',index=['user'], columns=['isbn'])

In [55]:
userItemRatingMatrix.head(10)

isbn,002542730X,0060008032,0060096195,006016848X,0060173289,0060175400,006019491X,0060199652,0060391626,0060392452,...,1558744630,1558745157,1559029838,1573225517,1573225789,1573227331,1573229326,1573229571,1592400876,1878424319
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254,,,,,,,,,,,...,,,,,,,,,,
507,,,,,,,,,,0.0,...,,,,,,,,,,
882,,,,,,,,,,,...,,,,,,,,,,
1424,,,,,,,,,,,...,,,,,,,,,,
1435,,,,,,,,,,,...,,,,,,,,,,
1733,,,,,,,,,,,...,,,,,,,,,,
1903,,,,,,,,,,0.0,...,,,,,,,,,0.0,
2033,,,,,,,,,,,...,,,,,,,,,,
2110,,,,,,,,,,,...,,,,,,,,,,
2276,,,,,,,,,,,...,,,,,,,,,,


# Compute distance between a pair of users

In [56]:
u1=1424
u2=2276

In [57]:
user1Ratings=userItemRatingMatrix.transpose()[u1]

In [58]:
user2Ratings=userItemRatingMatrix.transpose()[u2]

In [59]:
from scipy.spatial.distance import hamming
hamming(user1Ratings, user2Ratings)

0.99860529986053

In [60]:
import numpy as np
def distance(user1, user2):
    try:
        user1Ratings=userItemRatingMatrix.transpose()[u1]
        user2Ratings=userItemRatingMatrix.transpose()[u2]
        distance = hamming(user1Ratings, user2Ratings)
    except:
        distance = np.NaN
    return distance

In [61]:
distance(204622, 10118)

0.99860529986053

In [66]:
user = 1424
allUsers = pd.DataFrame(userItemRatingMatrix.index)
allUsers = allUsers[allUsers.user!=user]
allUsers.head(6)

Unnamed: 0,user
0,254
1,507
2,882
4,1435
5,1733
6,1903


In [68]:
# add a col distance that says distance between active user and curr user
allUsers["distance"]=allUsers["user"].apply(lambda x: distance(user, x))
allUsers.head(6)

Unnamed: 0,user,distance
0,254,0.998605
1,507,0.998605
2,882,0.998605
4,1435,0.998605
5,1733,0.998605
6,1903,0.998605


In [70]:
#Find K nearest neigbors
K = 10
KnearestUsers = allUsers.sort_values(["distance"], ascending=True)["user"][:K]
KnearestUsers

0          254
1083    187145
1082    187065
1081    186570
1080    186547
1079    186039
1078    185771
1077    185634
1076    185384
1075    185233
Name: user, dtype: int64

## K nearest neighbors  

In [71]:
def nearestNeighbors(user, K=10):
    allUsers = pd.DataFrame(userItemRatingMatrix.index)
    allUsers = allUsers[allUsers.user!=user]
    allUsers["distance"]=allUsers["user"].apply(lambda x: distance(user, x))
    KnearestUsers = allUsers.sort_values(["distance"], ascending=True)["user"][:K]
    return KnearestUsers

In [72]:
nearestNeighbors(1424)

0          254
1083    187145
1082    187065
1081    186570
1080    186547
1079    186039
1078    185771
1077    185634
1076    185384
1075    185233
Name: user, dtype: int64

# Find Top N Recommendations

In [73]:
#get ratings of nearest neighbors for all books
NNRatings = userItemRatingMatrix[userItemRatingMatrix.index.isin(KnearestUsers)]
NNRatings

isbn,002542730X,0060008032,0060096195,006016848X,0060173289,0060175400,006019491X,0060199652,0060391626,0060392452,...,1558744630,1558745157,1559029838,1573225517,1573225789,1573227331,1573229326,1573229571,1592400876,1878424319
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254,,,,,,,,,,,...,,,,,,,,,,
185233,0.0,,,,,,,,0.0,,...,0.0,0.0,0.0,,,,,,,
185384,,,,,,,,,,,...,,,0.0,,,,,,,
185634,,,,,,,,,,,...,,,,,,,,,,
185771,,,,,,,,,,,...,,,,,,,,,,
186039,,0.0,0.0,,,,,,,,...,,,,,,,,,,
186547,,,,,,,,,,,...,,,0.0,,,,,,,
186570,,,,,,,,,,,...,,,0.0,,,,,,,
187065,,,,,,,,,,,...,,,,,,,,,,
187145,,,,,,,,,,,...,,,,,,,,,,


In [74]:
#compute avg ratings excluding NaN for each isbn
avgRating = NNRatings.apply(np.nanmean).dropna()
avgRating.head()

  labels=labels)


isbn
002542730X    0.0
0060008032    0.0
0060096195    0.0
0060391626    0.0
0060502258    0.0
dtype: float64

In [75]:
booksAlreadyRead = userItemRatingMatrix.transpose()[user].dropna().index
booksAlreadyRead

Index(['0060930535', '0061015725', '0061097314', '0099771519', '0142001430',
       '0385720106', '0425158616', '0440222656', '0440235162', '0440236673',
       '0446608815', '0452282152', '0743224574', '0767902521', '0786868716',
       '0786885688', '0804106304'],
      dtype='object', name='isbn')

In [76]:
#get averge ratings of only books not already read by active user
avgRating = avgRating[~avgRating.index.isin(booksAlreadyRead)]

In [77]:
N=3
topNISBNS = avgRating.sort_values(ascending=False).index[:N]
topNISBNS

Index(['0451169530', '0451524934', '0374129983'], dtype='object', name='isbn')

In [78]:
pd.Series(topNISBNS).apply(bookMeta)

0    (The Stand: Complete and Uncut, Stephen King)
1                            (1984, George Orwell)
2              (The Corrections, Jonathan Franzen)
Name: isbn, dtype: object

In [79]:
def topN(user, N=3):
    NNRatings = userItemRatingMatrix[userItemRatingMatrix.index.isin(KnearestUsers)]
    avgRating = NNRatings.apply(np.nanmean).dropna()
    booksAlreadyRead = userItemRatingMatrix.transpose()[user].dropna().index
    avgRating = avgRating[~avgRating.index.isin(booksAlreadyRead)]
    topNISBNS = avgRating.sort_values(ascending=False).index[:N]
    return pd.Series(topNISBNS).apply(bookMeta)

In [81]:
favBooks(186570,5)

Unnamed: 0,user,isbn,rating,title
770601,186570,0316666343,7,"(The Lovely Bones: A Novel, Alice Sebold)"
770822,186570,0446364800,7,"(The General's Daughter, Nelson DeMille)"
770630,186570,0345378482,6,"(The Andromeda Strain, MICHAEL CRICHTON)"
770834,186570,0446607711,4,"(The Simple Truth, David Baldacci)"
770634,186570,034538475X,4,(The Tale of the Body Thief (Vampire Chronicle...


In [82]:
topN(186570)

  labels=labels)


0    (The Stand: Complete and Uncut, Stephen King)
1                (Jurassic Park, Michael Crichton)
2                            (1984, George Orwell)
Name: isbn, dtype: object