In [1]:
import os

In [2]:
os.getcwd()

'/Users/admin'

In [4]:
os.chdir('/Users/admin/documents/git/recommender_systems/BX-CSV-Dump')

In [5]:
os.getcwd()

'/Users/admin/Documents/GIT/Recommender_Systems/BX-CSV-Dump'

In [6]:
import pandas as pd
import numpy as np
import scipy as sc

In [22]:
# Load the user -> book ratings. The file's own header row (row 0) is discarded
# and replaced by the short column names given in `names`.
data = pd.read_csv(
    'BX-Book-Ratings.csv',
    sep=';',
    header=0,
    names=['user', 'isbn', 'rating'],
    encoding='latin1',
)

In [23]:
data.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [26]:
# Load book metadata: only the first three columns of the file are read
# (renamed to isbn, title, author) and the ISBN column becomes the row index,
# so a book can be looked up directly by its ISBN.
# NOTE(review): error_bad_lines=False skips malformed rows instead of raising.
# The argument was deprecated in pandas 1.3 and removed in 2.0, where
# on_bad_lines='skip' is the replacement -- confirm the pandas version before re-running.
books = pd.read_csv('BX-Books.csv', sep = ';', header=0, names = ['isbn', 'title', 'author'], encoding = "latin1", \
                    error_bad_lines=False, usecols = [0,1,2], index_col = 0)

In [27]:
books.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


## Setup a function to print book metadata

In [49]:
books.loc['0195153448':'0393045218', ['title', 'author']]

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


In [52]:
books.iloc[0:6, [0,1]]

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber
399135782,The Kitchen God's Wife,Amy Tan


In [56]:
def bookMeta(isbn):
    """Look up a book in the ISBN-indexed `books` table.

    Parameters
    ----------
    isbn : label
        Index label of the book in `books`.

    Returns
    -------
    tuple
        ``(title, author)`` for the given ISBN.

    Raises
    ------
    KeyError
        If `isbn` is not present in the index of `books`.
    """
    # One scalar lookup per field, returned in (title, author) order.
    return tuple(books.at[isbn, field] for field in ('title', 'author'))

In [60]:
bookMeta('0195153448')

('Classical Mythology', 'Mark P. O. Morford')

In [61]:
books.loc['0002005018', ['author', 'title']]

author    Richard Bruce Wright
title             Clara Callan
Name: 0002005018, dtype: object

## Setup a function to find the favorite books of a user

In [66]:
def favBooks(user, N):
    """Return the N highest-rated books of a user, with book metadata attached.

    Parameters
    ----------
    user : int
        User id to match against the ``user`` column of `data`.
    N : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The user's rows from `data`, sorted by ``rating`` in descending order
        and truncated to the first N rows, plus a ``title`` column holding the
        ``(title, author)`` tuple returned by `bookMeta` for each ISBN.
        Empty if the user has no ratings.

    Raises
    ------
    KeyError
        Propagated from `bookMeta` when one of the selected ISBNs is unknown.
    """
    userratings = data[data['user'] == user]
    topratings = userratings.sort_values('rating', ascending=False).iloc[:N]
    # .assign builds a new frame, so nothing is written onto a slice of `data`
    # (the original in-place column assignment triggered SettingWithCopyWarning).
    return topratings.assign(title=topratings['isbn'].apply(bookMeta))
    

In [82]:
favBooks(276729, 5).iloc[0,3]

('The Amsterdam Connection : Level 4 (Cambridge English Readers)',
 'Sue Leather')

In [69]:
data['user'].nunique()

105283

In [71]:
data['user'].value_counts().head()

11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
Name: user, dtype: int64

In [75]:
pd.crosstab(index=data['user'], columns = 'count').head()

col_0,count
user,Unnamed: 1_level_1
2,1
7,1
8,18
9,3
10,2


In [76]:
favBooks(11676, 5)

Unnamed: 0,user,isbn,rating,title
51016,11676,515090204,10,"(The Berets (Brotherhood of War Book V), W. E...."
54442,11676,812028570,10,"(The New Dog Handbook, H. J. Ullman)"
55426,11676,1551520826,10,"(Luck of the Draw, Chris Gudgeon)"
47006,11676,312868278,10,"(Dark Cities Underground, Lisa Goldstein)"
52648,11676,671722565,10,"(AS YOU LIKE IT, William Shakespeare)"


## Ensure that the ISBNs match between the books table and the user-ratings table

In [85]:
data = data[data['isbn'].isin(books.index)]

In [86]:
favBooks(204622, 5)

Unnamed: 0,user,isbn,rating,title
844955,204622,0967560500,10,"(Natural Hormonal Enhancement, Rob Faigin)"
844935,204622,0671027360,10,"(Angels &amp; Demons, Dan Brown)"
844926,204622,0385504209,10,"(The Da Vinci Code, Dan Brown)"
844958,204622,097173660X,9,"(Life After School Explained, Cap &amp; Compass)"
844920,204622,0060935464,9,"(To Kill a Mockingbird, Harper Lee)"


## User 204622 likes the top 5 books above. Can we now recommend new books to him/her?

In [87]:
# User-rating data set
data.shape

(1031175, 3)

## We need to create the rating matrix (Users on rows and product/isbn along cols & rating values in cells)

In [92]:
# Get Distinct # of ISBNs and the number of ratings per ISBN:
usersPerISBN = data.isbn.value_counts()
usersPerISBN.head()

0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
Name: isbn, dtype: int64

In [95]:
usersPerISBN.shape

((270170,), 1031175)

In [96]:
# Get distinct users and num of ratings they have given/books read:
ISBNsPerUser = data.user.value_counts()

In [97]:
ISBNsPerUser.head()

11676     11144
198711     6456
153662     5814
98391      5779
35859      5646
Name: user, dtype: int64

In [98]:
ISBNsPerUser.shape

(92107,)

## To make the matrix less sparse, let's reduce the data to include only books rated by more than 10 users
## and users who have rated more than 10 books

In [101]:
# Keep only the ratings whose book was rated by more than 10 users (strict `>`).
data = data[data['isbn'].isin(usersPerISBN[usersPerISBN > 10].index)]

In [112]:
# Keep only the ratings from users who have more than 10 ratings (strict `>`).
# NOTE(review): ISBNsPerUser was computed before the ISBN filter was applied to
# `data`, so these per-user counts are pre-filter -- confirm that is intended.
data = data[data['user'].isin(ISBNsPerUser[ISBNsPerUser > 10].index)]

## Pivot the data to create a matrix

In [114]:
userItemRatingMatrix = pd.pivot_table(data, values = 'rating', index = 'user', columns = 'isbn')

In [115]:
userItemRatingMatrix.head()

isbn,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,5.0,,,,,,,,,,...,,,,,,,,,,
99,,,,,,,,,,,...,,,,,,,,,,
242,,,,,,,,,,,...,,,,,,,,,,
243,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,


## Let's find the K nearest neighbors of a given user using the Hamming distance

In [123]:
user1 = 204622
user2 = 255489

In [121]:
userItemRatingMatrix.T.head(3)

user,8,99,242,243,254,383,388,408,424,446,...,278522,278535,278554,278563,278582,278633,278637,278771,278843,278851
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005018,5.0,,,,,,,,,,...,,,,,,,,,,
2251760,,,,,,,,,,,...,,,,,,,,,,
2259834,,,,,,,,,,,...,,,,,,,,,,


## `df[key]` selects columns rather than rows, so we transpose the matrix to pull out one user's ratings (`userItemRatingMatrix.loc[user]` would select the row directly)

In [125]:
user1Ratings = userItemRatingMatrix.T[user1]
user1Ratings.head()

isbn
0002005018   NaN
0002251760   NaN
0002259834   NaN
0002558122   NaN
0006480764   NaN
Name: 204622, dtype: float64

In [126]:
user2Ratings = userItemRatingMatrix.T[user2]

In [128]:
from scipy.spatial.distance import hamming
# Hamming distance: the fraction of positions at which the two users' rating vectors disagree.
# NOTE(review): positions where both users have NaN presumably count as a
# disagreement too (NaN != NaN), which would explain a value this close to 1 -- confirm.
hamming(user1Ratings, user2Ratings)

0.99993527926995018

## Let's create a function to calculate the distance b/w *"A PAIR of USERS"*

In [129]:
def distance(user1, user2):
    """Return the Hamming distance between the rating vectors of two users.

    Both users are looked up as rows of `userItemRatingMatrix`, and the
    result is the value of ``scipy.spatial.distance.hamming`` for those rows.

    Parameters
    ----------
    user1, user2 : int
        Row labels (user ids) in `userItemRatingMatrix`.

    Returns
    -------
    float
        The Hamming distance, or ``nan`` when it cannot be computed
        (for example when a user id is not present in the matrix).
    """
    try:
        # .loc reads the user's row directly; transposing the whole matrix on
        # every call just to index a column was needlessly expensive.
        user1Ratings = userItemRatingMatrix.loc[user1]
        user2Ratings = userItemRatingMatrix.loc[user2]
        return hamming(user1Ratings, user2Ratings)
    except Exception:
        # Best-effort fallback kept from the original, but no longer a bare
        # `except:` so KeyboardInterrupt/SystemExit still propagate.
        # np.nan replaces np.NaN, an alias that NumPy 2.0 removed.
        return np.nan

In [130]:
distance(204622, 10118)

0.99987055853990037

## Now set up a function that takes an active user and a number K and finds that user's K nearest neighbors...
> We can do this by computing the distance between the user and every other user, then sorting the distances in ascending order

In [137]:
user = 204622
allUsers = pd.DataFrame(userItemRatingMatrix.index)

In [138]:
allUsers.head()

Unnamed: 0,user
0,8
1,99
2,242
3,243
4,254


In [139]:
def nearestNeighbors(user, K=10):
    """Return the K users closest to `user` according to `distance`.

    Parameters
    ----------
    user : int
        The active user; excluded from the candidates.
    K : int, default 10
        Number of neighbours to return.

    Returns
    -------
    pandas.Series
        Series named ``user`` holding the ids of the K nearest users, ordered
        by ascending distance. Its index labels are the users' positions in
        ``userItemRatingMatrix.index``.
    """
    # Name the column explicitly so this works whatever the matrix index is
    # called (the original relied on the index being named 'user').
    candidates = pd.DataFrame({'user': userItemRatingMatrix.index.values})
    # .copy() so the new column is written to an independent frame rather than
    # to a filtered slice (avoids SettingWithCopyWarning).
    candidates = candidates.loc[candidates['user'] != user].copy()
    candidates['distance'] = candidates['user'].apply(lambda other: distance(user, other))
    ranked = candidates.sort_values('distance', ascending=True)
    return ranked['user'].iloc[:K]

In [140]:
KnearestUsers = nearestNeighbors(user)

In [143]:
KnearestUsers

3201     82893
3368     87555
2624     68555
1813     48046
5401    140036
7584    198711
565      16795
8866    232131
239       7346
9693    251422
Name: user, dtype: int64

## Now estimate the top recommendations using the ratings of these K nearest neighbors

In [154]:
def topN(user, N=7, K=10):
    """Recommend N books to `user` from the ratings of the K nearest users.

    Each book is scored by the mean rating given by the nearest neighbours
    (missing ratings are ignored). Books with no neighbour rating, and books
    the user has already rated, are left out.

    Parameters
    ----------
    user : int
        Row label of the active user in `userItemRatingMatrix`.
    N : int, default 7
        Number of recommendations to return.
    K : int, default 10
        Number of neighbours passed to `nearestNeighbors`.

    Returns
    -------
    pandas.Series
        ``(title, author)`` tuples from `bookMeta`, best score first.

    Raises
    ------
    KeyError
        If `user` is not a row of `userItemRatingMatrix`.
    """
    # Fail fast: the original only raised this KeyError after running the
    # full (expensive) neighbour search.
    if user not in userItemRatingMatrix.index:
        raise KeyError(user)

    KnearestUsers = nearestNeighbors(user, K)
    NNRatings = userItemRatingMatrix[userItemRatingMatrix.index.isin(KnearestUsers)]
    # Column-wise mean skipping NaN; all-NaN columns give NaN and are dropped.
    # Unlike apply(np.nanmean) this is vectorised and raises no
    # "Mean of empty slice" RuntimeWarning.
    avgRating = NNRatings.mean(axis=0).dropna()
    # .loc reads the user's row directly instead of transposing the matrix.
    booksAlreadyRead = userItemRatingMatrix.loc[user].dropna().index
    avgRating = avgRating[~avgRating.index.isin(booksAlreadyRead)]
    topNISBNs = avgRating.sort_values(ascending=False).index[:N]
    return pd.Series(topNISBNs).apply(bookMeta)

In [155]:
# Checking top 10 favorite books for user 204813
favBooks(204813, 10)

Unnamed: 0,user,isbn,rating,title
845417,204813,399149848,10,"(Birthright, Nora Roberts)"
845407,204813,385504209,10,"(The Da Vinci Code, Dan Brown)"
845382,204813,373218036,10,"(Truly, Madly Manhattan, Nora Roberts)"
845359,204813,142001805,10,"(The Eyre Affair: A Novel, Jasper Fforde)"
845431,204813,446527793,10,"(The Guardian, Nicholas Sparks)"
845416,204813,399149392,10,"(Chesapeake Blue (Quinn Brothers (Hardcover)),..."
845432,204813,446531332,9,"(Nights in Rodanthe, Nicholas Sparks)"
845434,204813,446606243,9,"(The Tenth Justice, Brad Meltzer)"
845451,204813,671027360,9,"(Angels &amp; Demons, Dan Brown)"
845433,204813,446532452,9,"(The Wedding, Nicholas Sparks)"


In [156]:
# Get recommendations for the same user
topN(204813)

  labels=labels)


0    (Waiting For Nick (Silhouette Special Edition)...
1           (Wringer (Trophy Newbery), Jerry Spinelli)
2    (The Star Wars Trilogy: Star Wars, the Empire ...
3          (One, Two, Buckle My Shoe, Agatha Christie)
4                          (On the Road, Jack Kerouac)
5                 (Dead Poets Society, N.H. Kleinbaum)
6     (Go Ask Alice (Avon/Flare Book), James Jennings)
Name: isbn, dtype: object