In [1]:
# import libraries
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
# Input CSV file names. They are relative paths, so they resolve against the
# kernel's current working directory.
books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

In [3]:
# Load both CSV files into dataframes. Each file is ';'-separated and
# ISO-8859-1 encoded; header=0 together with `names` replaces the file's own
# header row with the short column names given here.
df_books = pd.read_csv(
    books_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    usecols=['isbn', 'title', 'author'],
    names=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'},
)

df_ratings = pd.read_csv(
    ratings_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    usecols=['user', 'isbn', 'rating'],
    names=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'},
)

In [4]:
# Preview the first rows of the books table.
df_books.head()

Unnamed: 0,isbn,title,author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [5]:
# Preview the first rows of the ratings table.
df_ratings.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


In [6]:
# Read a single cell by row label and column name in one .loc call.
# The earlier form, df_ratings.loc[4, :][2], first materialised the whole row
# and then indexed that label-indexed Series by integer position through [],
# which pandas deprecated in 2.1 (FutureWarning). 'rating' is the column that
# sat at position 2.
df_ratings.loc[4, 'rating']

6.0

In [7]:
# Read a single cell purely by position: row 4, column 2.
df_ratings.iloc[4, 2]

6.0

In [8]:
# Count missing values per column in both tables.
# A cell only echoes its last expression, so the books counts are rendered
# explicitly with display(); before, that result was computed and thrown away.
display(df_books.isnull().sum())
df_ratings.isnull().sum()

user      0
isbn      0
rating    0
dtype: int64

In [9]:
# Discard every row that contains at least one missing value, in both tables.
df_books = df_books.dropna(axis=0, how='any')
df_ratings = df_ratings.dropna(axis=0, how='any')

In [10]:
# Count ratings per user, then split the user ids at the 200-ratings mark:
# fewer than 200 -> invalid_users, 200 or more -> valid_users.
user_ratings = df_ratings['user'].value_counts()
invalid_users = set(user_ratings.loc[lambda counts: counts < 200].index)
valid_users = set(user_ratings.loc[lambda counts: counts >= 200].index)

In [11]:
# Ratings made by users in valid_users (selection by allow-list).
df_ratings.loc[df_ratings['user'].isin(valid_users)]

Unnamed: 0,user,isbn,rating
1456,277427,002542730X,10.0
1457,277427,0026217457,0.0
1458,277427,003008685X,8.0
1459,277427,0030615321,0.0
1460,277427,0060002050,0.0
...,...,...,...
1147612,275970,3829021860,0.0
1147613,275970,4770019572,0.0
1147614,275970,896086097,0.0
1147615,275970,9626340762,8.0


In [12]:
# Ratings made by users that are NOT in invalid_users (selection by exclusion).
df_ratings.loc[~df_ratings['user'].isin(invalid_users)]

Unnamed: 0,user,isbn,rating
1456,277427,002542730X,10.0
1457,277427,0026217457,0.0
1458,277427,003008685X,8.0
1459,277427,0030615321,0.0
1460,277427,0060002050,0.0
...,...,...,...
1147612,275970,3829021860,0.0
1147613,275970,4770019572,0.0
1147614,275970,896086097,0.0
1147615,275970,9626340762,8.0


In [13]:
# keep only the ratings made by users who have at least 200 ratings
df_valid = df_ratings[~df_ratings['user'].isin(invalid_users)]

In [14]:
# (rows, columns) remaining after the user filter.
df_valid.shape

(527556, 3)

In [15]:
# Count ratings per ISBN and split the ISBNs at the 100-ratings mark:
# fewer than 100 -> invalid_books, 100 or more -> valid_books.
# The counts are taken over all of df_ratings, not over the user-filtered df_valid.
book_ratings = df_ratings['isbn'].value_counts()
invalid_books = set(book_ratings.loc[lambda counts: counts < 100].index)
valid_books = set(book_ratings.loc[lambda counts: counts >= 100].index)

In [16]:
# Catalogue entries whose ISBN is in valid_books.
df_books.loc[df_books['isbn'].isin(valid_books)]

Unnamed: 0,isbn,title,author
18,0440234743,The Testament,John Grisham
19,0452264464,Beloved (Plume Contemporary Fiction),Toni Morrison
26,0971880107,Wild Animus,Rich Shapero
27,0345402871,Airframe,Michael Crichton
28,0345417623,Timeline,MICHAEL CRICHTON
...,...,...,...
28072,0425178765,Easy Prey,John Sandford
29215,0449223604,M Is for Malice,Sue Grafton
30535,0345444884,The Talisman,STEPHEN KING
30775,0060008032,Angels,Marian Keyes


In [17]:
# Look up one catalogue entry by its ISBN (compared as a string).
df_books[df_books['isbn'] == "0130897930"]

Unnamed: 0,isbn,title,author
271361,130897930,Core Web Programming (2nd Edition),Marty Hall


In [18]:
# Count, per column, the ratings rows recorded for that ISBN.
df_ratings[df_ratings['isbn'] == "0130897930"].count()

user      0
isbn      0
rating    0
dtype: int64

In [19]:
# keep only the ratings of books that have at least 100 ratings
df_valid = df_valid[df_valid['isbn'].isin(valid_books)]

In [20]:
# (rows, columns) remaining after both the user and the book filter.
df_valid.shape

(49781, 3)

In [21]:
# prepare dataset
# NumPy array view of the filtered ratings, one row per rating (user, isbn, rating).
# NOTE(review): valid_arr is not referenced by any later cell visible here —
# confirm it is still needed.
valid_arr = df_valid.to_numpy()

In [22]:
# Build the book x user rating matrix: pivot users into rows and ISBNs into
# columns, fill user/book pairs that have no rating with -1, then transpose so
# that every row describes one book.
df_table = df_valid.pivot_table(
    values='rating',
    index='user',
    columns='isbn',
    fill_value=-1,
).transpose()
print(df_table.shape)
df_table.head()

(731, 888)


user,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002542730X,-1,-1,-1,-1,0,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,10,-1,-1,-1
0060008032,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
0060096195,-1,-1,-1,-1,0,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
006016848X,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,0,-1,-1,-1,-1,-1,-1,0
0060173289,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [23]:
# Replace the ISBN row labels of df_table with book titles.
# - Guarded on the index name so that re-running this cell is a no-op. Without
#   the guard a second run would look the titles up as if they were ISBNs and
#   overwrite every label with NaN.
# - The catalogue is de-duplicated on ISBN first: a repeated ISBN would add
#   rows in a join, so the new index would no longer match the table's length.
# ISBNs that are missing from df_books end up with a NaN label.
if df_table.index.name == 'isbn':
    isbn_to_title = df_books.drop_duplicates(subset='isbn').set_index('isbn')['title']
    df_table.index = df_table.index.map(isbn_to_title).rename('title')

In [24]:
# Order the rows by their title label (ascending) and preview the result.
df_table = df_table.sort_index(axis=0, ascending=True)
df_table.head()

user,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,0,-1,-1,-1,-1
1st to Die: A Novel,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1st to Die: A Novel,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2nd Chance,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,0,-1,-1,-1,-1,-1,-1
2nd Chance,-1,10,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,0,-1,-1,-1,-1,0,-1


In [30]:
# build model: brute-force nearest-neighbour search using cosine distance,
# fitted on the book rows of df_table; queries return 5 neighbours by default
neighbours = NearestNeighbors(
    algorithm='brute',
    metric='cosine',
    n_neighbors=5,
).fit(df_table)

In [31]:
# Query the fitted model with the very rows it was fitted on. As the `indices`
# output below shows, each row then comes back as its own first neighbour,
# which leaves 4 other books per row.
distances, indices = neighbours.kneighbors(df_table)    # every row matches itself first; skip column 0 when reading neighbours

In [72]:
# Row positions (into df_table) of the neighbours found for every book.
indices

array([[  0, 487,  49, 207, 481],
       [  1, 668, 277, 299, 124],
       [  2,  47,   4, 299, 696],
       ...,
       [728, 372, 124, 146, 321],
       [729, 697, 617,  51, 313],
       [730, 715, 119, 372, 196]], dtype=int64)

In [66]:
# Length of a single book row, i.e. the number of user columns.
df_table.iloc[0].shape

(888,)

In [71]:
# Rating vector of one book, selected by its title label.
df_table.loc['1984']

user
254       9
2276     -1
2766     -1
2977     -1
3363     -1
         ..
275970    0
277427   -1
277478   -1
277639   -1
278418   -1
Name: 1984, Length: 888, dtype: int64

In [84]:
# Nearest neighbours of one title. The book's rating row is wrapped in a list
# so that kneighbors receives a batch holding a single sample.
distance, indice = neighbours.kneighbors(
    [df_table.loc['The Queen of the Damned (Vampire Chronicles (Paperback))'].to_numpy()]
)
print(distance, indice, sep='\n')

[[0.         0.42742414 0.46431002 0.51854466 0.54880727]]
[[612 648 660 110 372]]


In [85]:
# Titles of the neighbours found above, in the order kneighbors returned them.
df_table.iloc[indice[0]].index.values

array(['The Queen of the Damned (Vampire Chronicles (Paperback))',
       'The Tale of the Body Thief (Vampire Chronicles (Paperback))',
       'The Vampire Lestat (Vampire Chronicles, Book II)', 'Catch 22',
       'Pleading Guilty'], dtype=object)

In [None]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
    """Return the books whose rating vectors are closest to that of `book`.

    Relies on two notebook-level objects: `df_table` (one row of ratings per
    book, labelled by title) and `neighbours` (the NearestNeighbors model
    fitted on `df_table`).

    Parameters
    ----------
    book : str
        Title to look up in the index of `df_table`. When several rows carry
        the same title, the first one is used as the query.

    Returns
    -------
    list
        ``[book, [[title, distance], ...]]`` with at most 5 entries. The query
        row itself is left out and the entries run from the most distant
        neighbour to the closest one.
        NOTE(review): this layout and ordering are a choice made here, not
        something the notebook states — confirm against the test that will
        exercise this function.

    Raises
    ------
    KeyError
        If no row of `df_table` is labelled `book`.
    """
    n_recommendations = 5

    # Titles are not unique in df_table, so resolve the title to row positions
    # instead of using .loc, which would return several rows for a duplicate.
    positions = np.flatnonzero(df_table.index == book)
    if positions.size == 0:
        raise KeyError(f"Book not found in the ratings table: {book!r}")
    query_pos = int(positions[0])

    # kneighbors expects a 2-D batch; ask for one extra neighbour because the
    # query row is part of the fitted data and normally matches itself first.
    query = df_table.iloc[query_pos].to_numpy().reshape(1, -1)
    n_neighbors = min(n_recommendations + 1, len(df_table))
    distances, indices = neighbours.kneighbors(query, n_neighbors=n_neighbors)

    # Drop the query row by position rather than by "first result", since a row
    # with identical ratings could be returned ahead of it.
    nearest_first = [
        [df_table.index[idx], float(dist)]
        for dist, idx in zip(distances[0], indices[0])
        if idx != query_pos
    ][:n_recommendations]

    recommended_books = [book, nearest_first[::-1]]
    return recommended_books