In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

books = pd.read_csv("Books.csv")
users = pd.read_csv("Users.csv")
ratings = pd.read_csv("Ratings.csv")

In [3]:
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [2]:
print(books.head())

         ISBN                                         Book-Title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

            Book-Author Year-Of-Publication                   Publisher  \
0    Mark P. O. Morford                2002     Oxford University Press   
1  Richard Bruce Wright                2001       HarperFlamingo Canada   
2          Carlo D'Este                1991             HarperPerennial   
3      Gina Bari Kolata                1999        Farrar Straus Giroux   
4       E. J. W. Barber                1999  W. W. Norton &amp; Company   

                                         Image-URL-S  \
0  http://images.amazon.com/images/P/0195153448.0...   
1  http://images.amazon.com/

In [3]:
print(users.head())

   User-ID                            Location   Age
0        1                  nyc, new york, usa   NaN
1        2           stockton, california, usa  18.0
2        3     moscow, yukon territory, russia   NaN
3        4           porto, v.n.gaia, portugal  17.0
4        5  farnborough, hants, united kingdom   NaN


In [4]:
print(ratings.head())

   User-ID        ISBN  Book-Rating
0   276725  034545104X            0
1   276726  0155061224            5
2   276727  0446520802            0
3   276729  052165615X            3
4   276729  0521795028            6


In [5]:
books = books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']]
books.rename(columns = {'Book-Title':'title', 'Book-Author':'author', 'Year-Of-Publication':'year', 'Publisher':'publisher'}, inplace=True)

In [6]:
users.rename(columns = {'User-ID':'user_id', 'Location':'location', 'Age':'age'}, inplace=True)
ratings.rename(columns = {'User-ID':'user_id', 'Book-Rating':'rating'}, inplace=True)

In [7]:
books.head()

Unnamed: 0,ISBN,title,author,year,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [8]:
ratings['user_id'].value_counts()

11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
116180        1
116166        1
116154        1
116137        1
276723        1
Name: user_id, Length: 105283, dtype: int64

# Extract users and ratings of more than 200

In [9]:
x = ratings['user_id'].value_counts() > 200
display(x)

11676      True
198711     True
153662     True
98391      True
35859      True
          ...  
116180    False
116166    False
116154    False
116137    False
276723    False
Name: user_id, Length: 105283, dtype: bool

In [10]:
y = x[x].index
print(y.shape)

(899,)


In [11]:
y

Int64Index([ 11676, 198711, 153662,  98391,  35859, 212898, 278418,  76352,
            110973, 235105,
            ...
            260183,  73681,  44296, 155916,   9856, 274808,  28634,  59727,
            268622, 188951],
           dtype='int64', length=899)

In [12]:
rating_with_books = ratings.merge(books, on='ISBN')
rating_with_books.head()

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books


In [None]:
#Extract books that have received more than 50 ratings.

In [13]:
number_rating = rating_with_books.groupby('title')['rating'].count().reset_index()
number_rating.rename(columns= {'rating':'number_of_ratings'}, inplace=True)
number_rating

Unnamed: 0,title,number_of_ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241066,Ã?Â?lpiraten.,2
241067,Ã?Â?rger mit Produkt X. Roman.,4
241068,Ã?Â?sterlich leben.,1
241069,Ã?Â?stlich der Berge.,3


In [14]:
final_rating = rating_with_books.merge(number_rating, on='title')
final_rating.shape

(1031136, 8)

In [15]:
final_rating = final_rating[final_rating['number_of_ratings'] >= 50]
final_rating.drop_duplicates(['user_id','title'], inplace=True)

In [16]:
book_pivot = final_rating.pivot_table(columns='user_id', index='title', values="rating")
book_pivot.fillna(0, inplace=True)

# Modeling 
# KNN algorithm (K-nearest neighbors)

In [17]:
from scipy.sparse import csr_matrix
book_sparse = csr_matrix(book_pivot)

In [18]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm='brute')
model.fit(book_sparse)

NearestNeighbors(algorithm='brute')

In [21]:
distances, suggestions = model.kneighbors(book_pivot.iloc[297, :].values.reshape(1, -1))

In [22]:
for i in range(len(suggestions)):
  print(book_pivot.index[suggestions[i]])

Index(['Book Club', 'Ground Zero and Beyond', 'The Blooding',
       'A Secret Affair', 'Reasonable Doubt'],
      dtype='object', name='title')


create a function to .. paas a book name and recommendations of similar book

In [23]:
# Import libraries
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [24]:
# Create a sparse matrix from the book pivot table
book_sparse = csr_matrix(book_pivot)

In [25]:
# Create a nearest neighbors model with brute force algorithm
model = NearestNeighbors(algorithm='brute')
model.fit(book_sparse)

NearestNeighbors(algorithm='brute')

In [30]:
def recommend_book(book_name):
  # Find the index of the book in the pivot table
  book_index = book_pivot.index.get_loc(book_name)
  # Get the distances and suggestions from the model
  distances, suggestions = model.kneighbors(book_pivot.iloc[book_index, :].values.reshape(1, -1))
  # Create an empty list to store the recommendations
  recommendations = []
  # Loop through the suggestions and append the book names to the list
  for i in range(len(suggestions)):
    recommendations.append(book_pivot.index[suggestions[i]])
  # Return the list of recommendations
  return recommendations

In [35]:
# Test the function with an example book name
book_name = "Book Club"
recommendations = recommend_book(book_name)
print(f"Books similar to {book_name} are:")
for book in recommendations:
  print(book)

Books similar to Book Club are:
Index(['Book Club', 'Ground Zero and Beyond', 'The Blooding',
       'A Secret Affair', 'Reasonable Doubt'],
      dtype='object', name='title')
