# Book Recommendation System

In [169]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity


**Dataset Loding**

In [170]:

books= pd.read_csv("/kaggle/input/bookcrossing-dataset/Book reviews/Book reviews/BX_Books.csv", sep=";", on_bad_lines='skip',encoding='latin-1')

ratings= pd.read_csv("/kaggle/input/bookcrossing-dataset/Book reviews/Book reviews/BX-Book-Ratings.csv", sep=";", on_bad_lines='skip',encoding='latin-1')

users= pd.read_csv("/kaggle/input/bookcrossing-dataset/Book reviews/Book reviews/BX-Users.csv", sep=";", on_bad_lines='skip',encoding='latin-1')


**Book Data**

In [171]:
books.head(2)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...


In [172]:
#dropping unnessery cols
books=books.drop(['Publisher','Year-Of-Publication','Image-URL-S','Image-URL-M','Image-URL-L'], axis=1)

books.head(3)

Unnamed: 0,ISBN,Book-Title,Book-Author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este


**Rating Data**


In [173]:
ratings.head(3)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


In [174]:
users.head(3)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",


In [175]:
print(books.shape)
print(ratings.shape)
print(users.shape)


(271379, 3)
(1149780, 3)
(278858, 3)


In [176]:
"""Checking for Null values"""

print("NULL values in Books : \n",books.isnull().sum())
print("\nNULL values in Users : \n",users.isnull().sum())
print("\nNULL values in Ratings : \n",ratings.isnull().sum())


NULL values in Books : 
 ISBN           0
Book-Title     0
Book-Author    2
dtype: int64

NULL values in Users : 
 User-ID          0
Location         0
Age         110762
dtype: int64

NULL values in Ratings : 
 User-ID        0
ISBN           0
Book-Rating    0
dtype: int64


In [177]:
"""Checking for Duplicate values"""

print("Number of duplicate values in Books: ",books.duplicated().sum())
print("\nNumber of duplicate values in Rating: ",ratings.duplicated().sum())
print("\nNumber of duplicate values in Users: ",users.duplicated().sum())

Number of duplicate values in Books:  0

Number of duplicate values in Rating:  0

Number of duplicate values in Users:  0


In [178]:
#merged ratings with books
book_rate = ratings.merge(books,on='ISBN')
book_rate.head(10)


Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose
1,276726,0155061224,5,Rites of Passage,Judith Rae
2,276727,0446520802,0,The Notebook,Nicholas Sparks
3,276729,052165615X,3,Help!: Level 1,Philip Prowse
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather
5,276733,2080674722,0,Les Particules Elementaires,Michel Houellebecq
6,276744,038550120X,7,A Painted House,JOHN GRISHAM
7,276746,0425115801,0,Lightning,Dean R. Koontz
8,276746,0449006522,0,Manhattan Hunt Club,JOHN SAUL
9,276746,0553561618,0,Dark Paradise,TAMI HOAG


In [179]:
book_rate.shape

(1031175, 5)

In [180]:

# Filtering rows where Book-Rating is not equal to 0
book_rate = book_rate[book_rate['Book-Rating'] != 0]

book_rate.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author
1,276726,0155061224,5,Rites of Passage,Judith Rae
3,276729,052165615X,3,Help!: Level 1,Philip Prowse
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather
6,276744,038550120X,7,A Painted House,JOHN GRISHAM
13,276747,0060517794,9,Little Altars Everywhere,Rebecca Wells


In [181]:
book_rate.shape

(383852, 5)

# KNN Colaborative

## we are gonna perform collaborative filtering using k-NN to identify user clusters using common book ratings and employ the top k-nearest neighbor average rating to predict outcomes.

In [182]:
book_rate.head(5) 

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author
1,276726,0155061224,5,Rites of Passage,Judith Rae
3,276729,052165615X,3,Help!: Level 1,Philip Prowse
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather
6,276744,038550120X,7,A Painted House,JOHN GRISHAM
13,276747,0060517794,9,Little Altars Everywhere,Rebecca Wells


In [182]:
book_rate.head(5) 

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author
1,276726,0155061224,5,Rites of Passage,Judith Rae
3,276729,052165615X,3,Help!: Level 1,Philip Prowse
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather
6,276744,038550120X,7,A Painted House,JOHN GRISHAM
13,276747,0060517794,9,Little Altars Everywhere,Rebecca Wells


## filtering

In [183]:
# Filter on the basis of users who have rated more than 200 books

x = book_rate.groupby('User-ID').count()['Book-Rating'] > 200
active_raters = x[x].index
active_users = book_rate[book_rate['User-ID'].isin(active_raters)]

In [184]:
# checkout the reselt
print("Number of active users:", len(active_users))


Number of active users: 58176


In [185]:
# Filter on the basis of books that have received at least 15 ratings

y = active_users.groupby('Book-Title').count()['Book-Rating'] >= 15
famous_books = y[y].index
final_ratings = active_users[active_users['Book-Title'].isin(famous_books)]


In [186]:
# result
print("Number of famous books:", len(final_ratings))


Number of famous books: 756


In [187]:
#creating user book matrix

pt = final_ratings.pivot_table(index='User-ID', columns='Book-Title', values='Book-Rating')

In [188]:
pt.fillna(0,inplace=True)
pt.head(5)


Book-Title,1st to Die: A Novel,2nd Chance,A Time to Kill,Black House,Bridget Jones's Diary,Dolores Claiborne,Dreamcatcher,Harry Potter and the Chamber of Secrets (Book 2),Harry Potter and the Goblet of Fire (Book 4),Harry Potter and the Order of the Phoenix (Book 5),...,The Notebook,The Partner,The Pelican Brief,The Red Tent (Bestselling Backlist),The Secret Garden,The Secret Life of Bees,The Testament,The Tommyknockers,To Kill a Mockingbird,Watership Down
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,9.0,...,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0
6575,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,8.0,0.0,8.0,0.0,0.0,0.0,4.0
7346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0
11676,9.0,7.5,0.0,3.0,6.333333,8.0,8.666667,9.333333,9.0,0.0,...,8.666667,10.0,9.0,10.0,9.666667,9.0,7.0,7.5,9.0,5.0
13552,0.0,0.0,9.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,9.0,0.0,0.0,0.0,8.0,10.0,0.0,0.0


In [189]:
# Calculate cosine similarity between users
user_similarity = cosine_similarity(pt)

In [190]:
# Create a Nearest Neighbors model using cosine similarity
knn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10)
knn_model.fit(user_similarity)

In [191]:
# Function to get k-nearest neighbors for a given user
def get_knn(user_id, k=10):
    try:
        user_index = pt.index.get_loc(user_id)
    except KeyError:
        print(f"User ID {user_id} not found.")
        return None
    distances, indices = knn_model.kneighbors([user_similarity[user_index]])
    top_indices = indices.flatten()
    top_distances = distances.flatten()
    exclude_index = top_indices == user_index
    top_indices = top_indices[~exclude_index]
    top_distances = top_distances[~exclude_index]

    # Calculate average rating of the top k-nearest neighbors
    top_ratings = pt.iloc[top_indices].mean(axis=0)
    top_ratings = top_ratings.replace(0, float('nan'))
    top_ratings = top_ratings.dropna()
    top_ratings = top_ratings.sort_values(ascending=False)
    top_k_ratings = top_ratings.head(k)

    user_cluster = pt.loc[user_id].name
    cluster_members = pt[pt.index == user_cluster].index

    return top_k_ratings, user_cluster, cluster_members



In [192]:
# input
top_k_ratings, user_cluster, cluster_members = get_knn(11676, k=10)
print(f"User Cluster: {user_cluster}")
print(f"User IDs in the same cluster: {cluster_members}")
print(f"Top 10 nearest neighbor average ratings:\n{top_k_ratings}")


User Cluster: 11676
User IDs in the same cluster: Index([11676], dtype='int64', name='User-ID')
Top 10 nearest neighbor average ratings:
Book-Title
1st to Die: A Novel                                                 5.222222
Harry Potter and the Goblet of Fire (Book 4)                        4.333333
Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))    4.333333
Lightning                                                           4.166667
It                                                                  4.000000
The Pelican Brief                                                   3.888889
A Time to Kill                                                      3.777778
Harry Potter and the Prisoner of Azkaban (Book 3)                   3.777778
Seven Up (A Stephanie Plum Novel)                                   3.666667
The Lovely Bones: A Novel                                           3.555556
dtype: float64


## We will a user id as input and show the top-10 recommended books for the user.

In [193]:
def get_top_recommendations(user_id, k=10):
    try:
        user_index = pt.index.get_loc(user_id)
    except KeyError:
        print(f"User ID {user_id} not found.")
        return None

    distances, indices = knn_model.kneighbors([user_similarity[user_index]])
    top_indices = indices.flatten()
    top_distances = distances.flatten()

    exclude_index = top_indices == user_index
    top_indices = top_indices[~exclude_index]
    top_distances = top_distances[~exclude_index]

    top_ratings = pt.iloc[top_indices].mean(axis=0)
    top_ratings = top_ratings.replace(0, float('nan'))
    top_ratings = top_ratings.dropna()


    top_ratings = top_ratings.sort_values(ascending=False)


    top_k_ratings = top_ratings.head(k)

    return top_k_ratings

In [194]:
#  Input:
user_id = 11676
top_recommendations = get_top_recommendations(user_id)
print(f"Top 10 recommended books for User ID {user_id}:\n{top_recommendations}")

Top 10 recommended books for User ID 11676:
Book-Title
1st to Die: A Novel                                                 5.222222
Harry Potter and the Goblet of Fire (Book 4)                        4.333333
Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))    4.333333
Lightning                                                           4.166667
It                                                                  4.000000
The Pelican Brief                                                   3.888889
A Time to Kill                                                      3.777778
Harry Potter and the Prisoner of Azkaban (Book 3)                   3.777778
Seven Up (A Stephanie Plum Novel)                                   3.666667
The Lovely Bones: A Novel                                           3.555556
dtype: float64


## We will take user id and ISBN as input and show the probable rating of that book by that user.


In [195]:
# Creating another pivot
user_book_matrix = final_ratings.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating')

In [196]:
user_book_matrix.head(5)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


ISBN,0060935464,0061083259,006440188X,0099245027,0140039589,014028009X,0140867155,0141000198,0142001740,030703979X,...,0848807804,0850919649,0879236493,0899668585,093798650X,0937986917,1565116674,1569874115,1852865024,1896095011
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6251,,,,,,,,,,,...,,,,,,,,,,
6575,,,,,,4.0,,4.0,8.0,,...,,,,,,,,,,
7346,,,,,,,,,,,...,,,,,,,,,,
11676,10.0,,10.0,4.0,,8.0,,,,,...,,,9.0,8.0,,,,,,
13552,,,,,,,,9.0,,,...,,,,,,,,,,


In [197]:
def get_probable_rating(user_id, isbn):
    try:
        user_index = user_book_matrix.index.get_loc(user_id)
    except KeyError:
        print(f"User ID {user_id} not found.")
        return None

    try:
        book_index = user_book_matrix.columns.get_loc(isbn)
    except KeyError:
        print(f"ISBN {isbn} not found.")
        return None

    distances, indices = knn_model.kneighbors([user_similarity[user_index]])
    top_indices = indices.flatten()
    top_distances = distances.flatten()

    exclude_index = top_indices == user_index
    top_indices = top_indices[~exclude_index]
    top_distances = top_distances[~exclude_index]

    probable_rating = user_book_matrix.iloc[top_indices, book_index].mean()

    return probable_rating




In [198]:
# Input
user_id = 11676
isbn = '014028009X'
probable_rating = get_probable_rating(user_id, isbn)

if probable_rating is not None:
    print(f"Probable rating for User ID {user_id} on book with ISBN {isbn}: {probable_rating}")


Probable rating for User ID 11676 on book with ISBN 014028009X: 7.666666666666667


# The End