In [22]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import pickle

In [2]:
# Directory holding the three CSV exports; edit this single constant to relocate the data.
DATA_DIR = "C:/Users/Dell/Downloads/book-reviews"

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the warning printed under this cell looks like pandas'
# mixed-type DtypeWarning for Books.csv, which this option addresses - confirm on re-run.
books = pd.read_csv(f"{DATA_DIR}/Books.csv", low_memory=False)
ratings = pd.read_csv(f"{DATA_DIR}/Ratings.csv")
users = pd.read_csv(f"{DATA_DIR}/Users.csv")

  books = pd.read_csv("C:/Users/Dell/Downloads/book-reviews/Books.csv")


## Understanding the dataset

In [3]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [4]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [5]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [6]:
# Print the (rows, columns) shape of each raw table: books, ratings, users.
for raw_table in (books, ratings, users):
    print(raw_table.shape)

(271360, 8)
(1149780, 3)
(278858, 3)


In [7]:
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [8]:
books.duplicated().sum()

0

## Reducing the pool of ratings to choose from to get more accurate results

In [9]:
# Attach book metadata to every rating via ISBN (default inner join, so ratings
# whose ISBN is absent from `books` are dropped), then discard the metadata
# columns listed below.
columns_to_drop = ['Year-Of-Publication', 'Publisher', 'Image-URL-S', 'Image-URL-L']
book_with_ratings = (
    ratings
    .merge(books, on='ISBN')
    .drop(columns=columns_to_drop)
)
book_with_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Image-URL-M
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,http://images.amazon.com/images/P/034545104X.0...
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,http://images.amazon.com/images/P/034545104X.0...
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,http://images.amazon.com/images/P/034545104X.0...
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,http://images.amazon.com/images/P/034545104X.0...
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,http://images.amazon.com/images/P/034545104X.0...


In [10]:
# Select the prolific reviewers: users with more than 100 rating rows in the merged table.
frequent_reviewers_bool = book_with_ratings.groupby('User-ID')['Book-Rating'].count() > 100
# Keep the User-IDs whose flag is True.
well_read_users = frequent_reviewers_bool.index[frequent_reviewers_bool.to_numpy()]
print(well_read_users.shape)

(1648,)


In [11]:
# Restrict the merged ratings to rows written by the well-read users.
rated_by_well_read_user = book_with_ratings['User-ID'].isin(well_read_users)
well_reviewed_books = book_with_ratings[rated_by_well_read_user]

In [12]:
# Keep only titles with at least MIN_RATINGS_PER_BOOK rating rows from the
# well-read users. The count is per rating row, not per distinct user.
MIN_RATINGS_PER_BOOK = 70
book_review_counts = well_reviewed_books.groupby('Book-Title').count()['Book-Rating'] >= MIN_RATINGS_PER_BOOK
well_rated_books = well_reviewed_books[well_reviewed_books['Book-Title'].isin(book_review_counts[book_review_counts].index)]
well_rated_books.shape

(70124, 6)

## Book recommendation function

In [13]:
# One row per title, one column per user. pivot_table averages the ratings when
# a (title, user) pair occurs more than once; pairs with no rating become 0.
books_pt = well_rated_books.pivot_table(
    index='Book-Title',
    columns='User-ID',
    values='Book-Rating',
).fillna(0)
books_pt

User-ID,254,507,882,1424,1435,1733,1903,2033,2110,2276,...,275020,275970,276463,276680,277427,277478,277639,278137,278188,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Wizard and Glass (The Dark Tower, Book 4)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Wuthering Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Convert the title-by-user rating table to SciPy's compressed sparse row format
# before handing it to the nearest-neighbour model.
books_matrix = csr_matrix(books_pt.values)

In [15]:
# Cosine distance compares books by the direction of their rating vectors.
# Six neighbours are requested because one of the hits is expected to be the
# query book itself, which get_recs leaves out, so five recommendations remain.
knn = NearestNeighbors(n_neighbors=6, metric='cosine')
knn.fit(books_matrix)

In [16]:
def get_recs(book=""):
    """Return metadata for the books whose rating patterns are closest to `book`.

    Looks `book` up in the notebook-level `books_pt` table, asks the fitted
    `knn` model for its nearest neighbours, and resolves each neighbouring
    title to its author and medium cover URL through the notebook-level
    `books` frame.

    Parameters
    ----------
    book : str
        Title exactly as it appears in `books_pt.index`.

    Returns
    -------
    list[list]
        One `[title, author, image_url]` list per recommendation, nearest
        first. An inner list is empty when the title has no row in `books`.
        Returns `[]` (after printing an error message) when `book` is not in
        `books_pt`.
    """
    if book not in books_pt.index:
        print(f"Error: '{book}' not found in dataset.")
        return []

    # kneighbors expects a 2-D array: a single row holding this book's ratings.
    query = books_pt.loc[book].to_numpy().reshape(1, -1)
    _, neighbor_idx = knn.kneighbors(query)
    neighbor_titles = books_pt.index[neighbor_idx[0]]

    # The query book is normally its own nearest neighbour, but with tied
    # distances (e.g. identical rating rows) it need not come first, so it is
    # removed by title rather than by position. Capping at len - 1 keeps the
    # number of recommendations the same in either case.
    n_recs = len(neighbor_titles) - 1
    rec_titles = [title for title in neighbor_titles if title != book][:n_recs]

    # Resolve each title against the first matching row in `books`.
    wanted = ['Book-Title', 'Book-Author', 'Image-URL-M']
    data = []
    for title in rec_titles:
        matches = books.loc[books['Book-Title'] == title, wanted]
        data.append(matches.iloc[0].tolist() if len(matches) else [])

    return data

In [17]:
print(get_recs('1984'))

[['Animal Farm', 'George Orwell', 'http://images.amazon.com/images/P/0451526341.01.MZZZZZZZ.jpg'], ['Brave New World', 'Aldous Huxley', 'http://images.amazon.com/images/P/0060809833.01.MZZZZZZZ.jpg'], ["The Handmaid's Tale", 'Margaret Atwood', 'http://images.amazon.com/images/P/0449212602.01.MZZZZZZZ.jpg'], ['The Catcher in the Rye', 'J.D. Salinger', 'http://images.amazon.com/images/P/0316769487.01.MZZZZZZZ.jpg'], ['Lord of the Flies', 'William Gerald Golding', 'http://images.amazon.com/images/P/0399501487.01.MZZZZZZZ.jpg']]


## Finding the top 20 books (for displaying)

In [18]:
# Mean rating per title across the filtered ratings.
# NOTE(review): ratings of 0 are included in the mean; if 0 encodes "no explicit
# rating" in this dataset the averages are pulled down - confirm.
temp_ratings = (
    well_rated_books
    .groupby('Book-Title', as_index=False)
    .agg({'Book-Rating': 'mean'})
    .rename(columns={'Book-Rating': 'avg-rating'})
)

# Swap each row's individual rating for its title's average, order by that
# average (highest first), and keep the first row that appears for each title.
avg_of_ratings = (
    well_rated_books
    .drop(columns=['Book-Rating'])
    .merge(temp_ratings, on='Book-Title')
    .sort_values('avg-rating', ascending=False)
    .drop_duplicates(subset="Book-Title", keep="first")
)
avg_of_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Title,Book-Author,Image-URL-M,avg-rating
20016,11676,0590353403,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,http://images.amazon.com/images/P/0590353403.0...,5.451613
19933,185176,0439136350,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,http://images.amazon.com/images/P/0439136350.0...,5.03125
43664,25409,0439139597,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,http://images.amazon.com/images/P/0439139597.0...,4.971098
1432,85993,043935806X,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,http://images.amazon.com/images/P/043935806X.0...,4.644737
4826,39616,0439064864,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,http://images.amazon.com/images/P/0439064864.0...,4.526104


In [19]:
# avg_of_ratings is already sorted by average rating, so the first 20 rows are the top 20 titles.
top_20_books = avg_of_ratings.iloc[:20]
top_20_books.head()

Unnamed: 0,User-ID,ISBN,Book-Title,Book-Author,Image-URL-M,avg-rating
20016,11676,0590353403,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,http://images.amazon.com/images/P/0590353403.0...,5.451613
19933,185176,0439136350,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,http://images.amazon.com/images/P/0439136350.0...,5.03125
43664,25409,0439139597,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,http://images.amazon.com/images/P/0439139597.0...,4.971098
1432,85993,043935806X,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,http://images.amazon.com/images/P/043935806X.0...,4.644737
4826,39616,0439064864,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,http://images.amazon.com/images/P/0439064864.0...,4.526104


In [20]:
# Persist the top-20 table. The context manager closes the file handle even if
# pickling raises, so the file is flushed and not left open.
with open('top-20-books.pkl', 'wb') as top_books_file:
    pickle.dump(top_20_books, top_books_file)

In [21]:
# Persist the rating table, the book metadata and the fitted model. Each file
# is opened in a context manager so its handle is closed (and flushed) as soon
# as the dump finishes, even if pickling raises.
for artifact, pickle_path in (
    (books_pt, 'books_pt.pkl'),
    (books, 'books.pkl'),
    (knn, 'knn.pkl'),
):
    with open(pickle_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)