In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity



In [2]:
users_df = pd.read_csv('Users.csv')
books_df = pd.read_csv('Books.csv')
ratings_df = pd.read_csv('Ratings.csv')

  books_df = pd.read_csv('Books.csv')


In [3]:
users_df.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [4]:
users_df.isna().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [5]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271359 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [6]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [7]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [8]:
users_df['Location']

0                         nyc, new york, usa
1                  stockton, california, usa
2            moscow, yukon territory, russia
3                  porto, v.n.gaia, portugal
4         farnborough, hants, united kingdom
                         ...                
278853                 portland, oregon, usa
278854    tacoma, washington, united kingdom
278855             brampton, ontario, canada
278856             knoxville, tennessee, usa
278857                  dublin, n/a, ireland
Name: Location, Length: 278858, dtype: object

In [9]:
#Splitting the location into city, state and country 
users_df[['City', 'State', 'Country']] = users_df['Location'].str.split(',', expand = True)[[0,1,2]]

In [10]:
users_df.head(2)

Unnamed: 0,User-ID,Location,Age,City,State,Country
0,1,"nyc, new york, usa",,nyc,new york,usa
1,2,"stockton, california, usa",18.0,stockton,california,usa


In [11]:
#Dropping the duplicates in the books df on book titles 
books_df = books_df.drop_duplicates(subset = "Book-Title", keep="first")

In [12]:
#Merging ratings and their respective books
ratings_books_df = ratings_df.merge(books_df, on='ISBN')

In [13]:
ratings_books_df.head(2)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...


In [14]:
ratings_books_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 883079 entries, 0 to 883078
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   User-ID              883079 non-null  int64 
 1   ISBN                 883079 non-null  object
 2   Book-Rating          883079 non-null  int64 
 3   Book-Title           883079 non-null  object
 4   Book-Author          883078 non-null  object
 5   Year-Of-Publication  883079 non-null  object
 6   Publisher            883078 non-null  object
 7   Image-URL-S          883079 non-null  object
 8   Image-URL-M          883079 non-null  object
 9   Image-URL-L          883075 non-null  object
dtypes: int64(2), object(8)
memory usage: 74.1+ MB


In [15]:
#Drop unnecessary columns
ratings_books_df.drop(['ISBN', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis = 1, inplace = True)

In [16]:
ratings_books_df.shape

(883079, 6)

In [17]:
ratings_books_df.head(2)

Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,276725,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,2313,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books


In [18]:
#Merge user details with the ratings and book details
users_ratings_books_df = ratings_books_df.merge(users_df, on='User-ID')

In [19]:
users_ratings_books_df.head(2)

Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Location,Age,City,State,Country
0,276725,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"tyler, texas, usa",,tyler,texas,usa
1,2313,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"cincinnati, ohio, usa",23.0,cincinnati,ohio,usa


In [20]:
#Dropping unnecessary columns
users_ratings_books_df.drop(['Location', 'Age', 'City','State', 'Country'], axis = 1, inplace = True)

In [21]:
users_ratings_books_df.head(2)

Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,276725,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,2313,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books


In [22]:
users_ratings_books_df.shape

(883079, 6)

In [23]:
users_ratings_books_df.isna().sum()

User-ID                0
Book-Rating            0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              1
dtype: int64

In [24]:
#Dropping na rows 
users_ratings_books_df.dropna(inplace = True)

In [25]:
users_ratings_books_df.shape

(883077, 6)

In [26]:
book_num_ratings = users_ratings_books_df.groupby('Book-Title')['Book-Rating'].count().reset_index().rename(columns = {'Book-Rating':'Num-Ratings' })
book_avg_ratings = users_ratings_books_df.groupby('Book-Title')['Book-Rating'].mean().reset_index().rename(columns = {'Book-Rating':'Avg-Ratings' })
final_user_ratings = book_num_ratings.merge(book_avg_ratings , on = 'Book-Title')

In [27]:
final_user_ratings.head(2)

Unnamed: 0,Book-Title,Num-Ratings,Avg-Ratings
0,A Light in the Storm: The Civil War Diary of ...,4,2.25
1,Always Have Popsicles,1,0.0


# Popularity based filtering

In [28]:
#Filtered out top 50 books with more than 250 ratings
popular_books = final_user_ratings[final_user_ratings['Num-Ratings'] > 250].sort_values(by = 'Avg-Ratings'  , ascending= False).reset_index(drop = True).head(50)

In [29]:
popular_books.head(2)

Unnamed: 0,Book-Title,Num-Ratings,Avg-Ratings
0,Harry Potter and the Order of the Phoenix (Boo...,334,5.571856
1,The Hobbit : The Enchanting Prelude to The Lor...,281,5.007117


# Collaborative Filtering Method

In [30]:
#Users who have rated more than 200 books
x = users_ratings_books_df.groupby('User-ID').count()['Book-Rating'] > 200
well_read_users  = x[x].index

book_ratings = users_ratings_books_df[users_ratings_books_df['User-ID'].isin(well_read_users)]

#books with atleast 50 ratings 
y  = users_ratings_books_df.groupby('Book-Title')['Book-Rating'].count() >= 50
popular_books = y[y].index

final = users_ratings_books_df[users_ratings_books_df['Book-Title'].isin(popular_books)]

In [31]:
final.head(2)

Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,276725,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,2313,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books


In [32]:
final.shape

(194250, 6)

In [33]:
final_df = final.pivot_table(index="Book-Title",columns = "User-ID", values="Book-Rating").fillna(0)

In [34]:
final_df

User-ID,9,14,16,17,26,32,39,42,44,51,...,278807,278813,278819,278828,278832,278836,278843,278846,278851,278854
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
204 Rosewood Lane,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"\O\"" Is for Outlaw""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
similarity_scores = cosine_similarity(final_df)

def recommend(book_name):
    # Find the index of the given book in the DataFrame
    book_index = np.where(final_df.index == book_name)[0][0]
    
    # Get similarity scores for the given book and sort them in descending order
    # Exclude the book itself by slicing [1:6] for top 5 similar books
    similar_books = sorted(
        enumerate(similarity_scores[book_index]), 
        key=lambda x: x[1], 
        reverse=True
    )[1:6]
    
    # Print the names of the similar books
    for book in similar_books:
        print(final_df.index[book[0]])

In [36]:
recommend("stardust")

American Gods
The Mistress of Spices
The Reptile Room (A Series of Unfortunate Events, Book 2)
American Psycho (Vintage Contemporaries)
Neverwhere


In [37]:
recommend("204 Rosewood Lane")

Girls Night
16 Lighthouse Road
Thursday'S At Eight
Cold Blooded
Dark Water (Mira Romantic Suspense)


# Collaborative filtering using KNN Machine Learning model

K-Nearest Neighbors (KNN) Model for Book Recommendations:
The K-Nearest Neighbors (KNN) model is used for both classification and regression tasks. For this task, I have used KNN to recommend books that are similar to a given book based on the proximity of features in a multi-dimensional space.

KNN works by calculating the "distance" between data points. In the context of books, the distance represents how similar two books are based on their features, such as genre, author, ratings, etc.
The model identifies the k nearest neighbors (similar books) to the given book. The k value specifies how many neighbors to consider for making the recommendation. In this case, n_neighbors=6 means that it considers 6 books, including the input book itself.

Recommendation Process:

When a book is selected, the model calculates the distance between the selected book and all other books in the dataset.
It then returns the k nearest neighbors (the most similar books) based on these calculated distances.

In [38]:
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from scipy.sparse import csr_matrix

final_dataset = final_df.copy()
final_dataset_sparse = csr_matrix(final_dataset)
knn_model = NearestNeighbors(metric='cosine',algorithm='brute')
knn_model.fit(final_dataset_sparse)

NearestNeighbors(algorithm='brute', metric='cosine')

In [39]:
def recommend_knn(book_name):
    # Get the distances and suggestions for the given book from the KNN model
    dist, sugg = knn_model.kneighbors(final_dataset[final_dataset.index == book_name], n_neighbors=6)
    
    # Print the book recommendation heading
    print(f'Book Recommendations for "{book_name}":')
    
    # Loop through the suggestions (skipping the first one, as it’s the book itself)
    for i in range(1, len(sugg[0])):
        recommended_book = final_dataset.index[sugg[0][i]]
        print(f'{i}. {recommended_book}')

In [40]:
recommend_knn('Message in a Bottle')

Book Recommendations for "Message in a Bottle":
1. Nights in Rodanthe
2. Cause Celeb
3. The Playboy
4. The Loop: A Novel
5. The Guardian
