In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import pickle
from scipy.sparse.linalg import svds

In [2]:
# Load the books dataset
books = pd.read_csv("BX-Books.csv", sep=';', on_bad_lines='skip', encoding='latin-1')
books.head()  # Display the first few rows of the books dataframe

  books = pd.read_csv("BX-Books.csv", sep=';', on_bad_lines='skip', encoding='latin-1')


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [3]:
# Show the columns of the books dataframe
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [4]:
# Select relevant columns
books = books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']]
books.head(3)  # Display the first 3 rows of the updated books dataframe

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial


In [5]:
# Rename columns for better readability
books.rename(columns={'Book-Title': 'title', 'Book-Author': 'author', 'Year-Of-Publication': 'year', 'Publisher': 'publisher'}, inplace=True)
books.head()  # Display the first few rows of the renamed books dataframe

Unnamed: 0,ISBN,title,author,year,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [6]:
# Load the users dataset
users = pd.read_csv("BX-Users.csv", sep=';', on_bad_lines='skip', encoding='latin-1')
users.head()  # Display the first few rows of the users dataframe

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [7]:
# Rename columns for better readability
users.rename(columns={'	User-ID': 'user_id', 'Location': 'location', 'Age': 'age'}, inplace=True)
users.head()  # Display the first few rows of the renamed users dataframe

Unnamed: 0,User-ID,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [8]:
# Load the ratings dataset
ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', on_bad_lines='skip', encoding='latin-1')
ratings.head()  # Display the first few rows of the ratings dataframe

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [9]:
# Rename columns for better readability
ratings.rename(columns={'User-ID': 'user_id', 'Book-Rating': 'rating'}, inplace=True)
ratings.head()  # Display the first few rows of the renamed ratings dataframe

Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [10]:
# Display the shape of the dataframes
books.shape

(271360, 5)

In [11]:
users.shape

(278858, 3)

In [12]:
ratings.shape

(1149780, 3)

In [13]:
# Display the number of unique users
ratings['user_id'].value_counts().shape

(105283,)

In [14]:
# Filter users with more than 200 ratings
x = ratings['user_id'].value_counts() > 200
x.shape

(105283,)

In [15]:
# Display the number of users with more than 200 ratings
x[x].shape

(899,)

In [16]:
# Get the user IDs of these users
y = x[x].index
y.shape

(899,)

In [17]:
# Filter the ratings dataframe to include only these users
ratings = ratings[ratings['user_id'].isin(y)]
ratings.shape

(526356, 3)

In [18]:
ratings.head()  # Display the first few rows of the filtered ratings dataframe

Unnamed: 0,user_id,ISBN,rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0


In [19]:
# Merge ratings with books data
ratings_with_books = ratings.merge(books, on='ISBN')
ratings_with_books.head()  # Display the first few rows of the merged dataframe

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc


In [20]:
ratings_with_books.shape

(487671, 7)

In [21]:
# Group by title and count the number of ratings
ratings_with_books.groupby('title')['rating'].count()

title
 A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)    2
 Always Have Popsicles                                                                                        1
 Apple Magic (The Collector's series)                                                                         1
 Beyond IBM: Leadership Marketing and Finance for the 1990s                                                   1
 Clifford Visita El Hospital (Clifford El Gran Perro Colorado)                                                1
                                                                                                             ..
Ã?Â?ber die Pflicht zum Ungehorsam gegen den Staat.                                                           3
Ã?Â?lpiraten.                                                                                                 1
Ã?Â?rger mit Produkt X. Roman.                                                                    

In [22]:
# Create a new dataframe with the number of ratings per book
number_rating = ratings_with_books.groupby('title')['rating'].count().reset_index()
number_rating.head()  # Display the first few rows of the number_rating dataframe

Unnamed: 0,title,rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1


In [23]:
# Rename the column for better readability
number_rating.rename(columns={'rating': 'number of ratings'}, inplace=True)
number_rating

Unnamed: 0,title,number of ratings
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1
...,...,...
160264,Ã?Â?ber die Pflicht zum Ungehorsam gegen den S...,3
160265,Ã?Â?lpiraten.,1
160266,Ã?Â?rger mit Produkt X. Roman.,1
160267,Ã?Â?stlich der Berge.,1


In [24]:
# Merge the number of ratings with the ratings_with_books dataframe
final_rating = ratings_with_books.merge(number_rating, on='title')
final_rating

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,number of ratings
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,82
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,82
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,82
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,82
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,82
...,...,...,...,...,...,...,...,...
487666,275970,1892145022,0,Here Is New York,E. B. White,1999,Little Bookroom,1
487667,275970,1931868123,0,There's a Porcupine in My Outhouse: Misadventu...,Mike Tougias,2002,Capital Books (VA),1
487668,275970,3411086211,10,Die Biene.,Sybil GrÃ?Â¤fin SchÃ?Â¶nfeldt,1993,"Bibliographisches Institut, Mannheim",1
487669,275970,3829021860,0,The Penis Book,Joseph Cohen,1999,Konemann,1


In [25]:
final_rating.shape

(487671, 8)

In [26]:
# Filter books with at least 50 ratings
final_rating = final_rating[final_rating['number of ratings'] >= 50]
final_rating.shape

(61853, 8)

In [27]:
# Drop duplicate user-book pairs
final_rating.drop_duplicates(['user_id', 'title'], inplace=True)
final_rating

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_rating.drop_duplicates(['user_id', 'title'], inplace=True)


Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,number of ratings
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,82
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,82
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,82
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,82
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,82
...,...,...,...,...,...,...,...,...
236701,255489,0553579983,7,And Then You Die,Iris Johansen,1998,Bantam,50
236702,256407,0553579983,0,And Then You Die,Iris Johansen,1998,Bantam,50
236703,257204,0553579983,0,And Then You Die,Iris Johansen,1998,Bantam,50
236704,261829,0553579983,0,And Then You Die,Iris Johansen,1998,Bantam,50


In [28]:
# Create a pivot table with users as columns and titles as rows
book_pivot = final_rating.pivot_table(columns='user_id', index='title', values='rating')
book_pivot

user_id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,,,,,,0.0,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2nd Chance,,10.0,,,,,,,,,...,,,,0.0,,,,,0.0,
4 Blondes,,,,,,,,,,0.0,...,,,,,,,,,,
84 Charing Cross Road,,,,,,,,,,,...,,,,,,10.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,,,,7.0,,,,,7.0,,...,,,,,,0.0,,,,
You Belong To Me,,,,,,,,,,,...,,,,,,,,,,
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,,,,0.0,,,,,0.0,...,,,,,,0.0,,,,
Zoya,,,,,,,,,,,...,,,,,,,,,,


In [29]:
book_pivot.shape

(742, 888)

In [30]:
# Fill NaN values with 0
book_pivot = book_pivot.fillna(0)
book_pivot.head(10)  # Display the first 10 rows of the pivot table

user_id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84 Charing Cross Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Case of Need,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0
"A Child Called \It\"": One Child's Courage to Survive""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Civil Action,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Cry In The Night,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Convert pivot table to a sparse matrix
book_sparse = csr_matrix(book_pivot)
type(book_sparse)  # Display the type of the sparse matrix

scipy.sparse._csr.csr_matrix

In [32]:
# Import necessary function for RMSE calculation
from sklearn.metrics import mean_squared_error

In [33]:
# Perform matrix factorization using Singular Value Decomposition (SVD)
U, sigma, Vt = svds(book_sparse, k=10)

In [34]:
# Reconstruct the matrix using the factorized components
reconstructed_matrix = np.dot(np.dot(U, np.diag(sigma)), Vt)

In [35]:
# Calculate the RMSE between the original matrix and the reconstructed matrix
rmse = np.sqrt(mean_squared_error(book_sparse.toarray(), reconstructed_matrix))
print("RMSE:", rmse)

RMSE: 1.0958751944031295


In [36]:
# Train a k-NN model using the book_sparse matrix
model = NearestNeighbors(algorithm='brute')
model.fit(book_sparse)

In [37]:
def calculate_rmse(predicted_ratings, actual_ratings):
    # Flatten the predicted and actual ratings arrays
    predicted_ratings = predicted_ratings.flatten()
    actual_ratings = actual_ratings.flatten()
    
    # Calculate the squared difference between predicted and actual ratings
    squared_diff = np.square(predicted_ratings - actual_ratings)
    
    # Calculate the mean squared difference
    mean_squared_diff = np.mean(squared_diff)
    
    # Calculate the root mean squared error (RMSE)
    rmse = np.sqrt(mean_squared_diff)
    
    return rmse

In [38]:
# Example of calculating RMSE with sample data
actual_ratings = np.array([3, 4, 5, 2, 3])  # Actual ratings from the test set
predicted_ratings = np.array([2.8, 3.9, 4.7, 2.1, 3.2])  # Predicted ratings from the model
rmse = calculate_rmse(predicted_ratings, actual_ratings)
print("RMSE:", rmse)

RMSE: 0.1949358868961793


In [39]:
# Find the index of the book and get suggestions
np.where(book_pivot.index == 237)
distances, suggestions = model.kneighbors(book_pivot.iloc[237, :].values.reshape(1, -1), n_neighbors=6)
distances

array([[ 0.        , 68.78953409, 69.5413546 , 72.64296249, 76.83098333,
        77.28518616]])

In [40]:
suggestions

array([[237, 240, 238, 241, 184, 536]], dtype=int64)

In [41]:
# Print book recommendations
for i in range(len(suggestions)):
    print(book_pivot.index[suggestions[i]])

Index(['Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Sorcerer's Stone (Book 1)', 'Exclusive',
       'The Cradle Will Fall'],
      dtype='object', name='title')


In [42]:
# Find the index of another book and get suggestions
book_pivot.index[237]

'Harry Potter and the Chamber of Secrets (Book 2)'

In [43]:
book_pivot.index[240]

'Harry Potter and the Prisoner of Azkaban (Book 3)'

In [44]:
distances, suggestions = model.kneighbors(book_pivot.iloc[240, :].values.reshape(1, -1), n_neighbors=6)
for i in range(len(suggestions)):
    print(book_pivot.index[suggestions[i]])

Index(['Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Sorcerer's Stone (Book 1)',
       'Harry Potter and the Order of the Phoenix (Book 5)', 'Exclusive'],
      dtype='object', name='title')


In [45]:
# Another example with a different book
book_pivot.index[54]
distances, suggestions = model.kneighbors(book_pivot.iloc[54, :].values.reshape(1, -1), n_neighbors=6)
for i in range(len(suggestions)):
    print(book_pivot.index[suggestions[i]])

Index(['Animal Farm', 'Exclusive', 'Jacob Have I Loved', 'Second Nature',
       'Pleading Guilty', 'No Safe Place'],
      dtype='object', name='title')


In [46]:
# Example of finding a book by title
book_pivot.index == 'Animal Farm'
np.where(book_pivot.index == 'Animal Farm')

(array([54], dtype=int64),)

In [47]:
# Function to recommend books based on a given book title
def recommend_book(book_name):
    book_id = np.where(book_pivot.index == book_name)[0][0]
    distances, suggestions = model.kneighbors(book_pivot.iloc[book_id, :].values.reshape(1, -1), n_neighbors=6)
    
    for i in range(len(suggestions)):
        if i == 0:
            print(f'The suggestions are: ')
        if not i:
            print(book_pivot.index[suggestions[i]])

In [48]:
# Examples of recommending books
recommend_book('Harry Potter and the Prisoner of Azkaban (Book 3)')

The suggestions are: 
Index(['Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Sorcerer's Stone (Book 1)',
       'Harry Potter and the Order of the Phoenix (Book 5)', 'Exclusive'],
      dtype='object', name='title')


In [49]:
recommend_book('A Cry In The Night')

The suggestions are: 
Index(['A Cry In The Night', 'Exclusive', 'No Safe Place',
       'Deck the Halls (Holiday Classics)', 'The Cradle Will Fall',
       'Long After Midnight'],
      dtype='object', name='title')


In [50]:
# Save the model to a file
with open('book_recommendation_model.pkl', 'wb') as f:
    pickle.dump(model, f)
