#V0

In [None]:
import pandas as pd
import numpy as np

# Load your data
df = pd.read_csv('/content/Ratings.csv')

# Creating the user-item rating matrix
user_item_matrix = df.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating', aggfunc='first')

# Store user and item indices to map back to the DataFrame later
user_indices = user_item_matrix.index
item_indices = user_item_matrix.columns

# Replace NaN with 0 for calculation purposes
train_mat = user_item_matrix.fillna(0).values

# Calculate the overall mean rating for non-zero ratings
mu = np.mean(train_mat[train_mat != 0])

# Calculate user biases bu
user_mean_ratings = np.where(train_mat.sum(axis=1) != 0, np.mean(train_mat, axis=1), 0)
bu = user_mean_ratings - mu

# Calculate item biases bi
item_mean_ratings = np.where(train_mat.sum(axis=0) != 0, np.mean(train_mat, axis=0), 0)
bi = item_mean_ratings - mu

# Prediction matrix calculation
prediction_mat = mu + bu[:, np.newaxis] + bi[np.newaxis, :]

# Convert the prediction matrix back to a DataFrame
predicted_ratings_df = pd.DataFrame(prediction_mat, index=user_indices, columns=item_indices)

# Flatten the DataFrame to merge with the original
predicted_ratings_flat = predicted_ratings_df.stack().reset_index()
predicted_ratings_flat.columns = ['User-ID', 'ISBN', 'Predicted-Rating']

# Merge predicted ratings with the original DataFrame
df = df.merge(predicted_ratings_flat, on=['User-ID', 'ISBN'])

# Fill zero ratings in the original DataFrame with predictions
df.loc[df['Book-Rating'] == 0, 'Book-Rating'] = df['Predicted-Rating']

# Drop the Predicted-Rating column as it's no longer needed
df.drop(columns=['Predicted-Rating'], inplace=True)

# Save the updated DataFrame back to a CSV file
df.to_csv('updated_rating.csv', index=False)

print("Updated ratings have been computed and stored.")


  user_item_matrix = df.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating', aggfunc='first')


In [1]:
import pandas as pd

# Load your data
ratings_df = pd.read_csv('/content/books_with_rats_moods.csv')

# Calculate the overall mean rating for non-zero ratings
mu = ratings_df[ratings_df['Book-Rating'] != 0]['Book-Rating'].mean()

# Calculate user and item biases
user_biases = ratings_df[ratings_df['Book-Rating'] != 0].groupby('User-ID')['Book-Rating'].mean() - mu
item_biases = ratings_df[ratings_df['Book-Rating'] != 0].groupby('ISBN')['Book-Rating'].mean() - mu

# Map the biases to the original DataFrame
ratings_df['User-Bias'] = ratings_df['User-ID'].map(user_biases).fillna(0)
ratings_df['Item-Bias'] = ratings_df['ISBN'].map(item_biases).fillna(0)

# Apply the baseline estimate for zero ratings
ratings_df.loc[ratings_df['Book-Rating'] == 0, 'Book-Rating'] = mu + ratings_df['User-Bias'] + ratings_df['Item-Bias']

# Clip the ratings to ensure they fall within the typical range (e.g., 1-10)
ratings_df['Book-Rating'] = ratings_df['Book-Rating'].clip(lower=1, upper=10)

# Drop the bias columns as they are no longer needed
ratings_df.drop(['User-Bias', 'Item-Bias'], axis=1, inplace=True)

# Save the updated DataFrame back to a CSV file
ratings_df.to_csv('baseline_ratinsg.csv', index=False)


In [None]:

# Assuming ratings_df is the DataFrame with the ratings

# Filter for user_id 4802
user_4802_ratings = ratings_df[ratings_df['User-ID'] == 23902]

# Print the ratings
print(f"Ratings given by user 23902:")
for index, row in user_4802_ratings.iterrows():
    print(f"ISBN: {row['ISBN']} - Rating: {row['Book-Rating']:.2f}")


In [16]:
import scipy
import math
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

ratings_df.rename(columns = {'User-ID':'user_id' ,'ISBN':'isbn' ,'Book-Rating':'book_rating'},inplace=True)


user_ratings_threshold = 3

filter_users = ratings_df['user_id'].value_counts()
filter_users_list = filter_users[filter_users >= user_ratings_threshold].index.to_list()

df_ratings_top = ratings_df[ratings_df['user_id'].isin(filter_users_list)]

print('Filter: users with at least %d ratings\nNumber of records: %d' % (user_ratings_threshold, len(df_ratings_top)))

Filter: users with at least 3 ratings
Number of records: 30359


In [17]:
book_ratings_threshold_perc = 0.1
book_ratings_threshold = len(df_ratings_top['isbn'].unique()) * book_ratings_threshold_perc

filter_books_list = df_ratings_top['isbn'].value_counts().head(int(book_ratings_threshold)).index.to_list()
df_ratings_top = df_ratings_top[df_ratings_top['isbn'].isin(filter_books_list)]

print('Filter: top %d%% most frequently rated books\nNumber of records: %d' % (book_ratings_threshold_perc*100, len(df_ratings_top)))

Filter: top 10% most frequently rated books
Number of records: 17329


In [9]:
pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162992 sha256=8aedd5a1625003ee09a8a952ba4c2ac271d0c5c4a88f34663f178ee7b4e77a51
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [18]:
from surprise import Dataset, Reader
from surprise import SVD, NMF
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

df=df_ratings_top.copy()
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df[['user_id', 'isbn', 'book_rating']], reader)

In [19]:
model_svd = SVD()
cv_results_svd = cross_validate(model_svd, data, cv=3)
pd.DataFrame(cv_results_svd).mean()

test_rmse    1.122468
test_mae     0.771616
fit_time     0.208850
test_time    0.032751
dtype: float64

In [20]:
trainset, testset = train_test_split(data, test_size=0.2)

model = SVD(n_factors=80, n_epochs=20, lr_all=0.005, reg_all=0.2)
model.fit(trainset)
predictions = model.test(testset)

In [21]:
df_pred = pd.DataFrame(predictions, columns=['user_id', 'isbn', 'actual_rating', 'pred_rating', 'details'])
df_pred['impossible'] = df_pred['details'].apply(lambda x: x['was_impossible'])
df_pred['pred_rating_round'] = df_pred['pred_rating'].round()
df_pred['abs_err'] = abs(df_pred['pred_rating'] - df_pred['actual_rating'])
df_pred.drop(['details'], axis=1, inplace=True)
df_pred.sample(5)

Unnamed: 0,user_id,isbn,actual_rating,pred_rating,impossible,pred_rating_round,abs_err
1703,92156,014023313X,7.0,7.464487,False,7.0,0.464487
3125,162639,375703055,7.476101,7.528049,False,8.0,0.051948
2492,165268,553268880,8.0,8.172991,False,8.0,0.172991
1131,266650,316969443,8.0,7.742775,False,8.0,0.257225
2153,110440,394820371,10.0,7.729287,False,8.0,2.270713


In [24]:
df_books = ratings_df.copy()
df_books.rename(columns = {'ISBN':'isbn' ,'Book-Title':'book_title'},inplace=True)
df_ext = df.merge(df_books[['isbn', 'book_title']], on='isbn', how='left')
df_ext = df_ext.merge(df_pred[['isbn', 'user_id', 'pred_rating']], on=['isbn', 'user_id'], how='left')

In [25]:
selected_user_id = 92156
df_user = df_ext[df_ext['user_id']==selected_user_id]

df_user[(df_user['pred_rating'].isna())&(df_user['book_rating']>=9)].sample(10)df_user[df_user['pred_rating'].notna()].sort_values('pred_rating', ascending=False).head(5)

Unnamed: 0.1,Unnamed: 0,Book,Author,Description,Genres,Year of Publication,Publisher_x,URL,Aggregated Emotions,Aggregated Des Emotions,...,Image-URL-M,Image-URL-L,user_id,book_rating,Sorted Buckets,Sorted Buckets desc,Total Buckets,Max Mood,book_title,pred_rating
268860,1820,hearts in atlantis,stephen king,five interconnected sequential narratives set ...,Fiction,2000,Pocket,https://www.goodreads.com/book/show/11602.Hear...,"[(' sad', 64), (' attached', 17), (' happy', 1...","[(' sad', 60), (' adequate', 30), (' fearful',...",...,http://images.amazon.com/images/P/0671024248.0...,http://images.amazon.com/images/P/0671024248.0...,92156,9.0,"OrderedDict([('Melancholic', 65), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Fearful', ...","OrderedDict([('Melancholic', 125), ('Fearful',...",Melancholic,hearts in atlantis,
268975,1820,hearts in atlantis,stephen king,five interconnected sequential narratives set ...,Fiction,2000,Pocket,https://www.goodreads.com/book/show/11602.Hear...,"[(' sad', 64), (' attached', 17), (' happy', 1...","[(' sad', 60), (' adequate', 30), (' fearful',...",...,http://images.amazon.com/images/P/0671024248.0...,http://images.amazon.com/images/P/0671024248.0...,92156,9.0,"OrderedDict([('Melancholic', 65), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Fearful', ...","OrderedDict([('Melancholic', 125), ('Fearful',...",Melancholic,hearts in atlantis,
268960,1820,hearts in atlantis,stephen king,five interconnected sequential narratives set ...,Fiction,2000,Pocket,https://www.goodreads.com/book/show/11602.Hear...,"[(' sad', 64), (' attached', 17), (' happy', 1...","[(' sad', 60), (' adequate', 30), (' fearful',...",...,http://images.amazon.com/images/P/0671024248.0...,http://images.amazon.com/images/P/0671024248.0...,92156,9.0,"OrderedDict([('Melancholic', 65), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Fearful', ...","OrderedDict([('Melancholic', 125), ('Fearful',...",Melancholic,hearts in atlantis,
268957,1820,hearts in atlantis,stephen king,five interconnected sequential narratives set ...,Fiction,2000,Pocket,https://www.goodreads.com/book/show/11602.Hear...,"[(' sad', 64), (' attached', 17), (' happy', 1...","[(' sad', 60), (' adequate', 30), (' fearful',...",...,http://images.amazon.com/images/P/0671024248.0...,http://images.amazon.com/images/P/0671024248.0...,92156,9.0,"OrderedDict([('Melancholic', 65), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Fearful', ...","OrderedDict([('Melancholic', 125), ('Fearful',...",Melancholic,hearts in atlantis,
268911,1820,hearts in atlantis,stephen king,five interconnected sequential narratives set ...,Fiction,2000,Pocket,https://www.goodreads.com/book/show/11602.Hear...,"[(' sad', 64), (' attached', 17), (' happy', 1...","[(' sad', 60), (' adequate', 30), (' fearful',...",...,http://images.amazon.com/images/P/0671024248.0...,http://images.amazon.com/images/P/0671024248.0...,92156,9.0,"OrderedDict([('Melancholic', 65), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Fearful', ...","OrderedDict([('Melancholic', 125), ('Fearful',...",Melancholic,hearts in atlantis,
268903,1820,hearts in atlantis,stephen king,five interconnected sequential narratives set ...,Fiction,2000,Pocket,https://www.goodreads.com/book/show/11602.Hear...,"[(' sad', 64), (' attached', 17), (' happy', 1...","[(' sad', 60), (' adequate', 30), (' fearful',...",...,http://images.amazon.com/images/P/0671024248.0...,http://images.amazon.com/images/P/0671024248.0...,92156,9.0,"OrderedDict([('Melancholic', 65), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Fearful', ...","OrderedDict([('Melancholic', 125), ('Fearful',...",Melancholic,hearts in atlantis,
268988,1820,hearts in atlantis,stephen king,five interconnected sequential narratives set ...,Fiction,2000,Pocket,https://www.goodreads.com/book/show/11602.Hear...,"[(' sad', 64), (' attached', 17), (' happy', 1...","[(' sad', 60), (' adequate', 30), (' fearful',...",...,http://images.amazon.com/images/P/0671024248.0...,http://images.amazon.com/images/P/0671024248.0...,92156,9.0,"OrderedDict([('Melancholic', 65), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Fearful', ...","OrderedDict([('Melancholic', 125), ('Fearful',...",Melancholic,hearts in atlantis,
269020,1820,hearts in atlantis,stephen king,five interconnected sequential narratives set ...,Fiction,2000,Pocket,https://www.goodreads.com/book/show/11602.Hear...,"[(' sad', 64), (' attached', 17), (' happy', 1...","[(' sad', 60), (' adequate', 30), (' fearful',...",...,http://images.amazon.com/images/P/0671024248.0...,http://images.amazon.com/images/P/0671024248.0...,92156,9.0,"OrderedDict([('Melancholic', 65), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Fearful', ...","OrderedDict([('Melancholic', 125), ('Fearful',...",Melancholic,hearts in atlantis,
268934,1820,hearts in atlantis,stephen king,five interconnected sequential narratives set ...,Fiction,2000,Pocket,https://www.goodreads.com/book/show/11602.Hear...,"[(' sad', 64), (' attached', 17), (' happy', 1...","[(' sad', 60), (' adequate', 30), (' fearful',...",...,http://images.amazon.com/images/P/0671024248.0...,http://images.amazon.com/images/P/0671024248.0...,92156,9.0,"OrderedDict([('Melancholic', 65), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Fearful', ...","OrderedDict([('Melancholic', 125), ('Fearful',...",Melancholic,hearts in atlantis,
269033,1820,hearts in atlantis,stephen king,five interconnected sequential narratives set ...,Fiction,2000,Pocket,https://www.goodreads.com/book/show/11602.Hear...,"[(' sad', 64), (' attached', 17), (' happy', 1...","[(' sad', 60), (' adequate', 30), (' fearful',...",...,http://images.amazon.com/images/P/0671024248.0...,http://images.amazon.com/images/P/0671024248.0...,92156,9.0,"OrderedDict([('Melancholic', 65), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Fearful', ...","OrderedDict([('Melancholic', 125), ('Fearful',...",Melancholic,hearts in atlantis,


In [26]:
df_user[df_user['pred_rating'].notna()].sort_values('pred_rating', ascending=False).head(5)

Unnamed: 0.1,Unnamed: 0,Book,Author,Description,Genres,Year of Publication,Publisher_x,URL,Aggregated Emotions,Aggregated Des Emotions,...,Image-URL-M,Image-URL-L,user_id,book_rating,Sorted Buckets,Sorted Buckets desc,Total Buckets,Max Mood,book_title,pred_rating
505097,4132,the stone diaries,carol shields,stone diaries one ordinary woman story journey...,Fiction,1995,Penguin Books,https://www.goodreads.com/book/show/77554.The_...,"[(' happy', 15), (' sad', 14), (' alone', 11),...","[(' surprise', 30), (' average', 30), (' attac...",...,http://images.amazon.com/images/P/014023313X.0...,http://images.amazon.com/images/P/014023313X.0...,92156,7.0,"OrderedDict([('Melancholic', 17), ('Joyful', 1...","OrderedDict([('Melancholic', 0), ('Joyful', 0)...","OrderedDict([('Melancholic', 17), ('Joyful', 1...",Melancholic,the stone diaries,7.464487
505211,4132,the stone diaries,carol shields,stone diaries one ordinary woman story journey...,Fiction,1995,Penguin Books,https://www.goodreads.com/book/show/77554.The_...,"[(' happy', 15), (' sad', 14), (' alone', 11),...","[(' surprise', 30), (' average', 30), (' attac...",...,http://images.amazon.com/images/P/014023313X.0...,http://images.amazon.com/images/P/014023313X.0...,92156,7.0,"OrderedDict([('Melancholic', 17), ('Joyful', 1...","OrderedDict([('Melancholic', 0), ('Joyful', 0)...","OrderedDict([('Melancholic', 17), ('Joyful', 1...",Melancholic,the stone diaries,7.464487
505213,4132,the stone diaries,carol shields,stone diaries one ordinary woman story journey...,Fiction,1995,Penguin Books,https://www.goodreads.com/book/show/77554.The_...,"[(' happy', 15), (' sad', 14), (' alone', 11),...","[(' surprise', 30), (' average', 30), (' attac...",...,http://images.amazon.com/images/P/014023313X.0...,http://images.amazon.com/images/P/014023313X.0...,92156,7.0,"OrderedDict([('Melancholic', 17), ('Joyful', 1...","OrderedDict([('Melancholic', 0), ('Joyful', 0)...","OrderedDict([('Melancholic', 17), ('Joyful', 1...",Melancholic,the stone diaries,7.464487
505214,4132,the stone diaries,carol shields,stone diaries one ordinary woman story journey...,Fiction,1995,Penguin Books,https://www.goodreads.com/book/show/77554.The_...,"[(' happy', 15), (' sad', 14), (' alone', 11),...","[(' surprise', 30), (' average', 30), (' attac...",...,http://images.amazon.com/images/P/014023313X.0...,http://images.amazon.com/images/P/014023313X.0...,92156,7.0,"OrderedDict([('Melancholic', 17), ('Joyful', 1...","OrderedDict([('Melancholic', 0), ('Joyful', 0)...","OrderedDict([('Melancholic', 17), ('Joyful', 1...",Melancholic,the stone diaries,7.464487
505215,4132,the stone diaries,carol shields,stone diaries one ordinary woman story journey...,Fiction,1995,Penguin Books,https://www.goodreads.com/book/show/77554.The_...,"[(' happy', 15), (' sad', 14), (' alone', 11),...","[(' surprise', 30), (' average', 30), (' attac...",...,http://images.amazon.com/images/P/014023313X.0...,http://images.amazon.com/images/P/014023313X.0...,92156,7.0,"OrderedDict([('Melancholic', 17), ('Joyful', 1...","OrderedDict([('Melancholic', 0), ('Joyful', 0)...","OrderedDict([('Melancholic', 17), ('Joyful', 1...",Melancholic,the stone diaries,7.464487


In [27]:
df_user[df_user['pred_rating'].notna()].sort_values('book_rating', ascending=False).head(5)

Unnamed: 0.1,Unnamed: 0,Book,Author,Description,Genres,Year of Publication,Publisher_x,URL,Aggregated Emotions,Aggregated Des Emotions,...,Image-URL-M,Image-URL-L,user_id,book_rating,Sorted Buckets,Sorted Buckets desc,Total Buckets,Max Mood,book_title,pred_rating
505097,4132,the stone diaries,carol shields,stone diaries one ordinary woman story journey...,Fiction,1995,Penguin Books,https://www.goodreads.com/book/show/77554.The_...,"[(' happy', 15), (' sad', 14), (' alone', 11),...","[(' surprise', 30), (' average', 30), (' attac...",...,http://images.amazon.com/images/P/014023313X.0...,http://images.amazon.com/images/P/014023313X.0...,92156,7.0,"OrderedDict([('Melancholic', 17), ('Joyful', 1...","OrderedDict([('Melancholic', 0), ('Joyful', 0)...","OrderedDict([('Melancholic', 17), ('Joyful', 1...",Melancholic,the stone diaries,7.464487
505211,4132,the stone diaries,carol shields,stone diaries one ordinary woman story journey...,Fiction,1995,Penguin Books,https://www.goodreads.com/book/show/77554.The_...,"[(' happy', 15), (' sad', 14), (' alone', 11),...","[(' surprise', 30), (' average', 30), (' attac...",...,http://images.amazon.com/images/P/014023313X.0...,http://images.amazon.com/images/P/014023313X.0...,92156,7.0,"OrderedDict([('Melancholic', 17), ('Joyful', 1...","OrderedDict([('Melancholic', 0), ('Joyful', 0)...","OrderedDict([('Melancholic', 17), ('Joyful', 1...",Melancholic,the stone diaries,7.464487
505213,4132,the stone diaries,carol shields,stone diaries one ordinary woman story journey...,Fiction,1995,Penguin Books,https://www.goodreads.com/book/show/77554.The_...,"[(' happy', 15), (' sad', 14), (' alone', 11),...","[(' surprise', 30), (' average', 30), (' attac...",...,http://images.amazon.com/images/P/014023313X.0...,http://images.amazon.com/images/P/014023313X.0...,92156,7.0,"OrderedDict([('Melancholic', 17), ('Joyful', 1...","OrderedDict([('Melancholic', 0), ('Joyful', 0)...","OrderedDict([('Melancholic', 17), ('Joyful', 1...",Melancholic,the stone diaries,7.464487
505214,4132,the stone diaries,carol shields,stone diaries one ordinary woman story journey...,Fiction,1995,Penguin Books,https://www.goodreads.com/book/show/77554.The_...,"[(' happy', 15), (' sad', 14), (' alone', 11),...","[(' surprise', 30), (' average', 30), (' attac...",...,http://images.amazon.com/images/P/014023313X.0...,http://images.amazon.com/images/P/014023313X.0...,92156,7.0,"OrderedDict([('Melancholic', 17), ('Joyful', 1...","OrderedDict([('Melancholic', 0), ('Joyful', 0)...","OrderedDict([('Melancholic', 17), ('Joyful', 1...",Melancholic,the stone diaries,7.464487
505215,4132,the stone diaries,carol shields,stone diaries one ordinary woman story journey...,Fiction,1995,Penguin Books,https://www.goodreads.com/book/show/77554.The_...,"[(' happy', 15), (' sad', 14), (' alone', 11),...","[(' surprise', 30), (' average', 30), (' attac...",...,http://images.amazon.com/images/P/014023313X.0...,http://images.amazon.com/images/P/014023313X.0...,92156,7.0,"OrderedDict([('Melancholic', 17), ('Joyful', 1...","OrderedDict([('Melancholic', 0), ('Joyful', 0)...","OrderedDict([('Melancholic', 17), ('Joyful', 1...",Melancholic,the stone diaries,7.464487


In [28]:
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation
from sklearn.metrics.pairwise import pairwise_distances
import ipywidgets as widgets
from IPython.display import display, clear_output
from contextlib import contextmanager
import numpy as np
import os, sys
import re
from scipy.sparse import csr_matrix

In [29]:
df_ratings_top.head()

Unnamed: 0.1,Unnamed: 0,Book,Author,Description,Genres,Year of Publication,Publisher_x,URL,Aggregated Emotions,Aggregated Des Emotions,...,Publisher_y,Image-URL-S,Image-URL-M,Image-URL-L,user_id,book_rating,Sorted Buckets,Sorted Buckets desc,Total Buckets,Max Mood
60,60,gerald's game,stephen king,game husband wife game gerald game time jessie...,Fiction,2001,Signet Book,https://www.goodreads.com/book/show/32692.Gera...,"[(' entitled', 14), (' fearful', 11), (' sad',...","[(' sad', 56), (' alone', 56), (' angry', 28)]",...,Signet Book,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,278418,8.001283,"OrderedDict([('Fearful', 15), ('Melancholic', ...","OrderedDict([('Melancholic', 56), ('Joyful', 0...","OrderedDict([('Melancholic', 69), ('Fearful', ...",Melancholic
61,61,gerald's game,stephen king,game husband wife game gerald game time jessie...,Fiction,2001,Signet Book,https://www.goodreads.com/book/show/32692.Gera...,"[(' entitled', 14), (' fearful', 11), (' sad',...","[(' sad', 56), (' alone', 56), (' angry', 28)]",...,Signet Book,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,4092,6.0,"OrderedDict([('Fearful', 15), ('Melancholic', ...","OrderedDict([('Melancholic', 56), ('Joyful', 0...","OrderedDict([('Melancholic', 69), ('Fearful', ...",Melancholic
62,62,gerald's game,stephen king,game husband wife game gerald game time jessie...,Fiction,2001,Signet Book,https://www.goodreads.com/book/show/32692.Gera...,"[(' entitled', 14), (' fearful', 11), (' sad',...","[(' sad', 56), (' alone', 56), (' angry', 28)]",...,Signet Book,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,6900,8.501283,"OrderedDict([('Fearful', 15), ('Melancholic', ...","OrderedDict([('Melancholic', 56), ('Joyful', 0...","OrderedDict([('Melancholic', 69), ('Fearful', ...",Melancholic
63,63,gerald's game,stephen king,game husband wife game gerald game time jessie...,Fiction,2001,Signet Book,https://www.goodreads.com/book/show/32692.Gera...,"[(' entitled', 14), (' fearful', 11), (' sad',...","[(' sad', 56), (' alone', 56), (' angry', 28)]",...,Signet Book,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,8936,8.501283,"OrderedDict([('Fearful', 15), ('Melancholic', ...","OrderedDict([('Melancholic', 56), ('Joyful', 0...","OrderedDict([('Melancholic', 69), ('Fearful', ...",Melancholic
66,66,gerald's game,stephen king,game husband wife game gerald game time jessie...,Fiction,2001,Signet Book,https://www.goodreads.com/book/show/32692.Gera...,"[(' entitled', 14), (' fearful', 11), (' sad',...","[(' sad', 56), (' alone', 56), (' angry', 28)]",...,Signet Book,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,13552,8.101283,"OrderedDict([('Fearful', 15), ('Melancholic', ...","OrderedDict([('Melancholic', 56), ('Joyful', 0...","OrderedDict([('Melancholic', 69), ('Fearful', ...",Melancholic


In [30]:
df_ratings_top.rename(columns={'user_id':'userID' ,'isbn':'ISBN','book_rating':'bookRating'},inplace=True)

In [31]:
df_ratings_top.head()

Unnamed: 0.1,Unnamed: 0,Book,Author,Description,Genres,Year of Publication,Publisher_x,URL,Aggregated Emotions,Aggregated Des Emotions,...,Publisher_y,Image-URL-S,Image-URL-M,Image-URL-L,userID,bookRating,Sorted Buckets,Sorted Buckets desc,Total Buckets,Max Mood
60,60,gerald's game,stephen king,game husband wife game gerald game time jessie...,Fiction,2001,Signet Book,https://www.goodreads.com/book/show/32692.Gera...,"[(' entitled', 14), (' fearful', 11), (' sad',...","[(' sad', 56), (' alone', 56), (' angry', 28)]",...,Signet Book,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,278418,8.001283,"OrderedDict([('Fearful', 15), ('Melancholic', ...","OrderedDict([('Melancholic', 56), ('Joyful', 0...","OrderedDict([('Melancholic', 69), ('Fearful', ...",Melancholic
61,61,gerald's game,stephen king,game husband wife game gerald game time jessie...,Fiction,2001,Signet Book,https://www.goodreads.com/book/show/32692.Gera...,"[(' entitled', 14), (' fearful', 11), (' sad',...","[(' sad', 56), (' alone', 56), (' angry', 28)]",...,Signet Book,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,4092,6.0,"OrderedDict([('Fearful', 15), ('Melancholic', ...","OrderedDict([('Melancholic', 56), ('Joyful', 0...","OrderedDict([('Melancholic', 69), ('Fearful', ...",Melancholic
62,62,gerald's game,stephen king,game husband wife game gerald game time jessie...,Fiction,2001,Signet Book,https://www.goodreads.com/book/show/32692.Gera...,"[(' entitled', 14), (' fearful', 11), (' sad',...","[(' sad', 56), (' alone', 56), (' angry', 28)]",...,Signet Book,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,6900,8.501283,"OrderedDict([('Fearful', 15), ('Melancholic', ...","OrderedDict([('Melancholic', 56), ('Joyful', 0...","OrderedDict([('Melancholic', 69), ('Fearful', ...",Melancholic
63,63,gerald's game,stephen king,game husband wife game gerald game time jessie...,Fiction,2001,Signet Book,https://www.goodreads.com/book/show/32692.Gera...,"[(' entitled', 14), (' fearful', 11), (' sad',...","[(' sad', 56), (' alone', 56), (' angry', 28)]",...,Signet Book,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,8936,8.501283,"OrderedDict([('Fearful', 15), ('Melancholic', ...","OrderedDict([('Melancholic', 56), ('Joyful', 0...","OrderedDict([('Melancholic', 69), ('Fearful', ...",Melancholic
66,66,gerald's game,stephen king,game husband wife game gerald game time jessie...,Fiction,2001,Signet Book,https://www.goodreads.com/book/show/32692.Gera...,"[(' entitled', 14), (' fearful', 11), (' sad',...","[(' sad', 56), (' alone', 56), (' angry', 28)]",...,Signet Book,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,13552,8.101283,"OrderedDict([('Fearful', 15), ('Melancholic', ...","OrderedDict([('Melancholic', 56), ('Joyful', 0...","OrderedDict([('Melancholic', 69), ('Fearful', ...",Melancholic


In [32]:
ratings_matrix = df_ratings_top.pivot(index='userID', columns='ISBN', values='bookRating')
userID = ratings_matrix.index
ISBN = ratings_matrix.columns
print(ratings_matrix.shape)
ratings_matrix.head()

(3272, 135)


ISBN,014023313X,015100692X,034536676X,038542471X,039592720X,042510107X,044022165X,044023722X,044651652X,051512608X,...,671617028,671743058,679439382,679745203,679746048,743412028,786868716,802139256,804106304,871136791
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
243,,,,,,,,7.0,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,8.996179,
383,,,,7.767442,,,,,,,...,,,,,,,,,,
388,,,,,,,,,,,...,,,,,,,,,,
507,,,,,,,,,,,...,,,,,,,,,,


In [33]:
n_users = ratings_matrix.shape[0] #considering only those users who gave explicit ratings
n_books = ratings_matrix.shape[1]
print (n_users, n_books)

3272 135


In [34]:
ratings_matrix.fillna(0, inplace = True)
ratings_matrix = ratings_matrix.astype(np.int32)

In [35]:
ratings_matrix.head(5)

ISBN,014023313X,015100692X,034536676X,038542471X,039592720X,042510107X,044022165X,044023722X,044651652X,051512608X,...,671617028,671743058,679439382,679745203,679746048,743412028,786868716,802139256,804106304,871136791
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
243,0,0,0,0,0,0,0,7,0,0,...,0,0,0,0,0,0,0,0,0,0
254,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8,0
383,0,0,0,7,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
388,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
507,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
sparsity = 1.0-len(ratings_df)/float(ratings_df.shape[0]*n_books)
print ('The sparsity level of Book Crossing dataset is ' +  str(sparsity*100) + ' %')

The sparsity level of Book Crossing dataset is 99.25925925925925 %


In [51]:

ratings_df.rename(columns={'User-ID':'userID','Book-Title':'bookTitle','Book-Rating':'bookRating'},inplace=True)
ratings_df.head()

Unnamed: 0.1,Unnamed: 0,Book,Author,Description,Genres,Year of Publication,Publisher_x,URL,Aggregated Emotions,Aggregated Des Emotions,...,Publisher_y,Image-URL-S,Image-URL-M,Image-URL-L,user_id,book_rating,Sorted Buckets,Sorted Buckets desc,Total Buckets,Max Mood
0,0,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,Vintage,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,4802,5.666667,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic
1,1,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,Vintage,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,23902,7.0,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic
2,2,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,Vintage,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,59305,7.0,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic
3,3,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,Vintage,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,59495,5.666667,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic
4,4,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,Vintage,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,128696,3.0,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic


In [54]:
ratings_df = ratings_df.dropna(axis = 0, subset = ['bookTitle'])
ratings_df.head()

Unnamed: 0.1,Unnamed: 0,Book,Author,Description,Genres,Year of Publication,Publisher_x,URL,Aggregated Emotions,Aggregated Des Emotions,...,Publisher_y,Image-URL-S,Image-URL-M,Image-URL-L,user_id,book_rating,Sorted Buckets,Sorted Buckets desc,Total Buckets,Max Mood
0,0,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,Vintage,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,4802,5.666667,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic
1,1,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,Vintage,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,23902,7.0,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic
2,2,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,Vintage,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,59305,7.0,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic
3,3,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,Vintage,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,59495,5.666667,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic
4,4,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,Vintage,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,128696,3.0,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic


In [56]:
book_ratingcount = (ratings_df.
                    groupby(by = ['bookTitle',])['book_rating'].
                    count().
                    reset_index().
                    rename(columns = {'book_rating':'TotalRatingCount'})
                    [['bookTitle','TotalRatingCount']])

In [57]:
book_ratingcount.head()

Unnamed: 0,bookTitle,TotalRatingCount
0,1421: the year china discovered america,10
1,1812,2
2,1984,192
3,2001: a space odyssey,24
4,2010: odyssey two,79


In [58]:
rating_with_totalratingcount = ratings_df.merge(book_ratingcount, left_on = 'bookTitle', right_on = 'bookTitle', how = 'inner' )

In [59]:
rating_with_totalratingcount.head()

Unnamed: 0.1,Unnamed: 0,Book,Author,Description,Genres,Year of Publication,Publisher_x,URL,Aggregated Emotions,Aggregated Des Emotions,...,Image-URL-S,Image-URL-M,Image-URL-L,user_id,book_rating,Sorted Buckets,Sorted Buckets desc,Total Buckets,Max Mood,TotalRatingCount
0,0,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,4802,5.666667,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic,9
1,1,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,23902,7.0,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic,9
2,2,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,59305,7.0,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic,9
3,3,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,59495,5.666667,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic,9
4,4,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,128696,3.0,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic,9


In [60]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(book_ratingcount['TotalRatingCount'].describe())

count   1453.000
mean      31.736
std       67.248
min        1.000
25%        2.000
50%        7.000
75%       28.000
max      647.000
Name: TotalRatingCount, dtype: float64


In [61]:
print(book_ratingcount['TotalRatingCount'].quantile(np.arange(.9,1,.01)))

0.900    85.000
0.910    92.000
0.920   101.000
0.930   109.360
0.940   123.760
0.950   146.800
0.960   169.920
0.970   200.280
0.980   249.880
0.990   387.440
Name: TotalRatingCount, dtype: float64


In [62]:
popularity_threshold = 50
rating_popular_book = rating_with_totalratingcount.query('TotalRatingCount >= @popularity_threshold')

In [63]:
rating_popular_book.head()

Unnamed: 0.1,Unnamed: 0,Book,Author,Description,Genres,Year of Publication,Publisher_x,URL,Aggregated Emotions,Aggregated Des Emotions,...,Image-URL-S,Image-URL-M,Image-URL-L,user_id,book_rating,Sorted Buckets,Sorted Buckets desc,Total Buckets,Max Mood,TotalRatingCount
60,60,gerald's game,stephen king,game husband wife game gerald game time jessie...,Fiction,2001,Signet Book,https://www.goodreads.com/book/show/32692.Gera...,"[(' entitled', 14), (' fearful', 11), (' sad',...","[(' sad', 56), (' alone', 56), (' angry', 28)]",...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,278418,8.001,"OrderedDict([('Fearful', 15), ('Melancholic', ...","OrderedDict([('Melancholic', 56), ('Joyful', 0...","OrderedDict([('Melancholic', 69), ('Fearful', ...",Melancholic,138
61,61,gerald's game,stephen king,game husband wife game gerald game time jessie...,Fiction,2001,Signet Book,https://www.goodreads.com/book/show/32692.Gera...,"[(' entitled', 14), (' fearful', 11), (' sad',...","[(' sad', 56), (' alone', 56), (' angry', 28)]",...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,4092,6.0,"OrderedDict([('Fearful', 15), ('Melancholic', ...","OrderedDict([('Melancholic', 56), ('Joyful', 0...","OrderedDict([('Melancholic', 69), ('Fearful', ...",Melancholic,138
62,62,gerald's game,stephen king,game husband wife game gerald game time jessie...,Fiction,2001,Signet Book,https://www.goodreads.com/book/show/32692.Gera...,"[(' entitled', 14), (' fearful', 11), (' sad',...","[(' sad', 56), (' alone', 56), (' angry', 28)]",...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,6900,8.501,"OrderedDict([('Fearful', 15), ('Melancholic', ...","OrderedDict([('Melancholic', 56), ('Joyful', 0...","OrderedDict([('Melancholic', 69), ('Fearful', ...",Melancholic,138
63,63,gerald's game,stephen king,game husband wife game gerald game time jessie...,Fiction,2001,Signet Book,https://www.goodreads.com/book/show/32692.Gera...,"[(' entitled', 14), (' fearful', 11), (' sad',...","[(' sad', 56), (' alone', 56), (' angry', 28)]",...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,8936,8.501,"OrderedDict([('Fearful', 15), ('Melancholic', ...","OrderedDict([('Melancholic', 56), ('Joyful', 0...","OrderedDict([('Melancholic', 69), ('Fearful', ...",Melancholic,138
64,64,gerald's game,stephen king,game husband wife game gerald game time jessie...,Fiction,2001,Signet Book,https://www.goodreads.com/book/show/32692.Gera...,"[(' entitled', 14), (' fearful', 11), (' sad',...","[(' sad', 56), (' alone', 56), (' angry', 28)]",...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,http://images.amazon.com/images/P/0451176464.0...,9502,8.0,"OrderedDict([('Fearful', 15), ('Melancholic', ...","OrderedDict([('Melancholic', 56), ('Joyful', 0...","OrderedDict([('Melancholic', 69), ('Fearful', ...",Melancholic,138


In [68]:
if not rating_popular_book[rating_popular_book.duplicated(['user_id', 'Book'])].empty:
    initial_rows = rating_popular_book.shape[0]

    print('Initial dataframe shape {0}'.format(rating_popular_book.shape))
    rating_popular_book = rating_popular_book.drop_duplicates(['user_id', 'Book'])
    current_rows = rating_popular_book.shape[0]
    print('New dataframe shape {0}'.format(rating_popular_book.shape))
    print('Removed {0} rows'.format(initial_rows - current_rows))

In [70]:
us_canada_user_rating_pivot = rating_popular_book.pivot(index = 'bookTitle',columns = 'user_id', values = 'book_rating').fillna(0)
us_canada_user_rating_matrix = csr_matrix(us_canada_user_rating_pivot.values)

In [71]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(us_canada_user_rating_matrix)

In [72]:
query_index = np.random.choice(us_canada_user_rating_pivot.shape[0])
distances, indices = model_knn.kneighbors(us_canada_user_rating_pivot.iloc[query_index, :].values.reshape((1, -1)), n_neighbors = 6)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(us_canada_user_rating_pivot.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, us_canada_user_rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for intensity:

1: false memory, with distance of 0.8707163554526742:
2: sleepers, with distance of 0.8744517100389062:
3: the pelican brief, with distance of 0.8794753369859801:
4: the tommyknockers, with distance of 0.879511961242895:
5: the runaway jury, with distance of 0.8905039741187155:


In [74]:
us_canada_user_rating_pivot2 = rating_popular_book.pivot(index = 'user_id', columns = 'bookTitle', values = 'book_rating').fillna(0)

In [75]:
us_canada_user_rating_pivot2.head()

bookTitle,1984,2010: odyssey two,a bend in the road,a confederacy of dunces,a fine balance,a heartbreaking work of staggering genius,a man in full,a night to remember,a painted house,a passage to india,...,total control,travels with charley: in search of america,walk two moons,we were the mulvaneys,west with the night,when the wind blows,where the red fern grows,wild swans: three daughters of china,windmills of the gods,year of wonders
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
X = us_canada_user_rating_pivot2.values.T
X.shape

(238, 13835)

In [78]:
import sklearn
from sklearn.decomposition import TruncatedSVD

SVD = TruncatedSVD(n_components=12, random_state=17)
matrix = SVD.fit_transform(X)
matrix.shape

(238, 12)

In [79]:
corr = np.corrcoef(matrix)
corr.shape

(238, 238)

In [81]:
us_canada_book_title = us_canada_user_rating_pivot2.columns
us_canada_book_list = list(us_canada_book_title)
coffey_hands = us_canada_book_list.index("a night to remember")

In [82]:
corr_coffey_hands  = corr[coffey_hands]

In [83]:
list(us_canada_book_title[(corr_coffey_hands<1.0) & (corr_coffey_hands>0.9)])

['2010: odyssey two',
 'a man in full',
 'a night to remember',
 'as the crow flies',
 'beach music',
 "boy's life",
 'caribbean',
 'carnal innocence',
 'christy',
 'different seasons',
 'hearts in atlantis',
 'intensity',
 'message in a bottle',
 'mystic river',
 'oldest living confederate widow tells all',
 'one',
 'paradise lost',
 'pay it forward',
 'pet sematary',
 'sarum: the novel of england',
 'skipping christmas',
 'sleepers',
 "smilla's sense of snow",
 'starship troopers',
 'strip tease',
 'tara road',
 'the bonfire of the vanities',
 'the brethren',
 'the chamber',
 'the client',
 'the fountainhead',
 'the glass lake',
 'the last don',
 'the loop',
 'the partner',
 'the prince of tides',
 'the third twin',
 'the winner',
 'total control',
 'when the wind blows',
 'where the red fern grows']

#Collaborative filetering with MF (users with atleast 3 interactions, user+rating)

In [4]:
import scipy
import math
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
import pandas as pd

ratings_df = pd.read_csv("/content/baseline_ratinsg.csv")
ratings_df.head()
ratings_df.rename(columns={'user_id':'User-ID','isbn':'ISBN','book_rating':'Book-Rating'},inplace=True)

In [249]:
zero_ratings_count = (ratings_df['Book-Rating'] == 0).sum()
print(zero_ratings_count)

0


In [6]:
users_interactions_count_df = ratings_df.groupby(['ISBN', 'User-ID']).size().groupby('User-ID').size()
print('# of users: %d' % len(users_interactions_count_df))

users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 3].reset_index()[['User-ID']]
print('# of users with at least 3 interactions: %d' % len(users_with_enough_interactions_df))

# of users: 16795
# of users with at least 3 interactions: 3451


In [7]:
print('# of interactions: %d' % len(ratings_df))
interactions_from_selected_users_df = ratings_df.merge(users_with_enough_interactions_df,
               how = 'right',
               left_on = 'User-ID',
               right_on = 'User-ID')
print('# of interactions from users with at least 3 interactions: %d' % len(interactions_from_selected_users_df))

# of interactions: 46112
# of interactions from users with at least 3 interactions: 30359


In [8]:
interactions_from_selected_users_df.head(10)

Unnamed: 0.1,Unnamed: 0,Book,Author,Description,Genres,Year of Publication,Publisher_x,URL,Aggregated Emotions,Aggregated Des Emotions,...,Publisher_y,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating,Sorted Buckets,Sorted Buckets desc,Total Buckets,Max Mood
0,5672,pet sematary,stephen king,alternate cover edition asin b00k3nee56 creeds...,Fiction,1994,Signet Book,https://www.goodreads.com/book/show/33124137-p...,"[(' happy', 25), (' sad', 18), (' attached', 1...","[(' loved', 30)]",...,Signet Book,http://images.amazon.com/images/P/0451162072.0...,http://images.amazon.com/images/P/0451162072.0...,http://images.amazon.com/images/P/0451162072.0...,243,7.853636,"OrderedDict([('Joyful', 25), ('Melancholic', 2...","OrderedDict([('Melancholic', 0), ('Joyful', 0)...","OrderedDict([('Joyful', 25), ('Melancholic', 2...",Joyful
1,7423,a confederacy of dunces,john kennedy toole,meet ignatius j reilly hero john kennedy toole...,Fiction,1995,Outlet Books Company,https://www.goodreads.com/book/show/310612.A_C...,"[(' happy', 30), (' sad', 17), (' attached', 1...","[(' sad', 60), (' powerless', 30), (' focused'...",...,Outlet Books Company,http://images.amazon.com/images/P/0517122707.0...,http://images.amazon.com/images/P/0517122707.0...,http://images.amazon.com/images/P/0517122707.0...,243,8.446136,"OrderedDict([('Joyful', 30), ('Melancholic', 2...","OrderedDict([('Melancholic', 90), ('Joyful', 0...","OrderedDict([('Melancholic', 112), ('Joyful', ...",Melancholic
2,7939,a painted house,john grisham,hill people mexicans arrived day wednesday ear...,Fiction,2001,Dell Publishing Company,https://www.goodreads.com/book/show/5360.A_Pai...,"[(' happy', 14), (' attached', 12), (' attract...","[(' happy', 30)]",...,Dell Publishing Company,http://images.amazon.com/images/P/044023722X.0...,http://images.amazon.com/images/P/044023722X.0...,http://images.amazon.com/images/P/044023722X.0...,243,7.0,"OrderedDict([('Joyful', 15), ('Melancholic', 6...","OrderedDict([('Joyful', 30), ('Melancholic', 0...","OrderedDict([('Joyful', 45), ('Melancholic', 6...",Joyful
3,10007,congo,michael crichton,deep african rain forest near legendary ruins ...,Fiction,1995,Ballantine Books,https://www.goodreads.com/book/show/7672.Congo,"[(' happy', 17), (' lost', 12), (' attached', ...","[(' sad', 58), (' lost', 58), (' derailed', 29)]",...,Ballantine Books,http://images.amazon.com/images/P/0345378490.0...,http://images.amazon.com/images/P/0345378490.0...,http://images.amazon.com/images/P/0345378490.0...,243,7.388738,"OrderedDict([('Joyful', 19), ('Melancholic', 9...","OrderedDict([('Melancholic', 58), ('Joyful', 0...","OrderedDict([('Melancholic', 67), ('Joyful', 1...",Melancholic
4,10559,the brethren,john grisham,call brethren three disgraced former judges ti...,Fiction,2000,Doubleday,https://www.goodreads.com/book/show/5354.The_B...,"[(' happy', 9), (' sad', 5), (' attracted', 4)...","[(' belittled', 30)]",...,Doubleday,http://images.amazon.com/images/P/0385497466.0...,http://images.amazon.com/images/P/0385497466.0...,http://images.amazon.com/images/P/0385497466.0...,243,7.379537,"OrderedDict([('Joyful', 10), ('Melancholic', 9...","OrderedDict([('Melancholic', 0), ('Joyful', 0)...","OrderedDict([('Joyful', 10), ('Melancholic', 9...",Joyful
5,14316,house of sand and fog,andre dubus iii,“ page turner beating heart ” boston globe rec...,Fiction,2000,Vintage Books,https://www.goodreads.com/book/show/38311414-h...,"[(' sad', 29), (' happy', 24), (' powerless', ...","[(' focused', 28), (' average', 28), (' angry'...",...,Vintage Books,http://images.amazon.com/images/P/0375727345.0...,http://images.amazon.com/images/P/0375727345.0...,http://images.amazon.com/images/P/0375727345.0...,243,7.461147,"OrderedDict([('Melancholic', 41), ('Joyful', 2...","OrderedDict([('Melancholic', 0), ('Joyful', 0)...","OrderedDict([('Melancholic', 41), ('Joyful', 2...",Melancholic
6,17117,memoirs of a geisha,arthur golden,literary sensation runaway bestseller brillian...,Fiction,1997,Alfred A. Knopf,https://www.goodreads.com/book/show/929.Memoir...,"[(' happy', 34), (' attached', 23), (' sad', 1...",[],...,Alfred A. Knopf,http://images.amazon.com/images/P/0375400117.0...,http://images.amazon.com/images/P/0375400117.0...,http://images.amazon.com/images/P/0375400117.0...,243,10.0,"OrderedDict([('Joyful', 36), ('Melancholic', 2...","OrderedDict([('Melancholic', 0), ('Joyful', 0)...","OrderedDict([('Joyful', 36), ('Melancholic', 2...",Joyful
7,20026,the god of small things,arundhati roy,year 1969 state kerala southernmost tip india ...,Fiction,1998,Perennial,https://www.goodreads.com/book/show/9777.The_G...,"[(' happy', 29), (' sad', 25), (' attached', 1...","[(' alone', 30), (' fearful', 30)]",...,Perennial,http://images.amazon.com/images/P/0060977493.0...,http://images.amazon.com/images/P/0060977493.0...,http://images.amazon.com/images/P/0060977493.0...,243,7.0,"OrderedDict([('Melancholic', 33), ('Joyful', 3...","OrderedDict([('Fearful', 30), ('Melancholic', ...","OrderedDict([('Fearful', 37), ('Melancholic', ...",Fearful
8,22512,the king of torts,john grisham,office public defender known training ground b...,Fiction,2003,Doubleday Books,https://www.goodreads.com/book/show/5356.The_K...,"[(' happy', 10), (' sad', 7), (' attached', 6)...","[(' happy', 30)]",...,Doubleday Books,http://images.amazon.com/images/P/0385508042.0...,http://images.amazon.com/images/P/0385508042.0...,http://images.amazon.com/images/P/0385508042.0...,243,7.549366,"OrderedDict([('Melancholic', 12), ('Joyful', 1...","OrderedDict([('Joyful', 30), ('Melancholic', 0...","OrderedDict([('Joyful', 40), ('Melancholic', 1...",Joyful
9,24259,deception point,dan brown,shocking scientific discovery conspiracy stagg...,Fiction,2002,Pocket,https://www.goodreads.com/book/show/976.Decept...,"[(' happy', 9), (' attracted', 8), (' attached...","[(' fearless', 30), (' attracted', 30)]",...,Pocket,http://images.amazon.com/images/P/0671027387.0...,http://images.amazon.com/images/P/0671027387.0...,http://images.amazon.com/images/P/0671027387.0...,243,7.735786,"OrderedDict([('Joyful', 10), ('Romantic', 8), ...","OrderedDict([('Motivational', 30), ('Romantic'...","OrderedDict([('Romantic', 38), ('Motivational'...",Romantic


In [9]:
import math
def smooth_user_preference(x):
    return math.log(1+x, 2)

interactions_full_df = interactions_from_selected_users_df.groupby(['ISBN', 'User-ID'])['Book-Rating'].sum().apply(smooth_user_preference).reset_index()
print('# of unique user/item interactions: %d' % len(interactions_full_df))
interactions_full_df.head()

# of unique user/item interactions: 30359


Unnamed: 0,ISBN,User-ID,Book-Rating
0,002037500X,21576,3.0
1,003061368X,16634,3.169925
2,006073941X,10118,3.459432
3,006073941X,124983,3.459432
4,006080971X,16488,3.387772


In [10]:
print(interactions_full_df['User-ID'].nunique())
print(interactions_full_df['User-ID'].value_counts())


3451
User-ID
11676     333
204864    141
35859     118
87555     114
60244     104
         ... 
108336      3
179826      3
91576       3
265498      3
209817      3
Name: count, Length: 3451, dtype: int64


#Split train test

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
interactions_train_df, interactions_test_df = train_test_split(interactions_full_df,
                                   stratify=interactions_full_df['User-ID'],
                                   test_size=0.30,
                                   random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 21251
# interactions on Test set: 9108


In [13]:
print(interactions_test_df.head())
print(interactions_train_df.shape)
print(interactions_test_df.shape)

             ISBN  User-ID  Book-Rating
21003   452282977   136139     3.268080
18159   449220605    35859     3.079816
17721   446802204   124079     2.719880
2012   044651652X   224249     3.000000
18620   451155750    59172     3.169925
(21251, 3)
(9108, 3)


In [14]:
# Count the number of zero ratings in the 'Book-Rating' column
zero_ratings_count = (interactions_train_df['Book-Rating'] == 0).sum()
print(f"Number of zero ratings in the training dataset: {zero_ratings_count}")


Number of zero ratings in the training dataset: 0


In [15]:
# Count the number of zero ratings in the 'Book-Rating' column
zero_ratings_count = (interactions_test_df['Book-Rating'] == 0).sum()
print(f"Number of zero ratings in the testing dataset: {zero_ratings_count}")

Number of zero ratings in the testing dataset: 0


#Sparse matrix for SVD

In [16]:
#Creating a sparse pivot table with users in rows and items in columns
users_items_pivot_matrix_df = interactions_train_df.pivot(index='User-ID',
                                                          columns='ISBN',
                                                          values='Book-Rating').fillna(0)

users_items_pivot_matrix_df.head()

ISBN,003061368X,006073941X,006080971X,006251279X,006440174X,014006110X,014006222X,014010870X,014011369X,014013168X,...,931933749,948984031,949206318,9501290670,958578346,963270702,965404560,99428644,99521016,99743914
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
users_items_pivot_matrix_df.shape

(3451, 1266)

In [18]:
users_items_pivot_matrix = users_items_pivot_matrix_df.values
users_items_pivot_matrix[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [19]:
users_ids = list(users_items_pivot_matrix_df.index)
users_ids[:10]

[243, 254, 383, 388, 507, 638, 651, 741, 805, 882]

#SVD

In [25]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import numpy as np

In [21]:
print(users_items_pivot_matrix.shape)  # This will print the shape of the matrix


(3451, 1266)


In [26]:
# The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 15

#Performs matrix factorization of the original user item matrix
U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)

In [27]:
U.shape

(3451, 15)

In [28]:
sigma = np.diag(sigma)
sigma.shape

(15, 15)

In [29]:
Vt.shape

(15, 1266)

In [34]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)
all_user_predicted_ratings

array([[-0.00827591, -0.0002301 ,  0.0072391 , ..., -0.00162238,
         0.03844796, -0.00697156],
       [ 0.0030312 , -0.00843547, -0.01419587, ..., -0.00326226,
         0.01594504, -0.02234935],
       [ 0.00080626,  0.00022719,  0.00284177, ..., -0.00027999,
         0.00107742, -0.00166307],
       ...,
       [ 0.03076449, -0.0045419 ,  0.07446648, ..., -0.00400765,
         0.04466243, -0.00974228],
       [ 0.00521664,  0.00021238,  0.00053498, ...,  0.0013757 ,
        -0.00776427,  0.00505238],
       [ 0.00010629, -0.00067591,  0.00170903, ...,  0.0005962 ,
         0.00369589, -0.00146195]])

In [52]:
all_user_predicted_ratings.shape
print(all_user_predicted_ratings.max())
print(all_user_predicted_ratings.min())

6.919881089399194
-4.727608561541226


#Normalizing predicted ratings

In [54]:
min_val = np.min(all_user_predicted_ratings)
max_val = np.max(all_user_predicted_ratings)

# Normalize to a range of 0 to 10
new_min = 0
new_max = 10
all_user_predicted_ratings = ((all_user_predicted_ratings - min_val) / (max_val - min_val)) * (new_max - new_min) + new_min

In [56]:
print(all_user_predicted_ratings)
print(all_user_predicted_ratings.max())
print(all_user_predicted_ratings.min())
users_items_pivot_matrix_df.columns

[[4.0518024  4.05871016 4.06512288 ... 4.05751482 4.09191737 4.05292226]
 [4.06151017 4.05166541 4.04671979 ... 4.05610689 4.0725974  4.03971959]
 [4.05959994 4.05910277 4.06134753 ... 4.05866733 4.05983274 4.05747988]
 ...
 [4.08532069 4.05500825 4.12284122 ... 4.05546693 4.09725283 4.05054344]
 [4.06338648 4.05909006 4.05936703 ... 4.06008883 4.05224167 4.06324546]
 [4.05899897 4.05832741 4.06037501 ... 4.05941959 4.06208084 4.05765255]]
10.0
0.0


Index(['003061368X', '006073941X', '006080971X', '006251279X', '006440174X',
       '014006110X', '014006222X', '014010870X', '014011369X', '014013168X',
       ...
       '931933749', '948984031', '949206318', '9501290670', '958578346',
       '963270702', '965404560', '99428644', '99521016', '99743914'],
      dtype='object', name='ISBN', length=1266)

In [57]:
#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_user_predicted_ratings, columns = users_items_pivot_matrix_df.columns, index=users_ids).transpose()
cf_preds_df.head()

Unnamed: 0_level_0,243,254,383,388,507,638,651,741,805,882,...,277965,278137,278144,278188,278194,278257,278390,278418,278633,278843
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
003061368X,4.051802,4.06151,4.0596,4.058849,4.057195,4.061054,4.058842,4.060817,4.055029,4.063345,...,4.065483,4.058169,4.058419,4.059853,4.058536,4.060333,4.05932,4.085321,4.063386,4.058999
006073941X,4.05871,4.051665,4.059103,4.05799,4.069015,4.061299,4.058863,4.05835,4.056758,4.058265,...,4.056618,4.060324,4.060475,4.060625,4.058053,4.059509,4.057781,4.055008,4.05909,4.058327
006080971X,4.065123,4.04672,4.061348,4.058764,4.058259,4.060645,4.058079,4.063468,4.083373,4.062793,...,4.100454,4.069422,4.046163,4.061912,4.05795,4.06495,4.059796,4.122841,4.059367,4.060375
006251279X,4.036258,4.106193,4.07081,4.0677,4.042033,4.065464,4.06729,4.04017,4.071095,4.056928,...,4.05156,4.049619,4.074076,4.065671,4.077317,4.068087,4.059012,4.102967,4.071583,4.074407
006440174X,4.052576,4.063669,4.057962,4.065613,4.041128,4.078309,4.058658,4.053909,4.075135,4.103454,...,4.118961,4.067452,4.055912,4.057374,4.066933,4.058401,4.070213,4.052183,4.097688,4.060199


In [58]:
len(cf_preds_df.columns)

3451

In [59]:
global ratings_df
ratings_df.head()

Unnamed: 0.1,Unnamed: 0,Book,Author,Description,Genres,Year of Publication,Publisher_x,URL,Aggregated Emotions,Aggregated Des Emotions,...,Publisher_y,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating,Sorted Buckets,Sorted Buckets desc,Total Buckets,Max Mood
0,0,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,Vintage,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,4802,5.666667,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic
1,1,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,Vintage,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,23902,7.0,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic
2,2,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,Vintage,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,59305,7.0,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic
3,3,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,Vintage,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,59495,5.666667,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic
4,4,my idea of fun,will self,self established one brilliant daring inventiv...,Fiction,1995,Vintage,https://www.goodreads.com/book/show/119116.My_...,"[(' happy', 9), (' sad', 7), (' fearful', 6), ...","[(' average', 60), (' sad', 60), (' fearless',...",...,Vintage,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,http://images.amazon.com/images/P/0679750932.0...,128696,3.0,"OrderedDict([('Melancholic', 10), ('Joyful', 1...","OrderedDict([('Melancholic', 60), ('Motivation...","OrderedDict([('Melancholic', 70), ('Motivation...",Melancholic


In [41]:
books = pd.read_csv("/content/Books.csv")

  books = pd.read_csv("/content/Books.csv")


#Collaborative Filtering

In [60]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Assuming your CFRecommender class has a method recommend_items that can return ratings predictions
# First, extend your CFRecommender class to include a method to predict ratings for a given user and item

class CFRecommender:
    MODEL_NAME = 'Collaborative Filtering'

    def __init__(self, cf_predictions_df):
        self.cf_predictions_df = cf_predictions_df

    def get_model_name(self):
        return self.MODEL_NAME

    def recommend_items(self, user_id, items_to_ignore=[], topn=10):
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False).reset_index().rename(columns={user_id: 'recStrength'})
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['ISBN'].isin(items_to_ignore)].sort_values('recStrength', ascending=False).head(topn)
        return recommendations_df

    def predict_rating(self, user_id, item_id):
        if user_id in self.cf_predictions_df.columns and item_id in self.cf_predictions_df.index:
            return self.cf_predictions_df.loc[item_id, user_id]
        else:
            return np.nan  # Return NaN for user/item combinations not in the matrix


cf_recommender_model = CFRecommender(cf_preds_df)

# Now, predict ratings for all user-item pairs in the test set
test_users = interactions_test_df['User-ID']
test_items = interactions_test_df['ISBN']
predicted_ratings = [cf_recommender_model.predict_rating(user, item) for user, item in zip(test_users, test_items)]

# Add these predictions back to the test dataframe
interactions_test_df['predicted_rating'] = predicted_ratings

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(interactions_test_df['Book-Rating'], interactions_test_df['predicted_rating'].fillna(0)))
print(f"RMSE: {rmse}")

RMSE: 1.1607200574942647


In [61]:
#Indexing by personId to speed up the searches during evaluation
interactions_full_indexed_df = interactions_full_df.set_index('User-ID')
interactions_train_indexed_df = interactions_train_df.set_index('User-ID')
interactions_test_indexed_df = interactions_test_df.set_index('User-ID')

In [62]:
def get_items_interacted(UserID, interactions_df):
    interacted_items = interactions_df.loc[UserID]['ISBN']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [63]:
class ModelRecommender:

    # Function for getting the set of items which a user has not interacted with
    def get_not_interacted_items_sample(self, UserID, sample_size, seed=42):
        interacted_items = get_items_interacted(UserID, interactions_full_indexed_df)
        all_items = set(ratings_df['ISBN'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    # Function to verify whether a particular item_id was present in the set of top N recommended items
    def _verify_hit_top_n(self, item_id, recommended_items, topn):
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    # Function to evaluate the performance of model for each user
    def evaluate_model_for_user(self, model, person_id):

        # Getting the items in test set
        interacted_values_testset = interactions_test_indexed_df.loc[person_id]

        if type(interacted_values_testset['ISBN']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['ISBN'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['ISBN'])])

        interacted_items_count_testset = len(person_interacted_items_testset)

        # Getting a ranked recommendation list from the model for a given user
        person_recs_df = model.recommend_items(person_id, items_to_ignore=get_items_interacted(person_id, interactions_train_indexed_df),topn=10000000000)
        print('Recommendation for User-ID = ',person_id)
        print(person_recs_df.head(10))

        # Function to evaluate the performance of model at overall level
    def recommend_book(self, model ,userid):

        person_metrics = self.evaluate_model_for_user(model, userid)
        return

model_recommender = ModelRecommender()

In [64]:
print(list(interactions_full_indexed_df.index.values))

[21576, 16634, 10118, 124983, 16488, 35859, 52584, 115120, 148744, 179978, 218121, 2030, 8529, 11601, 11676, 23124, 43246, 55187, 81207, 87974, 96054, 112765, 115003, 186762, 190831, 216099, 61427, 112083, 142093, 166596, 230522, 247751, 269566, 278144, 16106, 36003, 37712, 43323, 107301, 186848, 271622, 11676, 60277, 217106, 230522, 5555, 14422, 15408, 26544, 36836, 66942, 87555, 88693, 107301, 127359, 147141, 175003, 175117, 234765, 242646, 30716, 36836, 95359, 98787, 110934, 113519, 133771, 138578, 156150, 161936, 163804, 193458, 200226, 203968, 204864, 222296, 226545, 251422, 70594, 76626, 88693, 92861, 96608, 114446, 124079, 133747, 147451, 180957, 186848, 218411, 274004, 60797, 136205, 143415, 162155, 170229, 172742, 199178, 236757, 6251, 11601, 21014, 30531, 55006, 62891, 62958, 68447, 80538, 87555, 105937, 109901, 120548, 131510, 154730, 161041, 164581, 174304, 189835, 195469, 209160, 217106, 221948, 225989, 245963, 247447, 264317, 266226, 269835, 3363, 78834, 138441, 174304, 2

In [65]:
user=int(input("Enter User ID from above list for book recommendation  "))
model_recommender.recommend_book(cf_recommender_model,user)

Enter User ID from above list for book recommendation  21576
Recommendation for User-ID =  21576
          ISBN  recStrength
1    446310786     4.453641
2    446608955     4.449921
3    440235596     4.401799
4     60987561     4.385292
5   044651652X     4.383128
6    385721420     4.366012
8    316969443     4.329191
9    804106304     4.310872
10   385505833     4.287423
11   375727345     4.285632


#Evaluation

In [67]:
import random
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:

    # Function for getting the set of items which a user has not interacted with
    def get_not_interacted_items_sample(self, UserID, sample_size, seed=42):
        interacted_items = get_items_interacted(UserID, interactions_full_indexed_df)
        all_items = set(ratings_df['ISBN'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    # Function to verify whether a particular item_id was present in the set of top N recommended items
    def _verify_hit_top_n(self, item_id, recommended_items, topn):
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    # Function to evaluate the performance of model for each user
    def evaluate_model_for_user(self, model, person_id):

        # Getting the items in test set
        interacted_values_testset = interactions_test_indexed_df.loc[person_id]

        if type(interacted_values_testset['ISBN']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['ISBN'])
        else:
            person_interacted_items_testset = set([str(interacted_values_testset['ISBN'])])

        interacted_items_count_testset = len(person_interacted_items_testset)

        # Getting a ranked recommendation list from the model for a given user
        person_recs_df = model.recommend_items(person_id, items_to_ignore=get_items_interacted(person_id, interactions_train_indexed_df),topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0

        # For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:

            # Getting a random sample of 100 items the user has not interacted with
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, seed=item_id)    #%(2**32))

            # Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            # Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['ISBN'].isin(items_to_filter_recs)]
            valid_recs = valid_recs_df['ISBN'].values

            # Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        # Recall is the rate of the interacted items that are ranked among the Top-N recommended items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count,
                          'hits@10_count':hits_at_10_count,
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics


    # Function to evaluate the performance of model at overall level
    def evaluate_model(self, model):

        people_metrics = []

        for idx, person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            person_metrics = self.evaluate_model_for_user(model, person_id)
            person_metrics['User-ID'] = person_id
            people_metrics.append(person_metrics)

        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics).sort_values('interacted_count', ascending=False)

        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())

        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}
        return global_metrics, detailed_results_df

model_evaluator = ModelEvaluator()

In [None]:
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)

print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

In [189]:
ratings_df.columns

Index(['Unnamed: 0', 'Book', 'Author', 'Description', 'Genres',
       'Year of Publication', 'Publisher_x', 'URL', 'Aggregated Emotions',
       'Aggregated Des Emotions', 'ISBN', 'bookTitle', 'Publisher_y',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L', 'User-ID', 'Book-Rating',
       'Sorted Buckets', 'Sorted Buckets desc', 'Total Buckets', 'Max Mood'],
      dtype='object')

#NDCG

In [68]:
import numpy as np

def dcg_at_k(r, k, method=1):
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
    return 0.

def ndcg_at_k(r, k, method=1):
    """Calculate nDCG at rank k"""
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max


In [69]:
def evaluate_model(model, data, k=10):
    ndcg_values = []
    for user_id in data['User-ID'].unique():
        # True data
        true_ratings = data[data['User-ID'] == user_id]
        true_ratings = true_ratings.set_index('ISBN')['Book-Rating']

        # Predicted ratings
        predicted_ratings = model.cf_predictions_df.get(user_id)
        if predicted_ratings is None:
            continue

        # Filter to only books the user has rated
        predicted_ratings = predicted_ratings[predicted_ratings.index.isin(true_ratings.index)]

        # Sort by predicted rating
        predicted_ratings = predicted_ratings.sort_values(ascending=False)

        # Get the actual ratings in the order of predicted ratings
        sorted_actual_ratings = true_ratings.loc[predicted_ratings.index].tolist()

        # Calculate nDCG
        ndcg_value = ndcg_at_k(sorted_actual_ratings, k)
        ndcg_values.append(ndcg_value)

    # Average nDCG over all users
    average_ndcg = np.mean(ndcg_values)
    print(f"Average nDCG@{k}: {average_ndcg}")

# Sample usage:
evaluate_model(cf_recommender_model, interactions_test_df, k=10)


Average nDCG@10: 0.9865716363253025
