In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
Final_Dataset2 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Book Recommendation System/Final_Dataset2.csv')
Final_Dataset2 = Final_Dataset2[['User-ID','Age','Country',	'ISBN',	'Book-Rating'	,'Avg_Rating',	'Total_No_Of_Users_Rated',	'Book-Title',	'Book-Author','Year-Of-Publication',	'Publisher']]
Final_Dataset2.head()

Unnamed: 0,User-ID,Age,Country,ISBN,Book-Rating,Avg_Rating,Total_No_Of_Users_Rated,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,8,33.0,canada,2005018,5,7.666667,9,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada
1,11676,28.0,,2005018,8,7.666667,9,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada
2,67544,30.0,canada,2005018,8,7.666667,9,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada
3,116866,32.0,other,2005018,9,7.666667,9,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada
4,123629,33.0,canada,2005018,9,7.666667,9,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada


# **Memory-Based Collaborative Filtering**

This approach uses the memory of previous users interactions to compute users similarities based on items they've interacted (user-based approach) or compute items similarities based on the users that have interacted with them (item-based approach).

# Train-Test Split

In [None]:
from sklearn import model_selection
train_data, test_data = model_selection.train_test_split(Final_Dataset2, test_size=0.20)

In [None]:
print(f'Training set lengths: {len(train_data)}')
print(f'Testing set lengths: {len(test_data)}')
print(f'Test set is {(len(test_data)/(len(train_data)+len(test_data))*100):.0f}% of the full dataset.')

Training set lengths: 307073
Testing set lengths: 76769
Test set is 20% of the full dataset.


In [None]:
# Get int mapping for user_id in train dataset
u_unique_train = train_data['User-ID'].unique()
train_data_user2idx = {o:i for i, o in enumerate(u_unique_train)}

# Get int mapping for isbn in train dataset
i_unique_train = train_data.ISBN.unique()
train_data_book2idx = {o:i for i, o in enumerate(i_unique_train)}

In [None]:
# Get int mapping for user_id in test dataset
u_unique_test = test_data['User-ID'].unique()
test_data_user2idx = {o:i for i, o in enumerate(u_unique_test)}

# Get int mapping for isbn in test dataset
i_unique_test = test_data.ISBN.unique()
test_data_book2idx = {o:i for i, o in enumerate(i_unique_test)}

In [None]:
# training set
train_data['u_unique'] = train_data['User-ID'].map(train_data_user2idx)
train_data['i_unique'] = train_data['ISBN'].map(train_data_book2idx)

# testing set
test_data['u_unique'] = test_data['User-ID'].map(test_data_user2idx)
test_data['i_unique'] = test_data['ISBN'].map(test_data_book2idx)

# Convert back to three feature of dataframe
train_data = train_data[['u_unique', 'i_unique', 'Book-Rating']]
test_data = test_data[['u_unique', 'i_unique', 'Book-Rating']]

In [None]:
train_data.sample(5)

Unnamed: 0,u_unique,i_unique,Book-Rating
237189,20650,127420,9
121099,5017,3560,9
140750,5931,8878,6
83271,2170,2068,8
295256,5451,49007,10


# User-Item for Train Data

In [None]:
# first I'll create an empty matrix of users books and then I'll add the appropriate values to the matrix by extracting them from the dataset
n_users = train_data['u_unique'].nunique()
n_books = train_data['i_unique'].nunique()

train_matrix = np.zeros((n_users, n_books))

for entry in train_data.itertuples():
    train_matrix[entry[1]-1, entry[2]-1] = entry[3]    # entry[1] is the user-id, entry[2] is the book-isbn and -1 is to counter 0-based indexing

In [None]:
train_matrix.shape

(60021, 128845)

# User-Item for Test Data

In [None]:
n_users = test_data['u_unique'].nunique()
n_books = test_data['i_unique'].nunique()

test_matrix = np.zeros((n_users, n_books))

for entry in test_data.itertuples():
    test_matrix[entry[1]-1, entry[2]-1] = entry[3]

In [None]:
test_matrix.shape

(25849, 46812)

# **Cosine Similarity Based Recommendation System**

A distance metric commonly used in recommender systems is cosine similarity, where the ratings are seen as vectors in n-dimensional space and the similarity is calculated based on the angle between these vectors.

In [None]:
# To make item-item similarity we need to take the transpose of the matrix
train_matrix_small = train_matrix[:5000, :5000]
test_matrix_small = test_matrix[:5000, :5000]

from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_matrix_small, metric='cosine')
item_similarity = pairwise_distances(train_matrix_small.T, metric='cosine')

In [None]:
# function to predict the similarity :
def predict_books(ratings, similarity, type='user'): # default type is 'user'
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)

        # Use np.newaxis so that mean_user_rating has the same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [None]:
item_prediction = predict_books(train_matrix_small, item_similarity, type='item')
user_prediction = predict_books(train_matrix_small, user_similarity, type='user')

# **Evaluation Metric**

In [None]:
# Evaluation metric by mean squared error
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(prediction, test_matrix):
    prediction = prediction[test_matrix.nonzero()].flatten()
    test_matrix = test_matrix[test_matrix.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, test_matrix))

print(f'Item-based CF RMSE: {rmse(item_prediction, test_matrix_small)}')
print(f'User-based CF RMSE: {rmse(user_prediction, test_matrix_small)}')

Item-based CF RMSE: 7.960372815566727
User-based CF RMSE: 7.959470472772056


By cosine similarity in recommendation system it gives 7.94 RMSE score.

We can make improvement in this score by using another method. Let's use Single Value Decomposition model (SVD) model to implement.

# **Let's go through Model based approach by SVD model.**


In [None]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163004 sha256=70340c348151da6bbff0fad426bf5de7955dedd13f2264bcf514ea72092a43e4
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [None]:
from surprise import Reader, Dataset

In [None]:
ratings_explicit = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Book Recommendation System/Ratings_explicit.csv')
ratings_explicit = ratings_explicit[['User-ID','ISBN','Book-Rating']]
ratings_explicit.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276726,0155061224,5
1,276729,052165615X,3
2,276729,0521795028,6
3,276744,038550120X,7
4,276747,0060517794,9


In [None]:
# Creating a 'Reader' object to set the limit of the ratings
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(ratings_explicit, reader)

In [None]:
from surprise import SVD, model_selection, accuracy
model = SVD()

# Train on books dataset
%time model_selection.cross_validate(model, data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.6323  1.6353  1.6384  1.6324  1.6397  1.6356  0.0030  
Fit time          19.68   10.82   9.84    10.73   9.34    12.08   3.84    
Test time         1.22    0.94    0.99    0.86    1.24    1.05    0.15    
CPU times: user 1min 5s, sys: 666 ms, total: 1min 5s
Wall time: 1min 16s


{'test_rmse': array([1.63227935, 1.63533398, 1.63839945, 1.6323909 , 1.63970983]),
 'fit_time': (19.678489208221436,
  10.822802782058716,
  9.84268569946289,
  10.732442140579224,
  9.33881163597107),
 'test_time': (1.219881534576416,
  0.9354007244110107,
  0.9850866794586182,
  0.8640668392181396,
  1.2424027919769287)}

# Train - Test Split

In [None]:
# train and test split
trainset, testset = model_selection.train_test_split(data, test_size=0.2)

# SVD model
model = SVD()
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x783a1098e200>

# Evaluation metrics for SVD model


In [None]:
# displaying RMSE score
predictions = model.test(testset)
print(f"The accuracy is {accuracy.rmse(predictions)}")


RMSE: 1.6409
The accuracy is 1.6408999079829831


# Testing Result

In [None]:
# to test result let's take an user-id and item-id to test our model.
uid = 276744
iid = '038550120X'
pred = model.predict(uid, iid, verbose=True)

user: 276744     item: 038550120X r_ui = None   est = 7.06   {'was_impossible': False}


Now let's display estimated rating and real rating



In [None]:
# display estimated rating and real rating
print(f'The estimated rating for the book with ISBN code {pred.iid} from user #{pred.uid} is {pred.est:.2f}.\n')
actual_rtg= ratings_explicit[(ratings_explicit['User-ID']==pred.uid) &
                             (ratings_explicit.ISBN==pred.iid)]['Book-Rating'].values[0]
print(f'The real rating given for this was {actual_rtg:.2f}.')

The estimated rating for the book with ISBN code 038550120X from user #276744 is 7.06.

The real rating given for this was 7.00.


In [None]:
# The following function was adapted from the surprise docs
# and can be used to get the top book recommendations for each user.
from collections import defaultdict

def get_top_n(predictions, n=10):

    # First map the predictions to each user
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [None]:
# prediciton on teset and getting top predictions
pred = model.test(testset)
top_n = get_top_n(pred)

In [None]:
pred_testset_df = pd.DataFrame(pred)
pred_testset_df


Unnamed: 0,uid,iid,r_ui,est,details
0,202358,0312282354,8.0,7.603149,{'was_impossible': False}
1,76576,0553263218,2.0,7.327080,{'was_impossible': False}
2,60583,0142001740,7.0,7.935574,{'was_impossible': False}
3,57756,0553125281,10.0,7.628391,{'was_impossible': False}
4,201353,0590483404,6.0,6.473424,{'was_impossible': False}
...,...,...,...,...,...
76764,130474,0803282109,10.0,8.925286,{'was_impossible': False}
76765,57595,044024126X,8.0,7.525645,{'was_impossible': False}
76766,55144,0969885520,2.0,7.893176,{'was_impossible': False}
76767,32516,0440211727,8.0,8.812160,{'was_impossible': False}


We can view predictions from testset that we performed with SVD model.

## Retrieving full book titles from full 'books_users_ratings' dataframe


In [None]:
def get_reading_list(userid):

    reading_list = defaultdict(list)
    top_n = get_top_n(pred, n=10)
    for n in top_n[userid]:
        book, rating = n
        title = Final_Dataset2.loc[Final_Dataset2.ISBN==book]['Book-Rating'].unique()[0]
        reading_list[title] = rating
    return reading_list

In [None]:
#take a random look at user_id
example_reading_list = get_reading_list(userid=116866)
for book, rating in example_reading_list.items():
    print(f'{book}: {rating}')

10: 7.7556075441176375
9: 7.7556075441176375
8: 7.7556075441176375
7: 7.7556075441176375


As we can observe above we've got top recommendation of books and ratings with respective to it. So this was my recommendation system for BOOK RECOMMENDATION SYSTEM.
