In [9]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from textblob.tokenizers import word_tokenize

# Load books-data.csv file

In [10]:
dataset= pd.read_csv('books-data.csv')
dataset

Unnamed: 0,book_id,title,user_id,rating,review
0,1,Harry Potter and the Half-Blood Prince,314,5,Perfect for a potterhead
1,1,Harry Potter and the Half-Blood Prince,439,3,Watching movie will suck after reading this
2,1,Harry Potter and the Half-Blood Prince,588,5,Great book
3,1,Harry Potter and the Half-Blood Prince,1169,4,Breathtaking story! Heartbreaking end!
4,1,Harry Potter and the Half-Blood Prince,1185,4,Awesome book.
...,...,...,...,...,...
5395,54,Bound by Honor,50342,4,very good
5396,54,Bound by Honor,51480,1,bad
5397,54,Bound by Honor,51762,5,interesting
5398,54,Bound by Honor,52036,3,can do better


# finding sentiment/polarity of reviews 
### making new column in dataset of polarity

In [23]:
stop_words = set(stopwords.words('english'))
def find_sentiment(review):
    global stop_words

    blob= TextBlob(review)
    words= blob.words
    sentence= [word for word in words if word not in stop_words]
    sent= " ".join(sentence)   
    blob2= TextBlob(sent)
    
    return blob2.sentiment.polarity

dataset['sentiment']= dataset['review'].apply(find_sentiment)
dataset

Unnamed: 0,book_id,title,user_id,rating,review,sentiment
0,1,Harry Potter and the Half-Blood Prince,314,5,Perfect for a potterhead,1.0
1,1,Harry Potter and the Half-Blood Prince,439,3,Watching movie will suck after reading this,0.0
2,1,Harry Potter and the Half-Blood Prince,588,5,Great book,0.8
3,1,Harry Potter and the Half-Blood Prince,1169,4,Breathtaking story! Heartbreaking end!,1.0
4,1,Harry Potter and the Half-Blood Prince,1185,4,Awesome book.,1.0
...,...,...,...,...,...,...
5395,54,Bound by Honor,50342,4,very good,0.7
5396,54,Bound by Honor,51480,1,bad,-0.7
5397,54,Bound by Honor,51762,5,interesting,0.5
5398,54,Bound by Honor,52036,3,can do better,0.5


# combining rating and sentiment
### creating new column     score = rating * sentiment

In [26]:
dataset['score']= dataset['rating']*dataset['sentiment']
dataset

Unnamed: 0,book_id,title,user_id,rating,review,sentiment,score
0,1,Harry Potter and the Half-Blood Prince,314,5,Perfect for a potterhead,1.0,5.0
1,1,Harry Potter and the Half-Blood Prince,439,3,Watching movie will suck after reading this,0.0,0.0
2,1,Harry Potter and the Half-Blood Prince,588,5,Great book,0.8,4.0
3,1,Harry Potter and the Half-Blood Prince,1169,4,Breathtaking story! Heartbreaking end!,1.0,4.0
4,1,Harry Potter and the Half-Blood Prince,1185,4,Awesome book.,1.0,4.0
...,...,...,...,...,...,...,...
5395,54,Bound by Honor,50342,4,very good,0.7,2.8
5396,54,Bound by Honor,51480,1,bad,-0.7,-0.7
5397,54,Bound by Honor,51762,5,interesting,0.5,2.5
5398,54,Bound by Honor,52036,3,can do better,0.5,1.5


# classifying score in range from 1 to 5

In [37]:
def classify_score(score):
    if score >= -5 and score <= -0.75:
        return 1
    if score > -0.75 and score <0:
        return 2
    if score>=0 and score <2:
        return 3
    if score>=2 and score<3.5:
        return 4
    if score>=3.5 and score<=5:
        return 5

dataset['newscore']= dataset['score'].apply(classify_score)
dataset

Unnamed: 0,book_id,title,user_id,rating,review,sentiment,score,newscore
0,1,Harry Potter and the Half-Blood Prince,314,5,Perfect for a potterhead,1.0,5.0,5
1,1,Harry Potter and the Half-Blood Prince,439,3,Watching movie will suck after reading this,0.0,0.0,3
2,1,Harry Potter and the Half-Blood Prince,588,5,Great book,0.8,4.0,5
3,1,Harry Potter and the Half-Blood Prince,1169,4,Breathtaking story! Heartbreaking end!,1.0,4.0,5
4,1,Harry Potter and the Half-Blood Prince,1185,4,Awesome book.,1.0,4.0,5
...,...,...,...,...,...,...,...,...
5395,54,Bound by Honor,50342,4,very good,0.7,2.8,4
5396,54,Bound by Honor,51480,1,bad,-0.7,-0.7,2
5397,54,Bound by Honor,51762,5,interesting,0.5,2.5,4
5398,54,Bound by Honor,52036,3,can do better,0.5,1.5,3


# creating pivot table with book_id,user_id and newscore

In [38]:
dataset_pivot= dataset.pivot_table(index='book_id',columns='user_id',values='newscore').fillna(0)
dataset_pivot

user_id,173,314,368,439,588,725,951,1088,1136,1169,...,52007,52036,52469,52583,52740,52929,53145,53245,53292,53293
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,5.0,0.0,3.0,5.0,0.0,0.0,0.0,0.0,5.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
3,0.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
4,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0
5,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
6,1.0,4.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,1.0,5.0,3.0,1.0,0.0,5.0,0.0,4.0,0.0
7,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
9,0.0,0.0,0.0,4.0,3.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0


# creating csr (compressed sparse row) matrix for model

In [40]:
from scipy.sparse import csr_matrix
dataset_pivot_matrix= csr_matrix(dataset_pivot.values)
print(dataset_pivot_matrix)

  (0, 1)	5.0
  (0, 3)	3.0
  (0, 4)	5.0
  (0, 9)	5.0
  (0, 10)	5.0
  (0, 14)	3.0
  (0, 18)	5.0
  (0, 20)	4.0
  (0, 28)	4.0
  (0, 30)	3.0
  (0, 37)	3.0
  (0, 39)	3.0
  (0, 42)	3.0
  (0, 49)	3.0
  (0, 56)	3.0
  (0, 80)	3.0
  (0, 91)	4.0
  (0, 92)	3.0
  (0, 93)	3.0
  (0, 97)	3.0
  (0, 101)	3.0
  (0, 106)	5.0
  (0, 119)	5.0
  (0, 121)	4.0
  (0, 128)	5.0
  :	:
  (53, 391)	5.0
  (53, 395)	3.0
  (53, 404)	5.0
  (53, 411)	4.0
  (53, 416)	4.0
  (53, 422)	5.0
  (53, 426)	5.0
  (53, 427)	4.0
  (53, 431)	5.0
  (53, 441)	5.0
  (53, 444)	5.0
  (53, 446)	5.0
  (53, 450)	5.0
  (53, 451)	5.0
  (53, 452)	4.0
  (53, 453)	4.0
  (53, 459)	5.0
  (53, 460)	4.0
  (53, 461)	4.0
  (53, 464)	4.0
  (53, 466)	4.0
  (53, 475)	2.0
  (53, 477)	4.0
  (53, 480)	3.0
  (53, 487)	4.0


# fitting data into NearestNeighbors model

In [41]:
from sklearn.neighbors import NearestNeighbors

model= NearestNeighbors(metric= 'cosine')
model.fit(dataset_pivot_matrix)

NearestNeighbors(metric='cosine')

# cosine similarity matrix of books
### cosine similarity means how much similar is one book with respect to other books

In [47]:
similarity_matrix= cosine_similarity(dataset_pivot)
print(similarity_matrix)
similarity_matrix.shape

[[1.         0.53186093 0.5057734  ... 0.21238901 0.27808545 0.24317571]
 [0.53186093 1.         0.50704721 ... 0.20574916 0.3531659  0.33913849]
 [0.5057734  0.50704721 1.         ... 0.36979464 0.32416996 0.36961384]
 ...
 [0.21238901 0.20574916 0.36979464 ... 1.         0.23311113 0.20885084]
 [0.27808545 0.3531659  0.32416996 ... 0.23311113 1.         0.30083012]
 [0.24317571 0.33913849 0.36961384 ... 0.20885084 0.30083012 1.        ]]


(54, 54)

# creating a list having book_id as its values

In [48]:
bookid= list(dataset_pivot.index)
print(bookid)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54]


# taking book_id as user input

In [49]:
book= int(input("Enter book id: "))
num= int(input("Enter no. of recommendations: "))
book_index= bookid.index(book)
print(book_index)

Enter book id: 48
Enter no. of recommendations: 5
47


# predicting books using model

In [58]:
distance,indices= model.kneighbors(dataset_pivot.iloc[book_index,:].values.reshape(1,-1), n_neighbors= num+1)
dist= distance.reshape(-1)
ind= indices.reshape(-1)
print(dist)
print(ind)

[1.44328993e-15 3.54176503e-01 3.93086142e-01 4.46046155e-01
 4.56774800e-01 4.65251473e-01]
[47 13 12 15  1 18]


# creating dictionary to store recommendations of book

In [61]:
recommend_data= dict()
lst=[]
for i in range(1,len(dist)):
    b_id= dataset_pivot.index[ind[i]]
    recommend_data[str(b_id)]= float(dist[i])
    print("Distance of book with id {} with respect to book with id {} = {}".format(b_id,book,recommend_data[str(b_id)]))
    lst.append(dataset.loc[dataset['book_id']==b_id]['title'].values[0])
    
print("Dictionary with similarity distance: \n",recommend_data)
print(recommend_data)

Distance of book with id 14 with respect to book with id 48 = 0.35417650253418054
Distance of book with id 13 with respect to book with id 48 = 0.3930861417212055
Distance of book with id 16 with respect to book with id 48 = 0.44604615528995273
Distance of book with id 2 with respect to book with id 48 = 0.45677480017335037
Distance of book with id 19 with respect to book with id 48 = 0.4652514728496151
Dictionary with similarity distance: 
 {'14': 0.35417650253418054, '13': 0.3930861417212055, '16': 0.44604615528995273, '2': 0.45677480017335037, '19': 0.4652514728496151}
{'14': 0.35417650253418054, '13': 0.3930861417212055, '16': 0.44604615528995273, '2': 0.45677480017335037, '19': 0.4652514728496151}


# Recommending books

In [62]:
print("Top {} recommendations for readers of book: {} ".format(num,dataset.loc[dataset['book_id']==book]['title'].values[0]))
print("Books recommended are: ")
print("\n".join(lst))

Top 5 recommendations for readers of book: God Help the Child 
Books recommended are: 
The Salmon of Doubt: Hitchhiking the Galaxy One Last Time
Job: A Comedy of Justice
City of Glass
Harry Potter and the Order of the Phoenix
The Virtue of Selfishness: A New Concept of Egoism
