In [33]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [34]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine,correlation
from sklearn.feature_extraction.text import CountVectorizer


In [35]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
#import Dataset
book = pd.read_csv('book.csv',encoding='Latin1')
book

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
0,1,276726,Classical Mythology,5
1,2,276729,Clara Callan,3
2,3,276729,Decision in Normandy,6
3,4,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,5,276737,The Mummies of Urumchi,6
...,...,...,...,...
9995,9996,162121,American Fried: Adventures of a Happy Eater.,7
9996,9997,162121,Cannibal In Manhattan,9
9997,9998,162121,How to Flirt: A Practical Guide,7
9998,9999,162121,Twilight,8


In [4]:
#shape of dataframe
book.shape

(10000, 4)

In [5]:
# unique user_id column
book['User.ID'].unique()

array([276726, 276729, 276736, ..., 162113, 162121, 162129], dtype=int64)

In [6]:
# number of unique users in the dataset
len(book['User.ID'].unique())

2182

In [7]:
# unique user_id column
book['Book.Title'].unique()

array(['Classical Mythology', 'Clara Callan', 'Decision in Normandy', ...,
       'How to Flirt: A Practical Guide', 'Twilight',
       'Kids Say the Darndest Things'], dtype=object)

In [8]:
# number of unique books in the dataset
len(book['Book.Title'].unique())

9659

In [9]:
# Book Rating value_count
book['Book.Rating'].value_counts()

8     2283
7     2076
10    1732
9     1493
5     1007
6      920
4      237
3      146
2       63
1       43
Name: Book.Rating, dtype: int64

In [10]:
book.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   10000 non-null  int64 
 1   User.ID      10000 non-null  int64 
 2   Book.Title   10000 non-null  object
 3   Book.Rating  10000 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 312.6+ KB


In [11]:
book.describe()

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Rating
count,10000.0,10000.0,10000.0
mean,5000.5,95321.2498,7.5663
std,2886.89568,117645.703609,1.82152
min,1.0,8.0,1.0
25%,2500.75,2103.0,7.0
50%,5000.5,3757.0,8.0
75%,7500.25,162052.0,9.0
max,10000.0,278854.0,10.0


In [12]:
book.isnull().any()

Unnamed: 0     False
User.ID        False
Book.Title     False
Book.Rating    False
dtype: bool

In [13]:
book.isnull().sum()

Unnamed: 0     0
User.ID        0
Book.Title     0
Book.Rating    0
dtype: int64

In [14]:
book.duplicated().sum()

0

In [15]:
book[book.duplicated()].shape

(0, 4)

In [16]:
#getting column name
book.columns

Index(['Unnamed: 0', 'User.ID', 'Book.Title', 'Book.Rating'], dtype='object')

In [17]:
#renaming Columns
book.rename(columns = {'Unnamed: 0':'sr','User.ID':'userid','Book.Title':'title','Book.Rating':'rating'}, inplace = True)

In [18]:
book.head()

Unnamed: 0,sr,userid,title,rating
0,1,276726,Classical Mythology,5
1,2,276729,Clara Callan,3
2,3,276729,Decision in Normandy,6
3,4,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,5,276737,The Mummies of Urumchi,6


In [19]:
#after colums
book.columns

Index(['sr', 'userid', 'title', 'rating'], dtype='object')

In [20]:
#title columns
book.title

0                                     Classical Mythology
1                                            Clara Callan
2                                    Decision in Normandy
3       Flu: The Story of the Great Influenza Pandemic...
4                                  The Mummies of Urumchi
                              ...                        
9995         American Fried: Adventures of a Happy Eater.
9996                                Cannibal In Manhattan
9997                      How to Flirt: A Practical Guide
9998                                             Twilight
9999                         Kids Say the Darndest Things
Name: title, Length: 10000, dtype: object

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer#term frequencey- inverse document frequncy is a numerical statistic 
#that is intended to reflect how important a word is to document in a 
#collecion or corpus

In [22]:
# Creating a Tfidf Vectorizer to remove all stop words
tfidf = TfidfVectorizer(stop_words="english")#taking stop words from tfid vectorizer 

In [23]:
# replacing the NaN values in overview column with
# empty string
book["title"].isnull().sum() 
book["title"] = book["title"].fillna(" ")

In [24]:
# Preparing the Tfidf matrix by fitting and transforming
tfidf_matrix = tfidf.fit_transform(book.title)#Transform a count matrix to a normalized tf or tf-idf representation

In [25]:
tfidf_matrix.shape #10000,11435

(10000, 11435)

In [26]:
# with the above matrix we need to find the 
# similarity score
# There are several metrics for this
# such as the euclidean, the Pearson and 
# the cosine similarity scores

# For now we will be using cosine similarity matrix
# A numeric quantity to represent the similarity 
# between 2 movies 
# Cosine similarity - metric is independent of 
# magnitude and easy to calculate 

# cosine(x,y)= (x.y⊺)/(||x||.||y||)

In [27]:
from sklearn.metrics.pairwise import linear_kernel

In [28]:
# Computing the cosine similarity on Tfidf matrix
cosine_sim_matrix = linear_kernel(tfidf_matrix,tfidf_matrix)

In [29]:
# creating a mapping of anime name to index number 
book_index = pd.Series(book.index,index=book['title']).drop_duplicates()


In [30]:
book_index["Clara Callan"]

1

In [31]:
def get_book_recommendations(title,topN):
    
   
    #topN = 10
    # Getting the movie index using its title 
    book_id = book_index[title]
    
    # Getting the pair wise similarity score for all the anime's with that 
    # anime
    cosine_scores = list(enumerate(cosine_sim_matrix[book_id]))
    
    # Sorting the cosine_similarity scores based on scores 
    cosine_scores = sorted(cosine_scores,key=lambda x:x[1],reverse = True)
    
    # Get the scores of top 10 most similar anime's 
    cosine_scores_10 = cosine_scores [ 0:topN+1 ]
    
    # Getting the anime index 
    book_idx  =  [i[0] for i in cosine_scores_10]
    book_scores =  [i[1] for i in cosine_scores_10]
    
    # Similar movies and scores
    book_similar_show = pd.DataFrame(columns=["title","rating"])
    book_similar_show["title"] = book.loc[book_idx,"title"]
    book_similar_show["rating"] = book_scores
    book_similar_show.reset_index(inplace=True)  
    book_similar_show.drop(["index"],axis=1,inplace=True)
    print (book_similar_show)
    #return (anime_similar_show)


In [32]:
# Enter your anime and number of anime's to be recommended 
get_book_recommendations("Love Story",topN=15)

                                              title    rating
0                                        Love Story  1.000000
1                                          MY STORY  0.717371
2                                I Love You So Much  0.696691
3                                    For Love Alone  0.696691
4                                              Love  0.696691
5                             You Are My I Love You  0.696691
6                           D/S: An Anti-Love Story  0.645391
7                            Drinking: A Love Story  0.635717
8                           My Sergei: A Love Story  0.623526
9                          My Sergei : A Love Story  0.623526
10                            Love to Love You Baby  0.598832
11                 The Bachelor's Cat: A Love Story  0.586780
12                      The Frog King: A Love Story  0.565694
13  The LAST STORY (REMEMBER ME 3) : THE LAST STORY  0.540167
14                         Only Love (Magical Love)  0.539739
15      