In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import re
from PIL import Image
import requests
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")

In [2]:
book_df = pd.read_csv(r"Books.csv")
rating_df = pd.read_csv(r"Ratings.csv")
users_df = pd.read_csv(r"Users.csv")

In [3]:
book_df.head(3)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


In [4]:
book_df.shape

(271360, 8)

In [5]:
book_df.isna().sum()

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [6]:
book_df["ISBN"].nunique()

271360

In [7]:
book_df = book_df.drop(["Image-URL-S", "Image-URL-M"], axis=1)

In [8]:
book_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...


In [9]:
rating_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [10]:
rating_df.shape

(1149780, 3)

In [11]:
rating_df.isna().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [12]:
rating_df["ISBN"].nunique()

340556

In [13]:
print("Max Rating: ", rating_df["Book-Rating"].max(), "\nMin Rating: ", rating_df["Book-Rating"].min())

Max Rating:  10 
Min Rating:  0


In [14]:
rating_df = rating_df[rating_df["Book-Rating"]>4]
rating_df.shape

(414242, 3)

In [15]:
df = book_df.merge(rating_df, on="ISBN")

In [16]:
df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L,User-ID,Book-Rating
0,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,8,5
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,11676,8
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,67544,8
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,116866,9
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,123629,9


In [17]:
df.dropna(inplace=True)
df.reset_index(drop=True,inplace=True)
df.drop(columns=["ISBN","Year-Of-Publication"],axis=1,inplace=True)
df["Book-Title"]=df["Book-Title"].apply(lambda x: re.sub("[\W_]+"," ",x).strip())

df.head()

Unnamed: 0,Book-Title,Book-Author,Publisher,Image-URL-L,User-ID,Book-Rating
0,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,8,5
1,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,11676,8
2,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,67544,8
3,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,116866,9
4,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,123629,9


In [18]:
def popular_books(df, n=100):
    rating_count = df.groupby("Book-Title").count()["Book-Rating"].reset_index()
    rating_count.rename(columns={"Book-Rating":"NumberOfVotes"}, inplace=True)
    
    rating_average = df.groupby("Book-Title")["Book-Rating"].mean().reset_index()
    rating_average.rename(columns={"Book-Rating":"AverageRatings"},inplace=True)
    
    popularBooks = rating_count.merge(rating_average, on="Book-Title")
    
    def weighted_rate(x):
        v = x["NumberOfVotes"]
        R = x["AverageRatings"]   
        return ((v*R) + (m*C)) / (v+m)
    
    C = popularBooks["AverageRatings"].mean()
    m = popularBooks["NumberOfVotes"].quantile(0.90) 
    
    popularBooks = popularBooks[popularBooks["NumberOfVotes"] >=250]
    popularBooks["Popularity"] = popularBooks.apply(weighted_rate, axis=1)
    popularBooks=popularBooks.sort_values(by="Popularity", ascending=False)
    
    return popularBooks[["Book-Title","NumberOfVotes","AverageRatings","Popularity"]].reset_index(drop=True).head(n)

In [19]:
popular_books(df, 10)

Unnamed: 0,Book-Title,NumberOfVotes,AverageRatings,Popularity
0,Harry Potter and the Prisoner of Azkaban Book 3,277,9.043321,9.020034
1,To Kill a Mockingbird,265,9.015094,8.991295
2,Harry Potter and the Sorcerer s Stone Harry Po...,312,9.009615,8.989431
3,Harry Potter and the Chamber of Secrets Book 2,325,8.855385,8.838332
4,The Da Vinci Code,482,8.578838,8.570122
5,The Secret Life of Bees,398,8.575377,8.564887
6,The Red Tent Bestselling Backlist,369,8.363144,8.354678
7,The Notebook,268,8.328358,8.317398
8,The Lovely Bones A Novel,692,8.299133,8.29505
9,Life of Pi,322,8.295031,8.28639


In [20]:
def item_based(bookTitle):
    bookTitle = str(bookTitle)
    
    if bookTitle in df["Book-Title"].values:
        rating_count = pd.DataFrame(df["Book-Title"].value_counts())
        rare_books = rating_count[rating_count["Book-Title"]<=200].index
        common_books = df[~df["Book-Title"].isin(rare_books)]
        
        if bookTitle in rare_books:
            print("No Recommendations for this Book \nYOU MAY TRY: \n ")
            print(common_books[["Book-Title"]].sample(3))
        
        else:
            common_books_pivot = common_books.pivot_table(index=["User-ID"], columns=["Book-Title"], values="Book-Rating")
            title = common_books_pivot[bookTitle]
            recommendation_df = pd.DataFrame(common_books_pivot.corrwith(title).sort_values(ascending=False)).reset_index(drop=False)
            
            if bookTitle in [title for title in recommendation_df["Book-Title"]]:
                recommendation_df = recommendation_df.drop(recommendation_df[recommendation_df["Book-Title"] == bookTitle].index[0])
                
            less_rating = []
            for i in recommendation_df["Book-Title"]:
                if df[df["Book-Title"]==i]["Book-Rating"].mean() < 5:
                    less_rating.append(i)
            if recommendation_df.shape[0] - len(less_rating) > 5:
                recommendation_df = recommendation_df[~recommendation_df["Book-Title"].isin(less_rating)]
            recommendation_df.columns=["Book-Title","Correlation"]   
            print(recommendation_df[:6])
            
    else:
        print(" COULD NOT FIND ")

In [21]:
item_based("Clara Callan")

No Recommendations for this Book 
YOU MAY TRY: 
 
                              Book-Title
39012                     Fahrenheit 451
12091  The Red Tent Bestselling Backlist
7392                     A Painted House


In [22]:
item_based("From One to One Hundred")

 COULD NOT FIND 


In [23]:
item_based("To Kill a Mockingbird")

                                          Book-Title  Correlation
1                                           The Firm     0.798596
2                                 1st to Die A Novel     0.769060
3                          The Nanny Diaries A Novel     0.754599
4                She s Come Undone Oprah s Book Club     0.611124
5  The Fellowship of the Ring The Lord of the Rin...     0.582661
6                                      Jurassic Park     0.570756


In [24]:
def content_based(bookTitle):
    bookTitle = str(bookTitle)
    
    if bookTitle in df["Book-Title"].values:
        rating_count = pd.DataFrame(df["Book-Title"].value_counts())
        rare_books = rating_count[rating_count["Book-Title"]<=200].index
        common_books = df[~df["Book-Title"].isin(rare_books)]
        
        if bookTitle in rare_books:
            print("No Recommendations for this Book \nYOU MAY TRY: \n ")
            print(common_books[["Book-Title"]].sample(3))
            
        else:
            common_books = common_books.drop_duplicates(subset=["Book-Title"])
            common_books.reset_index(inplace=True)
            
            common_books["index"] = [i for i in range(common_books.shape[0])]
            targets = ["Book-Title","Book-Author","Publisher"]
            common_books["all_features"] = [" ".join(common_books[targets].iloc[i,].values) for i in range(common_books[targets].shape[0])]
            
            vectorizer = CountVectorizer()
            common_booksVector = vectorizer.fit_transform(common_books["all_features"])
            
            similarity = cosine_similarity(common_booksVector)
            
            index = common_books[common_books["Book-Title"]==bookTitle]["index"].values[0]
            similar_books = list(enumerate(similarity[index]))
            similar_booksSorted = sorted(similar_books,key=lambda x:x[1],reverse=True)[1:6]
            
            books=[]
            for i in range(len(similar_booksSorted)):
                books.append(common_books[common_books["index"]==similar_booksSorted[i][0]]["Book-Title"].item())
            
            print("You May Like: ", books)
    else:
        print(" COULD NOT FIND ") 

In [25]:
content_based("The Da Vinci Code")

You May Like:  ['The Catcher in the Rye', 'The Brethren', 'The Firm', 'The Partner', 'Angels amp Demons']


In [26]:
content_based("Tuesdays with Morrie An Old Man a Young Man and Life s Greatest Lesson")

You May Like:  ['The Five People You Meet in Heaven', 'The Brethren', 'Life of Pi', 'The Firm', 'The Partner']


In [27]:
content_based("A Soldier of the Great War")

No Recommendations for this Book 
YOU MAY TRY: 
 
                                             Book-Title
12859                                        Life of Pi
144190                          The Secret Life of Bees
56760   Harry Potter and the Prisoner of Azkaban Book 3
