In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import re
import string
import random

import warnings
warnings.filterwarnings("ignore")

from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from PIL import Image
from io import BytesIO
from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.max_rows', 500)

# Content-based recommendation system


**Content-based recommendation systems** recommend items to a user based on the similarity between items, measured from their descriptions or features. They can also take the user's previous history into account in order to recommend similar products.

![](https://www.researchgate.net/profile/Barthelemy_Durette2/publication/275954089/figure/tbl2/AS:614304592166937@1523473036482/Formulae-of-the-similarity-and-distance-measures.png)
 <center> 
    <a href="https://www.researchgate.net/figure/Formulae-of-the-similarity-and-distance-measures_tbl2_275954089">Source</a> 
</center> 

For more information on this topic, click [here](https://medium.com/@gshriya195/top-5-distance-similarity-measures-implementation-in-machine-learning-1f68b9ecb0a3).

# Load data

In [None]:
# Load the books metadata CSV into a DataFrame and preview its first three rows
BOOKS_CSV_PATH = '../input/7k-books-with-metadata/books.csv'
raw_books = pd.read_csv(BOOKS_CSV_PATH)
raw_books.head(n=3)

**According to the dataset author**: The dataset provides close to seven thousand books containing identifiers, title, subtitle, authors, categories, thumbnail url, description, published year, average rating, and number of ratings. The dataset is provided as comma-delimited CSV. 

- **isbn13**: ISBN 13 Identifier
- **isbn10**: ISBN 10 Identifier
- **title**: Title of book
- **subtitle**: Subtitle of book
- **authors**: Authors of book separated by ";"
- **categories**: Categories separated by ";"
- **thumbnail**: URL of thumbnail
- **description**: Description of book
- **published_year**: Year of publication
- **average_rating**: Average rating in Goodreads

# EDA

### Shape

In [None]:
# Report the dimensions of the raw dataset
row_count, column_count = raw_books.shape
print('ROWS: ', row_count)
print('COLUMNS: ', column_count)

This dataset contains a considerable number of books.

### Categories (genres)

In [None]:
# Count the distinct values in the categories column
raw_books.categories.nunique()

There are 567 distinct categories (genres), which suggests that many of the genre labels in this dataset are inconsistent or are not real genres.

In [None]:
# List every distinct value found in the categories column
raw_books.categories.unique()

As stated earlier, there are names in the categories (genres) column that are not real genres.

### Top 10 categories (genres)

In [None]:
# Ten most frequent category values (value_counts sorts most common first)
category_counts = raw_books['categories'].value_counts()
category_counts.head(10)

The genre "Fiction" has almost 5x more examples than the second-place genre, "Juvenile Fiction".

# Data Wrangling

### Select genres

Select only the books that belong to the four most frequent genres in the dataset (**Fiction, Juvenile Fiction, Biography & Autobiography, History**).

In [None]:
# Keep only the books whose category is one of the four selected genres.
# `.copy()` makes `books` an independent DataFrame: it is modified later in the
# notebook (rows dropped, a column added), and doing that on a slice of
# `raw_books` triggers SettingWithCopyWarning and ambiguous view/copy behavior.
SELECTED_CATEGORIES = ['Fiction', 'Juvenile Fiction',
                       'Biography & Autobiography', 'History']
books = raw_books.loc[raw_books['categories'].isin(SELECTED_CATEGORIES)].copy()

In [None]:
# Preview the last five rows of the filtered frame (tail defaults to n=5)
books.tail()

In [None]:
# Bar chart of how many books fall into each selected category.
# The style is set before the axes are drawn so that it applies to them.
plt.figure(figsize=(10, 6))
sns.set_style('darkgrid')
sns.countplot(data=books, x='categories', palette='Blues_r', edgecolor='black')
plt.show()

The dataset is imbalanced; however, this is not a problem for now, since we will only compute similarity between books.

### View title and description of two example books

In [None]:
# Print the title and description of the books at index labels 120 and 200,
# separated by a block of blank lines
for position, label in enumerate((120, 200)):
    if position > 0:
        print('\n\n')
    print(books.loc[label, 'title'])
    print(books.loc[label, 'description'])

### Drop rows with **null values** in description

In [None]:

# Drop rows without a description. Reassign instead of using inplace=True:
# `books` comes from filtering `raw_books`, and an in-place drop on such a slice
# raises SettingWithCopyWarning and makes re-running the cell harder to reason about.
books = books.dropna(subset=['description'])

In [None]:
# Sanity check: number of missing descriptions remaining
books.description.isnull().sum()

### Drop other rows

Exclude books whose description contains the number of print runs. For example: **"15,000 printing"**.

In [None]:
# USING REGULAR EXPRESSIONS (REGEX)
# Drop descriptions that mention a print run such as "15,000 printing":
# two digits followed later on the same line by the literal word "printing".
# The word must NOT be wrapped in square brackets: `[printing]` is a character
# class that matches any single one of the letters p/r/i/n/t/g.
# na=False treats a missing description as "no match" instead of producing NaN
# in the mask (which `~` cannot invert).
printing_mask = books.description.str.contains(r'[0-9].*[0-9].*printing', regex=True, na=False)
books = books[~printing_mask]

# Top 20 bigrams

In [None]:
# CONVERT DESCRIPTIONS INTO TF-IDF VECTORS OVER WORD BIGRAMS
tf = TfidfVectorizer(ngram_range=(2, 2), stop_words='english', lowercase=False)
tfidf_matrix = tf.fit_transform(books['description'])
# COLUMN SUMS: ONE AGGREGATE TF-IDF WEIGHT PER BIGRAM
total_words = tfidf_matrix.sum(axis=0)

# PAIR EVERY BIGRAM WITH ITS SUMMED WEIGHT, HEAVIEST FIRST
freq = sorted(
    ((word, total_words[0, index]) for word, index in tf.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

# CREATE A PANDAS DATAFRAME AND KEEP THE TOP 20 BIGRAMS
# NOTE: THE 'count' COLUMN HOLDS SUMMED TF-IDF WEIGHTS, NOT RAW OCCURRENCE COUNTS
bigram = pd.DataFrame(freq).rename(columns={0: 'bigram', 1: 'count'}).head(20)

# PLOT HORIZONTAL BARPLOT
plt.figure(figsize=(8, 8))
sns.barplot(data=bigram, x='count', y='bigram', color='blue')
plt.show()

# Preprocessing

In [None]:
# REMOVE NON ASCII CHARACTERS
def remove_non_ascii(string):
    """Return `string` with every character whose code point is 128 or above dropped."""
    ascii_chars = [char for char in string if ord(char) <= 127]
    return "".join(ascii_chars)

# MAKE DESCRIPTION TEXT LOWER CASE
def make_lower_case(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

# REMOVE STOP WORDS
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading the list only once."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        # Loading the corpus list is the expensive part; keep the result on the
        # function object so applying remove_stop_words row by row does not redo it.
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def remove_stop_words(text, stops=None):
    """Remove stop words from a whitespace-separated text.

    Parameters
    ----------
    text : str
        Input text; it is split on whitespace and compared word by word, so
        matching is case-sensitive and punctuation attached to a word is kept.
    stops : collection of str, optional
        Words to remove. Defaults to NLTK's English stop-word list.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if stops is None:
        stops = _english_stop_words()
    return " ".join(word for word in text.split() if word not in stops)

# REMOVE PUNCTUATIONS
# Compiled once instead of building a new tokenizer object on every call.
# Finding all `\w+` runs yields the same tokens as RegexpTokenizer(r'\w+').
_WORD_PATTERN = re.compile(r'\w+')


def remove_punctuation(text):
    """Return the word tokens of `text` joined by single spaces.

    A token is a maximal run of word characters (`\\w+`: letters, digits and
    underscore); everything between tokens, including punctuation and
    whitespace, is replaced by one space.
    """
    return " ".join(_WORD_PATTERN.findall(text))

# REMOVE HTML CODES
def remove_html(text):
    """Delete every `<...>` span from `text` (shortest match, never across a newline)."""
    return re.sub('<.*?>', '', text)

### Create new column called "cleaned_description" and apply all functions

In [None]:
# Build the cleaned text as one chained pipeline.
# HTML tags are stripped BEFORE punctuation: remove_punctuation discards the
# '<' and '>' characters, so running remove_html after it can never match and
# the tag names (e.g. "br", "p") would be left in the text as ordinary words.
books['cleaned_description'] = (
    books['description']
    .apply(remove_non_ascii)
    .apply(remove_html)
    .apply(make_lower_case)
    .apply(remove_stop_words)
    .apply(remove_punctuation)
)

# Make recommendations by title

In [None]:
def recommend(title, category, top_n=5):
    """Print the books in `category` whose titles are most similar to `title` and plot their covers.

    Titles are vectorized as TF-IDF weighted word bigrams (English stop words
    removed) and compared with cosine similarity.

    NOTE: a later cell defines another `recommend` with the same signature;
    once that cell has run, it replaces this function.

    Parameters
    ----------
    title : str
        Exact title of the query book; it must exist within `category`.
    category : str
        Value of the `categories` column used to restrict the candidates.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    KeyError
        If no book with this title exists in the category.
    """
    # MATCH THE CATEGORY WITH THE COLUMN "CATEGORIES" OF THE DATASET
    # AND RENUMBER THE ROWS 0..n-1 SO ROW POSITIONS MATCH THE TF-IDF MATRIX ROWS
    data = books.loc[books['categories'] == category].reset_index(drop=True)

    # GET THE ROW POSITION OF THE QUERY TITLE
    # (a title can occur more than once; use the first occurrence so `index`
    # is always a single integer)
    matches = np.flatnonzero((data['title'] == title).to_numpy())
    if matches.size == 0:
        raise KeyError(f"No book titled {title!r} in category {category!r}")
    index = int(matches[0])

    # CONVERT THE BOOK TITLE INTO VECTORS AND USE BIGRAM
    tf = TfidfVectorizer(analyzer='word', ngram_range=(2, 2), min_df=1, stop_words='english')
    tfidf_matrix = tf.fit_transform(data['title'])

    # CALCULATE THE SIMILARITY MEASURE
    # Only the query row is needed, so compare that row against all books
    # instead of building the full n x n similarity matrix.
    scores = cosine_similarity(tfidf_matrix[index], tfidf_matrix).ravel()

    # SORT THE OTHER BOOKS BY SIMILARITY
    # The query is excluded by position rather than by assuming it ranks first:
    # with ties or an all-zero vector it is not guaranteed to be the top entry.
    ranked = sorted(
        ((position, score) for position, score in enumerate(scores) if position != index),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # INDICES OF THE TOP RECOMMENDATIONS
    book_indices = [position for position, _ in ranked[:top_n]]

    # TOP RECOMMENDATIONS
    rec = data[['title', 'thumbnail']].iloc[book_indices]

    # PRINT THE BOOKS TITLE
    print(rec['title'])

    # SHOW THE BOOK COVERS (books without a thumbnail URL are skipped)
    for url in rec['thumbnail'].dropna():
        try:
            # A timeout keeps one unresponsive host from hanging the notebook
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
        except (requests.RequestException, OSError) as error:
            # One broken cover should not discard the remaining recommendations
            print(f"Could not load cover {url}: {error}")
            continue
        plt.figure()
        plt.imshow(img)

In [None]:
# TEST: recommendations for a History book
recommend(title="A People's History of the United States", category="History")

# Make recommendations by description

The function below does the same thing as the previous one, but this time the **book description** is used to make the recommendations.

In [None]:
def recommend(title, category, top_n=5):
    """Print the books in `category` whose descriptions are most similar to that of `title` and plot their covers.

    The `cleaned_description` column is vectorized as TF-IDF weighted word
    bigrams (English stop words removed) and compared with cosine similarity.

    NOTE: this definition reuses the name of the title-based `recommend`
    defined in an earlier cell and replaces it once this cell has run.

    Parameters
    ----------
    title : str
        Exact title of the query book; it must exist within `category`.
    category : str
        Value of the `categories` column used to restrict the candidates.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    KeyError
        If no book with this title exists in the category.
    """
    # MATCH THE CATEGORY WITH THE COLUMN "CATEGORIES" OF THE DATASET
    # AND RENUMBER THE ROWS 0..n-1 SO ROW POSITIONS MATCH THE TF-IDF MATRIX ROWS
    data = books.loc[books['categories'] == category].reset_index(drop=True)

    # GET THE ROW POSITION OF THE QUERY TITLE
    # (a title can occur more than once; use the first occurrence so `index`
    # is always a single integer)
    matches = np.flatnonzero((data['title'] == title).to_numpy())
    if matches.size == 0:
        raise KeyError(f"No book titled {title!r} in category {category!r}")
    index = int(matches[0])

    # CONVERT THE CLEANED DESCRIPTION INTO VECTORS AND USE BIGRAM
    tf = TfidfVectorizer(analyzer='word', ngram_range=(2, 2), min_df=1, stop_words='english')
    tfidf_matrix = tf.fit_transform(data['cleaned_description'])

    # CALCULATE THE SIMILARITY MEASURE
    # Only the query row is needed, so compare that row against all books
    # instead of building the full n x n similarity matrix.
    scores = cosine_similarity(tfidf_matrix[index], tfidf_matrix).ravel()

    # SORT THE OTHER BOOKS BY SIMILARITY
    # The query is excluded by position rather than by assuming it ranks first:
    # with ties or an all-zero vector it is not guaranteed to be the top entry.
    ranked = sorted(
        ((position, score) for position, score in enumerate(scores) if position != index),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # INDICES OF THE TOP RECOMMENDATIONS
    book_indices = [position for position, _ in ranked[:top_n]]

    # TOP RECOMMENDATIONS
    rec = data[['title', 'thumbnail']].iloc[book_indices]

    # PRINT THE BOOKS TITLE
    print(rec['title'])

    # SHOW THE BOOK COVERS (books without a thumbnail URL are skipped)
    for url in rec['thumbnail'].dropna():
        try:
            # A timeout keeps one unresponsive host from hanging the notebook
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
        except (requests.RequestException, OSError) as error:
            # One broken cover should not discard the remaining recommendations
            print(f"Could not load cover {url}: {error}")
            continue
        plt.figure()
        plt.imshow(img)

In [None]:
# TEST: recommendations for a Fiction book
recommend(title="Taken at the Flood", category="Fiction")

# Reference

This notebook is inspired by the article [Building a Content-Based Book Recommendation Engine](https://towardsdatascience.com/building-a-content-based-book-recommendation-engine-9fd4d57a4da), published on Medium.