In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Goodreads dataset CSV into the working DataFrame.
df = pd.read_csv(
    '../input/goodreads-best-books-ever-with-recommendations/'
    'Goodreads_BestBooksEver_1-10000.csv'
)

In [None]:
# Preview the first five rows to check that the file loaded as expected.
df.head(5)

# Data Cleaning

In [None]:
# Delete all duplicates based on book title, keeping the first instance only.
# ignore_index=True renumbers the surviving rows 0..n-1.
df.drop_duplicates(
    subset=['bookTitle'],
    keep='first',
    ignore_index=True,
    inplace=True,
)

In [None]:
# Drop every row with a NaN in any of the columns that we use later.
required_columns = ['bookTitle', 'bookAuthors', 'bookGenres', 'bookDesc', 'recommendations']
df.dropna(subset=required_columns, inplace=True)

In [None]:
# Renumber the rows 0..n-1 after the row drops above. drop=True discards the old
# index instead of inserting it as a stray 'index' column, which also keeps this
# cell safe to re-run (without it a re-run adds 'level_0' and then raises).
df.reset_index(drop=True, inplace=True)

Next we will clean the **bookGenres** column so that it shows only the most-voted genre. Many books carry genres that were voted for by only a few people, so those genres can be deemed insignificant. Moreover, many books have only one genre attached, so to keep things simple we will just take the first (and/or most-voted) genre that appears in the list.

In [None]:
def split_genre(genres):
    """Return the leading genre of a genre string.

    The text before the first "|" is taken as the first entry, and within that
    entry only the part before the first "/" is kept. A string containing
    neither separator is returned unchanged.
    """
    first_entry, _, _ = genres.partition("|")
    primary, _, _ = first_entry.partition("/")
    return primary

# Reduce each book's raw genre string to its single leading genre.
df['cleaned_bookGenres'] = df['bookGenres'].map(split_genre)

Similarly we will clean the **recommendations** column, but we will just remove the URL and keep the book titles for later reference 

In [None]:
import ast
def split_recommendations(recommendations):
    """Parse a stringified list of recommendations and strip each entry to its title.

    ``recommendations`` is the text of a Python list literal whose items are
    strings; for every item only the text before the first "|" is kept.
    Returns the resulting list of strings in the original order.
    """
    entries = ast.literal_eval(recommendations)
    return [entry.split("|")[0] for entry in entries]

# Turn each stringified recommendation list into a plain list of book titles.
df['cleaned_recommendations'] = df['recommendations'].map(split_recommendations)

The Goodreads recommendations consist of 18 books for each title. Next we will create recommendation engines that likewise recommend the 18 most similar books for each title on the list.

# Recommendation based on Author name

First let's try a (real) simple recommendation engine based on Author Name(s). Using this engine, we would expect to get recommended books from the same author when we look at a particular book.

In [None]:
# TF-IDF over the author strings, using word unigrams and bigrams.
# min_df=1 (the library default) keeps every term that occurs at all, which is
# what the integer 0 meant; scikit-learn releases that validate parameters
# reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['bookAuthors'])
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel is the cosine similarity between books.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# `titles` maps row position -> title; `indices` is the reverse lookup (title -> row label).
titles = df['bookTitle']
indices = pd.Series(df.index, index=titles)

# Function that gets book recommendations based on the cosine similarity score of book authors
def authors_recommendations(title):
    """Return the titles of the 19 books most similar to ``title``.

    Reads the module-level ``indices`` (title -> row position), ``cosine_sim``
    (pairwise similarity matrix) and ``titles`` (Series of book titles) as they
    are bound at call time.

    Parameters
    ----------
    title : str
        A title present in ``indices``; an unknown title raises ``KeyError``.

    Returns
    -------
    pandas.Series
        Up to 19 titles ordered by descending similarity. The queried book
        itself is never included.
    """
    idx = indices[title]
    # Exclude the queried book by position instead of assuming it sorts first:
    # when other books tie with it on the top score (e.g. same author), the
    # stable sort may place one of them ahead, and slicing off element 0 would
    # then discard a real neighbour and keep the book itself.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    book_indices = [i for i, _ in sim_scores[:19]]
    return titles.iloc[book_indices]

As an example, let's look at one of the books on the list, "Pride and Prejudice".

In [None]:
# First 18 most similar books based on our engine.
authors_recommendations('Pride and Prejudice').head(18).tolist()

In [None]:
# The 18 books recommended by Goodreads for the same title.
df.loc[df['bookTitle'] == 'Pride and Prejudice', 'cleaned_recommendations'].iloc[0]

Comparing the two lists, this time for "The Hunger Games":

In [None]:
# Titles that appear both in our author-based top 18 and in the Goodreads list.
engine_titles = set(authors_recommendations('The Hunger Games').head(18))
goodreads_titles = df.loc[df['bookTitle'] == 'The Hunger Games', 'cleaned_recommendations'].iloc[0]
engine_titles.intersection(goodreads_titles)

There's no shared book between the two lists. Quite a bummer! Let's do this for all books and look at the stats. We will count how many books are the same between the two recommenders for each title. Note that we will normalize the result so that a score of 1 would mean 18/18 books in common for example. 

In [None]:
# For every book: the fraction (out of 18) of our author-based top-18 titles that
# also appear in the Goodreads list. Titles and their Goodreads lists are walked
# in lockstep, so no per-book boolean scan of the whole frame is needed; titles
# are unique after the de-duplication step, so each row is its own match.
authors_comparison = [
    len(set(authors_recommendations(title).head(18)).intersection(goodreads_titles)) / 18
    for title, goodreads_titles in zip(df['bookTitle'], df['cleaned_recommendations'])
]

In [None]:
from scipy import stats
# Summary statistics (count, min/max, mean, variance, skewness, kurtosis)
# of the per-book overlap scores of the author-based engine.
stats.describe(authors_comparison)

A quick look reveals that the best result is 15 out of 18 books, but most of the time there's little to no similarity between our engine and the Goodreads' recommendation.

In [None]:
# Histogram of the per-book overlap scores for the author-based engine.
sns.set_context('notebook')
# Note: displot returns a FacetGrid (not a matplotlib Axes), hence the `.fig` access.
ax = sns.displot(authors_comparison, color='seagreen')
ax.fig.set_size_inches(15, 10)  # width, height in inches
plt.title('Author based Recommender Performance', size=20)

# Recommendation based on Genre

In [None]:
# TF-IDF over the cleaned (single) genre of each book. This rebinds the
# module-level tf / tfidf_matrix / cosine_sim names.
# min_df=1 (the library default) keeps every term that occurs at all, which is
# what the integer 0 meant; scikit-learn releases that validate parameters
# reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['cleaned_bookGenres'])
# Rows are L2-normalised by default, so the dot product is the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# `titles` maps row position -> title; `indices` is the reverse lookup (title -> row label).
titles = df['bookTitle']
indices = pd.Series(df.index, index=titles)

# Function that gets book recommendations based on the cosine similarity score of book genres
def genre_recommendations(title):
    """Return the titles of the 19 books most similar to ``title``.

    Reads the module-level ``indices`` (title -> row position), ``cosine_sim``
    (pairwise similarity matrix) and ``titles`` (Series of book titles) as they
    are bound at call time.

    Parameters
    ----------
    title : str
        A title present in ``indices``; an unknown title raises ``KeyError``.

    Returns
    -------
    pandas.Series
        Up to 19 titles ordered by descending similarity. The queried book
        itself is never included.
    """
    idx = indices[title]
    # Exclude the queried book by position instead of assuming it sorts first:
    # many books share the same genre and therefore tie on the top score, so
    # the stable sort may place another book ahead, and slicing off element 0
    # would then discard a real neighbour and keep the book itself.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    book_indices = [i for i, _ in sim_scores[:19]]
    return titles.iloc[book_indices]

In [None]:
# First 18 genre-based recommendations for a sample title.
genre_recommendations('The Hunger Games').iloc[:18]

In [None]:
# For every book: the fraction (out of 18) of our genre-based top-18 titles that
# also appear in the Goodreads list. Titles and their Goodreads lists are walked
# in lockstep, so no per-book boolean scan of the whole frame is needed; titles
# are unique after the de-duplication step, so each row is its own match.
genre_comparison = [
    len(set(genre_recommendations(title).head(18)).intersection(goodreads_titles)) / 18
    for title, goodreads_titles in zip(df['bookTitle'], df['cleaned_recommendations'])
]

In [None]:
# Summary statistics (count, min/max, mean, variance, skewness, kurtosis)
# of the per-book overlap scores of the genre-based engine.
stats.describe(genre_comparison)

In [None]:
# Histogram of the per-book overlap scores for the genre-based engine.
sns.set_context('notebook')
# Note: displot returns a FacetGrid (not a matplotlib Axes), hence the `.fig` access.
ax = sns.displot(genre_comparison, color='seagreen')
ax.fig.set_size_inches(15, 10)  # width, height in inches
plt.title('Genre based Recommender Performance', size=20)

This engine maxes out at 10/18 books but has a much better mean than the author-based engine. Looking at the distribution plot, we can also see the bins between the min and the max values more clearly.

# Recommendation based on Description

In [None]:
# TF-IDF over the free-text book descriptions. This rebinds the module-level
# tf / tfidf_matrix / cosine_sim names.
# min_df=1 (the library default) keeps every term that occurs at all, which is
# what the integer 0 meant; scikit-learn releases that validate parameters
# reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['bookDesc'])
# Rows are L2-normalised by default, so the dot product is the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# `titles` maps row position -> title; `indices` is the reverse lookup (title -> row label).
titles = df['bookTitle']
indices = pd.Series(df.index, index=titles)

# Function that gets book recommendations based on the cosine similarity score of book descriptions
def desc_recommendations(title):
    """Return the titles of the 19 books most similar to ``title``.

    Reads the module-level ``indices`` (title -> row position), ``cosine_sim``
    (pairwise similarity matrix) and ``titles`` (Series of book titles) as they
    are bound at call time.

    Parameters
    ----------
    title : str
        A title present in ``indices``; an unknown title raises ``KeyError``.

    Returns
    -------
    pandas.Series
        Up to 19 titles ordered by descending similarity. The queried book
        itself is never included.
    """
    idx = indices[title]
    # Exclude the queried book by position instead of assuming it sorts first:
    # if another book ties with it on the top score, the stable sort may place
    # that book ahead, and slicing off element 0 would then discard a real
    # neighbour and keep the book itself.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    book_indices = [i for i, _ in sim_scores[:19]]
    return titles.iloc[book_indices]

In [None]:
# For every book: the fraction (out of 18) of our description-based top-18 titles
# that also appear in the Goodreads list. Titles and their Goodreads lists are
# walked in lockstep, so no per-book boolean scan of the whole frame is needed;
# titles are unique after the de-duplication step, so each row is its own match.
desc_comparison = [
    len(set(desc_recommendations(title).head(18)).intersection(goodreads_titles)) / 18
    for title, goodreads_titles in zip(df['bookTitle'], df['cleaned_recommendations'])
]

In [None]:
# Summary statistics (count, min/max, mean, variance, skewness, kurtosis)
# of the per-book overlap scores of the description-based engine.
stats.describe(desc_comparison)

In [None]:
# Histogram of the per-book overlap scores for the description-based engine.
sns.set_context('notebook')
# Note: displot returns a FacetGrid (not a matplotlib Axes), hence the `.fig` access.
ax = sns.displot(desc_comparison, color='seagreen')
ax.fig.set_size_inches(15, 10)  # width, height in inches
plt.title('Description based Recommender Performance', size=20)

This is (subjectively) the best-looking distribution plot so far, with clear bins, but it has a lower mean than the genre-based engine and maxes out at only 7/18 books.

# Recommendation based on Authors, Genres, and Description

In [None]:
# Build one text blob per book: authors, leading genre and description joined by
# single spaces (missing values become empty strings). The row-wise join keeps
# df's own index, so the assignment stays aligned even if the index is not a
# default 0..n-1 range.
corpus_columns = ['bookAuthors', 'cleaned_bookGenres', 'bookDesc']
df['corpus'] = df[corpus_columns].fillna('').agg(' '.join, axis=1)

In [None]:
# TF-IDF over the combined authors + genre + description text. This rebinds the
# module-level tf / tfidf_matrix / cosine_sim names.
# min_df=1 (the library default) keeps every term that occurs at all, which is
# what the integer 0 meant; scikit-learn releases that validate parameters
# reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['corpus'])
# Rows are L2-normalised by default, so the dot product is the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# `titles` maps row position -> title; `indices` is the reverse lookup.
titles = df['bookTitle']
indices = pd.Series(df.index, index=df['bookTitle'])

# Function that gets book recommendations based on the cosine similarity score of the combined corpus
def final_recommendations(title):
    """Return the titles of the 19 books most similar to ``title``.

    Reads the module-level ``indices`` (title -> row position), ``cosine_sim``
    (pairwise similarity matrix) and ``titles`` (Series of book titles) as they
    are bound at call time.

    Parameters
    ----------
    title : str
        A title present in ``indices``; an unknown title raises ``KeyError``.

    Returns
    -------
    pandas.Series
        Up to 19 titles ordered by descending similarity. The queried book
        itself is never included.
    """
    idx = indices[title]
    # Exclude the queried book by position instead of assuming it sorts first:
    # if another book ties with it on the top score, the stable sort may place
    # that book ahead, and slicing off element 0 would then discard a real
    # neighbour and keep the book itself.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    book_indices = [i for i, _ in sim_scores[:19]]
    return titles.iloc[book_indices]

In [None]:
# For every book: the fraction (out of 18) of our combined-corpus top-18 titles
# that also appear in the Goodreads list. Titles and their Goodreads lists are
# walked in lockstep, so no per-book boolean scan of the whole frame is needed;
# titles are unique after the de-duplication step, so each row is its own match.
final_comparison = [
    len(set(final_recommendations(title).head(18)).intersection(goodreads_titles)) / 18
    for title, goodreads_titles in zip(df['bookTitle'], df['cleaned_recommendations'])
]
# Summary statistics of the overlap scores (displayed as the cell output).
stats.describe(final_comparison)

In [None]:
# Histogram of the per-book overlap scores for the combined engine.
sns.set_context('notebook')
# Note: displot returns a FacetGrid (not a matplotlib Axes), hence the `.fig` access.
ax = sns.displot(data=final_comparison, color='seagreen')
ax.fig.set_size_inches(15, 10)  # width, height in inches
plt.title('Authors + Genre + Description based Recommender Performance', size=20)

# Comparison between the engines

In [None]:
sns.set_context('notebook')

# Overlay one density curve per engine on the same axes. The legend labels are
# matched to the curves by plotting order, so keep both sequences in sync.
for scores in (authors_comparison, genre_comparison, desc_comparison, final_comparison):
    sns.kdeplot(scores)
plt.legend(labels=['author-based', 'genre-based', 'description-based','all'])

It is hard to decide which one of these 4 engines performs the best. Based on the mean alone, the genre-based engine produces the best results. It is also the engine with the fewest 0/18 results. By contrast, the author-based engine performs the worst, with the most 0/18 results, as can be seen clearly on the plot above.

What we have learned, and now know for sure, is that Goodreads may not use exactly the kind of simple (or naive) recommendation system that we have here. They could be using a more sophisticated engine, or they could simply base their recommendations on users' history (i.e. if most people who like book A also like book B), or a combination of both.