In [1]:
import pandas as pd
from ast import literal_eval

# Set the float format
pd.options.display.float_format = '{:.2f}'.format

# Import data from the goodbooks-10k repo
books_df = pd.read_csv('https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/books_enriched.csv', index_col=[0], converters={"genres": literal_eval})
books_ratings = pd.read_csv('https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/ratings.csv')

# Display three entries
books_df.head(3).T

Unnamed: 0,0,1,2
index,0,1,2
authors,['Suzanne Collins'],"['J.K. Rowling', 'Mary GrandPré']",['Stephenie Meyer']
average_rating,4.34,4.44,3.57
best_book_id,2767052,3,41865
book_id,1,2,3
books_count,272,491,226
description,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,Harry Potter's life is miserable. His parents ...,About three things I was absolutely positive.\...
genres,"[young-adult, fiction, fantasy, science-fictio...","[fantasy, fiction, young-adult, classics]","[young-adult, fantasy, romance, fiction, paran..."
goodreads_book_id,2767052,3,41865
image_url,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1361039443m...


In [2]:
# See what genres are numbers
non_list_entries = books_df[~books_df['genres'].apply(lambda x: isinstance(x, list))]
print(non_list_entries[['title', 'genres']])

Empty DataFrame
Columns: [title, genres]
Index: []


In [3]:
print(books_df['title'].sample(10))

8199    Island of the Blue Dolphins (Island of the Blu...
5658                        The Complete Poetry and Prose
6637                                    Mornings in Jenin
594                                                 Heidi
2900         The High King (The Chronicles of Prydain #5)
5143    First They Killed My Father: A Daughter of Cam...
4044                        The Never War (Pendragon, #3)
110     Wicked: The Life and Times of the Wicked Witch...
2253                     Princeps' Fury (Codex Alera, #5)
8278                                      The Vacationers
Name: title, dtype: object


Preprocessing
We're going to keep the following columns.

authors
average_rating
genres
language_code
title
description

In [4]:
# Columns of the dataset we are interested in for this model
columns_to_keep = \
  ['authors', 'average_rating', 'genres', 'language_code', 'title', 'description']

# Subset of the dataset with only the above columns
books_df_subset = books_df[columns_to_keep]

# Extract the first author of the authors list and use it.
books_df_subset['author'] = books_df_subset['authors'].apply(lambda x: x[0]).astype(str)
# Count of unique authors

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_df_subset['author'] = books_df_subset['authors'].apply(lambda x: x[0]).astype(str)


In [5]:
# Remove NaNs from the description column
books_df_subset['description'] = books_df_subset['description'].fillna('')

# Remove NaNs from the original_title column
books_df_subset['title'] = books_df_subset['title'].fillna('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_df_subset['description'] = books_df_subset['description'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_df_subset['title'] = books_df_subset['title'].fillna('')


Feature extraction

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import MultiLabelBinarizer

vectorizer = TfidfVectorizer()
hasher = FeatureHasher(n_features=count_of_unique_authors, input_type='string')
mlb = MultiLabelBinarizer()

# Hash the authors
#author_features = hasher.transform(books_df_subset['author'])

# Binarize the genres column
binarized_genres = mlb.fit_transform(books_df_subset['genres'])

# One-hot encode the language_code
books_df_subset = pd.get_dummies(books_df_subset, columns=['language_code'])

# Vectorize the title column
title_features  = vectorizer.fit_transform(books_df_subset['title'])

# Vectorize the description column
description_features = vectorizer.fit_transform(books_df_subset['description'])

NameError: name 'count_of_unique_authors' is not defined

In [7]:
from scipy.sparse import hstack

# Composite feature Vector
composite_feature_vector = hstack([binarized_genres, title_features, description_features])

NameError: name 'binarized_genres' is not defined

Similarity measure

Cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(composite_feature_vector)

Euclidean measure

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(composite_feature_vector)

Manhattan distance

In [None]:
from sklearn.metrics.pairwise import manhattan_distances
manhattan_dist = manhattan_distances(composite_feature_vector)

Jaccard Similarity

In [None]:
from sklearn.metrics import jaccard_score
import numpy as np

composite_feature_vector = composite_feature_vector.toarray()
# Note: jaccard_score works a bit differently, so loop over pairs of rows in your data
jaccard_sim = \
 [jaccard_score(composite_feature_vector[i], composite_feature_vector[j]) for i in range(composite_feature_vector.shape[0]) for j in range(composite_feature_vector.shape[0])]

Test

In [None]:
indices = pd.Series(books_df_subset.index, index=books_df_subset['title']).drop_duplicates()

In [None]:
def recommend_items(title, cosine_sim=cosine_sim):
    # Get the index of the item that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all items with that item
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the items based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar items
    sim_scores = sim_scores[1:11]

    # Get the item indices
    item_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar items
    return books_df_subset['title'].iloc[item_indices]