In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject CSS that sets the notebook `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

import holoviews as hv
from holoviews import opts

# Select Bokeh as the HoloViews plotting backend.
hv.extension('bokeh')

In [None]:
# Location of the bestsellers CSV inside the Kaggle input directory.
# (Adjacent string literals are concatenated, so the path is unchanged.)
fname = (
    '/kaggle/input/amazon-top-50-bestselling-books-2009-2019/'
    'bestsellers with categories.csv'
)

# Read the whole CSV into the frame every later cell works on.
df = pd.read_csv(filepath_or_buffer=fname)

# User Review (90 percentile-distribution over years) - just out of interest

- the time series seems (visually) quite stationary - further tests required

In [None]:
# Aggregations applied to 'User Rating' within each year: the 5th and 95th
# percentiles (bounds of the central 90% band), the median and the mean.
rating_aggregations = [
    lambda x: x.quantile(0.05),
    lambda x: x.quantile(0.95),
    'median',
    'mean',
]

# The two lambdas show up as '<lambda_0>' / '<lambda_1>' in the result columns;
# they are relabelled 'min' / 'max' (band bounds, not true extremes).
band_names = {
    '<lambda_0>': 'min',
    '<lambda_1>': 'max',
}

user_rating_year = (
    df.groupby('Year')
    .agg({'User Rating': rating_aggregations})
    .rename(columns=band_names)
)
years = user_rating_year.index

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 8))
ax.set_title('User Rating vs year')

# Shaded band between the 5th and 95th percentile of each year.
ax.fill_between(
    years,
    user_rating_year['User Rating', 'min'],
    user_rating_year['User Rating', 'max'],
    label='90%',
)

# Central tendency lines drawn on top of the band: median in red, mean in yellow.
for stat, colour in (('median', 'r'), ('mean', 'y')):
    sns.lineplot(
        x=years,
        y=user_rating_year['User Rating', stat],
        ax=ax,
        color=colour,
        label=stat,
    )

ax.legend()
ax.set_xlim(2009, 2019)
ax.set_ylim(0, 5)
ax.set_ylabel('User Rating')

# Book title analysis - unsupervised attempts via rudimentary nlp

### Is it at all possible - without any further info, such as book summaries - to extract insights/patterns from the top-50 books (disregarding the time component)?

# 1. tfidf

In [None]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel, TfidfModel
from gensim import similarities
from gensim.parsing.preprocessing import (
    remove_stopwords,
    preprocess_string,
    strip_punctuation,
    strip_numeric,
    strip_non_alphanum,
    strip_short,
)

# Ordered text filters applied to every book title by `preprocess_string`
# (each filter maps a string to a string; the result is then tokenised).
#
# Punctuation is stripped BEFORE stopword removal. `remove_stopwords` matches
# whole whitespace-separated tokens, so with the previous order a stopword
# glued to punctuation (e.g. "the," or "of:") was not recognised and survived
# into the token list. This order matches gensim's default filter order.
custom_filters = (
    str.lower,
    strip_numeric,
    strip_punctuation,
    remove_stopwords,
    # Drop tokens shorter than 3 characters (e.g. leftovers from contractions).
    lambda s: strip_short(s, minsize=3),
)

# Token list per title, stored next to the raw title for the later cells.
df['pre_name'] = df.Name.apply(
    lambda n: preprocess_string(n, custom_filters)
)

In [None]:
# Vocabulary built from the tokenised titles, and the bag-of-words corpus
# (one sparse vector per title).
dct = Dictionary(df.pre_name)
corpus = [dct.doc2bow(n) for n in df.pre_name]

# tf-idf weighting fitted on the corpus, plus a dense cosine-similarity index
# over the weighted vectors. `num_features` is passed explicitly so the index
# does not have to scan the corpus to infer the vocabulary size.
tfidf = TfidfModel(corpus)
index = similarities.MatrixSimilarity(tfidf[corpus], num_features=len(dct))

# String accessor over the titles with punctuation, other non-alphanumeric
# characters and spaces removed. This definition was commented out while the
# two lines below still used `str_check`, which raised a NameError on a
# fresh kernel; it is restored here.
str_check = (
    df.Name
    .apply(strip_punctuation)
    .apply(strip_non_alphanum)
    .str
    .replace(' ', '')
    .str
)

# 1/0 flags: does the cleaned title consist only of letters / only of
# letters and digits?
df['name_isalpha'] = str_check.isalpha().astype(int)
df['name_isalphanumeric'] = str_check.isalnum().astype(int)

In [None]:
import umap

# UMAP is stochastic: without a fixed seed the embedding (and therefore the
# plot discussed below) changes on every run. Pin the seed for reproducibility.
UMAP_RANDOM_STATE = 42

# Each title is represented by its row of tf-idf cosine similarities to all
# titles; UMAP reduces those rows to its default low-dimensional embedding.
data = umap.UMAP(random_state=UMAP_RANDOM_STATE).fit_transform(
    index[tfidf[corpus]]
)

In [None]:
# Scatter of the embedded titles.
pts = hv.Points(data)

# Text annotations at the same coordinates, labelled with the raw book title.
label_columns = {
    ('x', 'y'): data,
    'text': df.Name,
}
labels = hv.Labels(label_columns, kdims=['x', 'y'], vdims='text')

# Overlay points and labels; shrink the label font and offset the text
# horizontally from its point.
overlay = pts * labels
overlay.opts(opts.Labels(text_font_size='8pt', xoffset=0.08))

# Last expression of the cell: the titled, resized overlay that gets rendered.
overlay.options(
    title='Dim. reduced similarity matrix based on tf-idf',
    width=1000,
    height=800,
)

### feel free to explore the plot:
#### It is interesting to see that such a simple approach, based solely on term and inverse document frequency (without any further magic such as word2vec) and with minimal preprocessing, can already give some clues about clusters:
#### - it grasps the idea that, for instance, "Laugh-Out-Loud Jokes for Kids" and "Knock-Knock Jokes for Kids" are very similar and significantly different from other titles
#### - multiple cookbooks and diets are often clustered together, same goes with coloring books ("Unicorn coloring book" and "Adult Coloring Books") - did not know they were this popular
#### - otherwise kind of a mixed bag that requires further processing: e.g. books whose titles contain suffixes such as ": A Memoir" or ": A Novel" form clusters, and duplicate titles should have been removed


# word2vec

In [None]:
import gensim.downloader

# Print the keys of the 'models' entry of the gensim-downloader catalogue.
available_models = gensim.downloader.info()['models']
print(list(available_models))

In [None]:
#fasttext_vectors = gensim.downloader.load('fasttext-wiki-news-subwords-300')#

# compute word distance based distance map (earth mover distance/wasserstein distance)

In [None]:
#N = df.shape[0]
#dmap = np.zeros((N,N))
#for i in range(N):
#    for j in range(N):
#        dmap[i,j] = fasttext_vectors.wmdistance(
#            df.pre_name.iloc[i], 
#            df.pre_name.iloc[j],
#        )

In [None]:
#dmap[dmap == np.inf] = np.median(dmap) # impute inf val

# visualize - MDS would have probably been more suitable for dmap

In [None]:
#data = umap.UMAP().fit_transform(dmap)
#pts = hv.Points(data)
#labels = hv.Labels({
#    ('x', 'y'): data,
#    'text': df.Name,
#}, ['x', 'y'], 'text')
#overlay = (pts * labels)
#overlay.opts(
#    opts.Labels(text_font_size='8pt'),
#)
#overlay.options(
#    width=1000,
#    height=800
#)