In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the bestsellers CSV from the read-only input directory into a DataFrame.
df = pd.read_csv('../input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv')

In [None]:
# Preview the first rows to see the available columns and some sample values.
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Column names, dtypes, non-null counts and memory usage of the DataFrame.
df.info()

The above blocks of code give a quick glimpse into the dataset we are going to explore. The objective of the following blocks is to do this exploration as best as we can, possibly with interactive visualizations. 
We hope these visualizations will present us with new insights about the underlying data.

As I give a quick look at the dataset columns, the things I would want to know more about are these:
1. What is the proportion of non-fiction and fiction in the mix? Does it vary on a yearly basis? Which genre shows the most bestsellers over the years?
2. How is it that the lowest count in the Reviews column is 37?
3. Are there books or authors that seem to be repeating in their best seller status over these years?
4. What is the average price among the bestsellers over the years? Same query applies for average rating, and average number of reviews. Is there any trend seen?
5. Is there a correlation between the user rating and the price of the books?
6. Similarly is there a correlation between reviews and price columns? I am holding an assumption here that more reviews signify wider readership, ergo popularity. 
7. Finally, what do wordclouds generated from the book titles and author names look like? Are some words more prominent than others?
    

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Word cloud of author names.
# Plot settings (matplotlib defaults in the trailing comments).
mpl.rcParams['font.size'] = 12                 # default 10
mpl.rcParams['savefig.dpi'] = 100              # default 72
mpl.rcParams['figure.subplot.bottom'] = .1

# English stop words; defined here because the book-title cloud cell uses them.
stopwords = set(STOPWORDS)

# Join every author name into one string. str(df['Author']) would instead be
# the Series' printed repr, which pandas truncates for long Series and which
# also contains the index numbers and the "Name: ..., dtype: ..." footer, so
# the cloud would be built from a handful of rows plus metadata.
author_text = ' '.join(df['Author'].astype(str))

wordcloud = WordCloud(background_color='white',
                      stopwords=set(),          # names are not filtered for stop words
                      max_words=200,
                      max_font_size=40,
                      random_state=42           # fixed seed -> reproducible layout
                      ).generate(author_text)

fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
# Write the file before show() so it is saved while the figure is still open.
fig.savefig("bookauthors.png", dpi=900)
plt.show()

In [None]:
# Word cloud of book titles.
# Join every title into one string. str(df['Name']) would instead be the
# Series' printed repr, which pandas truncates for long Series and which also
# contains the index numbers and the "Name: ..., dtype: ..." footer.
title_text = ' '.join(df['Name'].astype(str))

wordcloud_2 = WordCloud(background_color='white',
                        stopwords=stopwords,    # English stop words from an earlier cell
                        max_words=200,
                        max_font_size=40,
                        random_state=42         # fixed seed -> reproducible layout
                        ).generate(title_text)

fig = plt.figure(2)
plt.imshow(wordcloud_2)
plt.axis('off')
# Write the file before show() so it is saved while the figure is still open.
fig.savefig("booktitles.png", dpi=900)
plt.show()

Even though the wordcloud scripts run well, the clouds do not add much information apart from magnifying words such as Doubting, Badass, and Stop. A similar observation applies to the author-name wordcloud, where the enlarged word is Sincero.

In [None]:
# Split the dataset into one DataFrame per year, each sorted by user rating
# (highest first) and re-indexed from 0.
# The years are read from the data instead of a hard-coded range, so a year
# missing from the file no longer raises KeyError in get_group() and an extra
# year is no longer silently left out.
grouped_years = df.groupby('Year')
years = sorted(df['Year'].unique())
df_yearwise = [
    grouped_years.get_group(year)
    .sort_values('User Rating', ascending=False)
    .reset_index(drop=True)
    for year in years
]

In [None]:
# Before the trend analysis, look at the overall share of each genre.
# Scale to percent first and round afterwards: rounding the fraction and then
# multiplying by 100 can leave floating-point noise in the displayed values.
store = (df['Genre'].value_counts(normalize=True) * 100).round(1)

print("Percent of each genre in the bestseller list 2009-2019")
store

In [None]:
# Genre trend analysis: number of bestsellers per genre for each year.
# fill_value=0 shows a genre that is absent in some year as 0 instead of NaN.
genre_unstacked = df.groupby('Year').Genre.value_counts().unstack(fill_value=0)
print(genre_unstacked)

# Plot on an explicitly created Axes. DataFrame.plot() without ax= opens a
# figure of its own, so a figure created beforehand with plt.figure() stays
# empty and saving it writes a blank image.
fig, ax = plt.subplots(figsize=(6, 10))
genre_unstacked.plot(kind='bar', ax=ax)
ax.set_ylabel('Number of bestsellers')
# Write the file before show() so it is saved while the figure is still open.
fig.savefig("trend_genre.png", dpi=900)
plt.show()

The above code made me realize that I don't really need to split the dataset into different years for trend analysis. I can instead use the unstack function. Now let's get crazy and do the same for the rest of the columns.

In [None]:
# Trend analysis of the average price per genre and year.
price_unstacked = df.groupby(['Year','Genre']).Price.mean().unstack()
print(price_unstacked)

# Plot on an explicitly created Axes. DataFrame.plot() without ax= opens a
# figure of its own, so a figure created beforehand with plt.figure() stays
# empty and saving it writes a blank image.
fig, ax = plt.subplots()
price_unstacked.plot(kind='line', title = 'Trend in price of the best sellers', ax=ax)
ax.set_ylabel('Average price')
# Write the file before show() so it is saved while the figure is still open.
fig.savefig("trend_price.png", dpi=900)
plt.show()

In [None]:
# Trend analysis of the average user rating per genre and year.
rating_unstacked = df.groupby(['Year','Genre'])['User Rating'].mean().unstack()
print(rating_unstacked)

# Plot on an explicitly created Axes. DataFrame.plot() without ax= opens a
# figure of its own, so a figure created beforehand with plt.figure() stays
# empty and saving it writes a blank image.
fig, ax = plt.subplots()
rating_unstacked.plot(kind='line', title = 'Trend in rating of the best sellers', ax=ax)
ax.set_ylabel('Average user rating')
# Write the file before show() so it is saved while the figure is still open.
fig.savefig("trend_rating.png", dpi=900)
plt.show()

In [None]:
# Trend analysis of the average number of reviews per genre and year
# (rounded to whole reviews).
reviews_unstacked = df.groupby(['Year','Genre'])['Reviews'].mean().round().unstack()
print(reviews_unstacked)

# Plot on an explicitly created Axes. DataFrame.plot() without ax= opens a
# figure of its own, so a figure created beforehand with plt.figure() stays
# empty and saving it writes a blank image.
fig, ax = plt.subplots()
reviews_unstacked.plot(kind='line', title = 'Trend in average number of reviews of the best sellers', ax=ax)
ax.set_ylabel('Average number of reviews')
# Write the file before show() so it is saved while the figure is still open.
fig.savefig("trend_reviews.png", dpi=900)
plt.show()

The above charts give us a general overview about how each of the features varied for the bestsellers over the years.
Finally let's check if any of the authors or books are repeatedly nominated as the best sellers.

In [None]:
# Authors that occur in more than one row of the dataset, with their row
# counts (most frequent first).
author_counts = df['Author'].value_counts()
repeated_authors = author_counts[author_counts != 1]

In [None]:
# Authors with more than one entry in the dataset and how often each appears.
repeated_authors

In [None]:
# Book titles that occur in more than one row of the dataset, with their row
# counts (most frequent first); the cell output is how many such titles exist.
title_counts = df['Name'].value_counts()
repeated_titles = title_counts[title_counts != 1]
repeated_titles.count()

In [None]:
# The most frequently repeated titles and how many times each appears.
repeated_titles.head()

This EDA gives us an overview of how much information can be gathered even from a small-scale dataset like the one above. As a final note, we see that some authors and titles hold bestseller status repeatedly. It would be interesting to see in which years they appear, whether those years are adjacent, and whether the distribution is normal.

With this I end this notebook. 
Happy learning!