In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The aim of this notebook is to perform an exploratory data analysis of the Amazon Top 50 Bestselling Books 2009 - 2019 dataset. I'm new to data analysis and data science, but I will try to explore the public's interest in reading.

# Quick analysis of the data

In [None]:
# Load the bestsellers CSV into a DataFrame and preview the first 10 rows.
# The separator is passed by keyword: pandas 2.0 made every read_csv argument
# after the file path keyword-only, so a positional ',' raises a TypeError.
dataset = pd.read_csv(
    '/kaggle/input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv',
    sep=',',
)
dataset.head(10)

In [None]:
# Report the frame's dimensions, then the dtype of each column.
n_rows, n_cols = dataset.shape
print('Dataset contains %d observations with %d variables' % (n_rows, n_cols))
print(dataset.dtypes)

The dataset contains 550 observations with 7 variables :

|Variable name|Type|
|:--:|:--:|
|Name|qualitative|
|Author|qualitative|
|User Rating|quantitative|
|Reviews|quantitative|
|Price|quantitative|
|Year|qualitative|
|Genre|qualitative|     

In [None]:
# Summary statistics; with default arguments describe() covers the numeric columns.
dataset.describe()

Among these 550 bestselling entries from the past 11 years, the median rating was 4.7, while the lowest rating was only 3.3 (I guess those who bought it were disappointed).  

We can see that readers are quite active in giving reviews. The most-reviewed book received 87841 reviews, about 10 times the median, while the least-reviewed one received only 37 (though it might not be the one rated 3.3)!  

The average price of these books was 11 dollars. The most expensive book cost 105 dollars, while the cheapest one was free! Seriously? Or perhaps it is an anomaly.

Lastly, the year of the bestselling list will be explored later.

In [None]:
# Summary of the object-dtype (string) columns only, selected via include='object'.
dataset.describe(include='object')

We can observe that there are:  
- 351 unique book names. "Publication Manual of the American Psychological Association, 6th Edition" entered the TOP50 bestselling list 10 times.   
- 248 unique author names. Jeff Kinney's works entered the list 12 times. We can further explore which books they are.
- only 2 kinds of books, of which more than half were non-fiction.

## Anomaly processing
We can see in the dataframe description that some items are priced at 0, which is not normal. Perhaps they are freebies. We handle them separately for a better price-value analysis.  
These books are listed below:

In [None]:
# Rows with a price of 0, keeping only the last row for each book name.
is_free = dataset['Price'].eq(0)
dataset.loc[is_free].drop_duplicates(subset='Name', keep='last')

## Missing value processing

In [None]:
# Count missing values in each column.
dataset.isna().sum()

The dataset has no missing values.

# General analysis and Visualization

In [None]:
# One box plot per numeric column, side by side.
# The figure is sized with figsize so the axes stay inside the canvas.
# subplots_adjust expects figure fractions (0-1); the previous left=5/right=8
# placed the axes outside the figure area.
ncol = ['User Rating', 'Reviews', 'Price']
fig, axes = plt.subplots(1, len(ncol), figsize=(15, 4))
for ax, col in zip(axes, ncol):
    ax.boxplot(dataset[col].values)
    ax.set_xlabel(col)

fig.subplots_adjust(wspace=0.5)
plt.show()

In [None]:
# Count list entries per (Year, Genre), then pivot the genres into columns so
# each line is selected by label rather than by stride-2 slicing of the values.
# fill_value=0 keeps the series aligned even if a year lacks one genre.
yearfiction = dataset.groupby(['Year', 'Genre']).size()
genre_by_year = yearfiction.unstack(fill_value=0)

plt.title('Fiction/Non fiction books by year')
for genre in genre_by_year.columns:
    # x values come from the data's own years instead of a hard-coded range.
    plt.plot(genre_by_year.index, genre_by_year[genre], label=genre)
plt.xlabel('Year')
plt.ylabel('Number of bestselling books')
plt.legend()
plt.show()

Non-fiction books are generally more popular than fiction. Fiction peaked in 2014, but the next year non-fiction surpassed it again and soared to a new historical peak.

In [None]:
# Keep one row per book name (the last occurrence) so repeated list entries
# of the same title are not counted several times in the correlation.
data_filter = dataset.drop_duplicates(subset='Name', keep='last')
# Select the three numeric columns by name instead of by position
# (iloc[:, 2:5]), so a change in column order cannot silently pick others.
df = data_filter[['User Rating', 'Reviews', 'Price']]
plt.figure()
sns.heatmap(df.corr(), annot=True, vmax=1, square=True, cmap="Blues")
plt.show()

In [None]:
# Pairwise scatter plots of the numeric columns, one panel per (x, y) pair.
# A loop replaces three copy-pasted panels, and figsize sizes the figure;
# subplots_adjust expects figure fractions (0-1), so the previous
# left=5/right=8 placed the axes outside the figure area.
pairs = [
    ('Reviews', 'User Rating'),
    ('Reviews', 'Price'),
    ('Price', 'User Rating'),
]
fig, axes = plt.subplots(1, len(pairs), figsize=(15, 4))
for ax, (x_col, y_col) in zip(axes, pairs):
    ax.scatter(df[x_col], df[y_col])
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
fig.subplots_adjust(wspace=0.5)
plt.show()

These three quantitative variables are not strongly correlated.

# Deeper analysis
## Most popular authors

In [None]:
# Number of list entries per author; value_counts sorts most frequent first.
freq = dataset['Author'].value_counts()
freq

In [None]:
# Authors with more than 10 entries in the list.
names = freq[freq > 10].index

for name in names:
    print('Author name : %s , Times : %d' % (name, freq[name]))

# Among those, keep the authors seen in more than 10 distinct years.
every_year = [
    name for name in names
    if np.unique(dataset.loc[dataset['Author'] == name, 'Year']).size > 10
]
print('Authors who have at least one book in the TOP50 bestselling list each year (2009-2019) : ', end='')
print(''.join('%s, ' % name for name in every_year), end='')

In [None]:
# Every list entry by the two named authors, ordered by author.
is_selected = dataset['Author'].isin(['Jeff Kinney', 'Gary Chapman'])
dataset.loc[is_selected].sort_values(by='Author')

Jeff Kinney and Gary Chapman are the two authors whose books entered the bestselling list every year. The former is famous for children's fiction; the latter is best known for his *The Five Love Languages* series on human relationships.

## Most popular books
### Best rated books

In [None]:
# Rows whose rating equals the highest rating in the dataset.
highest_rating = dataset['User Rating'].max()
dataset.loc[dataset['User Rating'] == highest_rating]

In [None]:
# Rows whose rating equals the lowest rating in the dataset.
lowest_rating = dataset['User Rating'].min()
dataset.loc[dataset['User Rating'] == lowest_rating]

As I guessed earlier, the lowest-rated book does not have the fewest reviews. I don't know this book, but what happened to J.K. Rowling > < ?

### Most reviewed books

In [None]:
# Rows whose review count equals the largest review count in the dataset.
most_reviews = dataset['Reviews'].max()
dataset.loc[dataset['Reviews'] == most_reviews]

In [None]:
# Rows whose review count equals the smallest review count; print the
# name(s) first, then display the full rows.
least_reviewed = dataset.loc[dataset['Reviews'] == dataset['Reviews'].min()]
print(least_reviewed['Name'].values)
least_reviewed

### Most expensive books

In [None]:
# Rows whose price equals the highest price in the dataset.
highest_price = dataset['Price'].max()
dataset.loc[dataset['Price'] == highest_price]

### Price-Value analysis
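Let's see if there is a price-value trade-off and find the books with both good prices and good ratings. We will create a new criterion by means of a Bayesian average: for a book with rating $S$ and $a$ reviews, the weighted rating is $\frac{a}{a+b}S + \frac{b}{a+b}C$, where $b$ is the mean number of reviews and $C$ is the mean rating. For a more detailed explanation, please see the link in the "Reference" section.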

#### TOP 10 Best rated books with weighted ratings

In [None]:
# Bayesian-average ("weighted") rating for every de-duplicated book whose
# price is not 0: each rating is pulled towards the mean rating, and the pull
# weakens as the book's review count grows relative to the mean review count.
# .copy() makes data_filter2 an independent frame, so adding a column below
# does not assign onto a filtered slice (pandas' SettingWithCopyWarning).
data_filter2 = data_filter[data_filter.Price != 0].copy()
mean_reviews = data_filter2['Reviews'].mean()
mean_rating = data_filter2['User Rating'].mean()
reviews = data_filter2['Reviews'].values
ratings = data_filter2['User Rating'].values

# Vectorised form of the per-row formula; same arithmetic, no Python loop.
w_rating = (ratings * reviews / (reviews + mean_reviews)) + (mean_rating * mean_reviews / (reviews + mean_reviews))
data_filter2['Weighted Rating'] = w_rating

sorted_data = data_filter2.sort_values(by='Weighted Rating', ascending=False)
sorted_data.head(10)

In [None]:
# Bar chart of the 10 books with the highest weighted rating.
top10_rating_book = data_filter2.nlargest(10, ['Weighted Rating'])
plt.figure(figsize=(15, 5))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, so the positional form raises a TypeError there.
sns.barplot(x=top10_rating_book['Name'], y=top10_rating_book['Weighted Rating'], alpha=0.8)
plt.title('Top 10 Books with Best Ratings In Amazon Best Seller 2009-2019')
plt.xticks(rotation=90)
# Zoom the y axis onto the narrow range spanned by the top 10 so the
# differences between bars are visible.
plt.ylim(top10_rating_book['Weighted Rating'].min() - 0.01, top10_rating_book['Weighted Rating'].max() + 0.01)
plt.xlabel('Book names')
plt.ylabel('Weighted Ratings')
plt.show()

In [None]:
# Price against weighted rating, split into four labelled quadrants by a
# vertical line at rating 4.1 and a horizontal line at half the top price + 5.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(1, 1, 1)

price_ceiling = data_filter2['Price'].max() + 10
price_divider = data_filter2['Price'].max() / 2 + 5

ax.scatter(data_filter2['Weighted Rating'], data_filter2['Price'])
ax.plot([4.1, 4.1], [0, price_ceiling], linewidth=0.2, color='red')
ax.plot([3.2, 5.0], [price_divider, price_divider], linewidth=0.2, color='red')
ax.set_xlim(3.2, 5.0)
ax.set_ylim(0, price_ceiling)

# Quadrant captions as (x, y, text), drawn in the same order as before.
quadrant_labels = [
    (3.6, 30, 'The Lucky Ones That Made It'),
    (3.6, 80, 'Not Worth It'),
    (4.6, 30, 'Worth It'),
    (4.6, 80, 'Deservedly Expensive'),
]
for x_pos, y_pos, caption in quadrant_labels:
    ax.text(x_pos, y_pos, caption, fontsize=12, ha='center')

ax.set(title='Amazon Best Selling Book Price - Value Distribution', xlabel='Weighted Ratings', ylabel='Price')
plt.show()

We can see that almost all books listed have a good price-value performance. 

## Interesting discovery

As we saw earlier, the American Psychological Association appears 10 times in the author list. The book is "Publication Manual of the American Psychological Association, 6th Edition". It seems that many writers, editors, and students bought it to develop better writing skills.

In [None]:
# Entries credited to the American Psychological Association: print the
# distinct book name(s), then display the matching rows.
apa_rows = dataset.loc[dataset['Author'] == 'American Psychological Association']
print(np.unique(apa_rows['Name']))
apa_rows

# Reference
- The idea of the price-value analysis derives from **paotografi**'s notebook: https://www.kaggle.com/paotografi/amazon-2009-2019-best-selling-book-eda. For a more detailed explanation: https://www.codementor.io/@arpitbhayani/solving-an-age-old-problem-using-bayesian-average-15fy4ww08p.


Not finished :)   

Next steps:
- find the top 10 best-rated or most-reviewed authors using weighted ratings;  
- present more beautiful figures.  

Please give me some advice about the ideas, the code, my English... :)  
If you like this notebook, please give it a vote. Thank you! :)