In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

### Importing the Data

In [None]:
# Location of the bestsellers CSV, relative to the notebook's working directory.
BESTSELLERS_CSV = '../input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv'

# Load the dataset; the bare `df` renders the frame as the cell output.
df = pd.read_csv(filepath_or_buffer=BESTSELLERS_CSV)
df

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Cast the Price column to 64-bit float, then re-check the dtypes.
df = df.astype({'Price': np.float64})
df.info()

In [None]:
# Display the frame again now that Price has been cast to float.
df

**Checking for Duplicates**

In [None]:
# Count how often each title occurs; any count above 1 is a repeated Name
# (e.g. rows 546:549 in the frame displayed above).
name_counts = df['Name'].value_counts()
name_counts

In [None]:
# Keep one row per title (the last occurrence). The frame is reassigned rather
# than mutated with inplace=True, so the step is explicit and chain-friendly.
# NOTE(review): if a title legitimately appears in several Years, only its last
# row survives, which affects the per-Year totals computed further down --
# confirm this is intended.
df = df.drop_duplicates(subset=['Name'], keep='last')

# Sanity check: no repeated titles remain after de-duplication.
assert df['Name'].is_unique

df

In [None]:
# Re-count the titles after de-duplication and order the counts from high to low.
name_counts_after = df['Name'].value_counts()
name_counts_after.sort_values(ascending=False)

In [None]:
# Count how many rows share each Reviews value.
RevDuplicates = df['Reviews'].value_counts()

# Reviews values that occur on more than one row.
repeated_review_counts = RevDuplicates[RevDuplicates > 1]
display(repeated_review_counts)  # `display` is provided by the notebook kernel

# Single combined view of every row whose Reviews value is shared with another
# row. Sorting puts rows with the same Reviews value next to each other, so the
# values do not have to be looked up one at a time.
df[df['Reviews'].isin(repeated_review_counts.index)].sort_values(by=['Reviews', 'Name'])



In [None]:
# Inspect the rows whose Reviews value is exactly 3503.
df.loc[df['Reviews'].eq(3503)]

In [None]:
# Inspect the rows whose Reviews value is exactly 32122.
df.loc[df['Reviews'].eq(32122)]

In [None]:
# Inspect the rows whose Reviews value is exactly 3759.
df.loc[df['Reviews'].eq(3759)]

In [None]:
# Inspect the rows whose Reviews value is exactly 5505.
df.loc[df['Reviews'].eq(5505)]

In [None]:
# Inspect the rows whose Reviews value is exactly 7251.
df.loc[df['Reviews'].eq(7251)]

Looks like 3 of the "Reviews" values that occur more than once belong to duplicate books. Assuming the more expensive versions are hardcover copies.

***(Does anyone know an easier way to loop through RevDuplicates and view one DataFrame that shows all of the above?)***

### Year by Year Analysis

In [None]:
# Total Reviews for each Year, 2009-2019.
ReviewsPerYear = df['Reviews'].groupby(df['Year']).sum()
ReviewsPerYear

In [None]:
# Rank the yearly review totals from highest to lowest.
ReviewsPerYear.sort_values(ascending = False)

In [None]:
# Line chart of the yearly review totals, with one x tick per entry of the index.
fig, ax = plt.subplots()
ax.plot(ReviewsPerYear)
ax.set(
    xlabel='Years',
    ylabel='Total Reviews',
    title='Total Reviews from 2009 - 2019',
    xticks=ReviewsPerYear.index,
)
plt.show()

In [None]:
# Total Reviews for each Genre across the whole 2009 - 2019 period.
reviews_by_genre_total = df['Reviews'].groupby(df['Genre']).sum()
reviews_by_genre_total

In [None]:
# Sum Reviews per (Year, Genre) pair, then move Genre into the columns so that
# each row is a Year and each column a Genre.
reviews_year_genre = df.groupby(['Year', 'Genre'])['Reviews'].sum()
ReviewsByGenre = reviews_year_genre.unstack(level='Genre')
ReviewsByGenre

In [None]:
# One line per genre showing its yearly review totals.
fig, ax = plt.subplots()
for genre_name in ('Fiction', 'Non Fiction'):
    ax.plot(ReviewsByGenre[genre_name], label=genre_name)
ax.set(
    xlabel='Years',
    ylabel='Total Reviews',
    title='Total Reviews from 2009 - 2019',
    xticks=ReviewsByGenre.index,
)
ax.legend(loc='best')
plt.show()

**Most Reviewed Fiction Authors from 2009 - 2019**

In [None]:
# Most reviewed Fiction authors from 2009 - 2019.
Fiction = df[df['Genre'] == 'Fiction']

# Author x Year table of summed Reviews. margins=True appends an 'All' row and
# an 'All' column holding the totals. aggfunc is passed as the string 'sum'
# because handing pandas the builtin `sum` is deprecated in recent versions.
FictionAuthorReivews = Fiction.pivot_table(
    columns=['Year'],
    index=['Author'],
    values=['Reviews'],
    fill_value=0,
    aggfunc='sum',
    margins=True,
)

# Remove the 'All' totals row by label (instead of assuming it sorts first and
# slicing it off positionally), then show the ten authors with the most reviews.
(
    FictionAuthorReivews
    .drop(index='All')
    .sort_values(by=('Reviews', 'All'), ascending=False)
    .head(10)
)

**Most Reviewed Non Fiction Authors from 2009 - 2019**

In [None]:
# Most reviewed Non Fiction authors from 2009 - 2019.
NonFiction = df[df['Genre'] == 'Non Fiction']

# Author x Year table of summed Reviews. margins=True appends an 'All' row and
# an 'All' column holding the totals. aggfunc is passed as the string 'sum'
# because handing pandas the builtin `sum` is deprecated in recent versions.
NonFictionAuthorReivews = NonFiction.pivot_table(
    columns=['Year'],
    index=['Author'],
    values=['Reviews'],
    fill_value=0,
    aggfunc='sum',
    margins=True,
)

# Remove the 'All' totals row by label (instead of assuming it sorts first and
# slicing it off positionally), then show the ten authors with the most reviews.
(
    NonFictionAuthorReivews
    .drop(index='All')
    .sort_values(by=('Reviews', 'All'), ascending=False)
    .head(10)
)

In [None]:
# Ten Fiction authors with the highest summed Reviews.
fiction_author_totals = (
    Fiction[['Genre', 'Author', 'Reviews']]
    .groupby(['Genre', 'Author'])
    .sum()
)
fiction_author_totals.sort_values(by='Reviews', ascending=False).head(10)

In [None]:
# Ten Non Fiction authors with the highest summed Reviews.
nonfiction_author_totals = (
    NonFiction[['Genre', 'Author', 'Reviews']]
    .groupby(['Genre', 'Author'])
    .sum()
)
nonfiction_author_totals.sort_values(by='Reviews', ascending=False).head(10)

**Most Reviewed Fiction Titles ('Names')**

In [None]:
# Fifteen Fiction (Author, Name) pairs with the highest summed Reviews.
fiction_title_totals = (
    Fiction[['Author', 'Name', 'Reviews']]
    .groupby(['Author', 'Name'])
    .sum()
)
fiction_title_totals.sort_values(by='Reviews', ascending=False).head(15)

**Most Reviewed NonFiction Titles ('Names')**

In [None]:
# Sixteen Non Fiction (Author, Name) pairs with the highest summed Reviews.
nonfiction_title_totals = (
    NonFiction[['Author', 'Name', 'Reviews']]
    .groupby(['Author', 'Name'])
    .sum()
)
nonfiction_title_totals.sort_values(by='Reviews', ascending=False).head(16)

**Average Book Price per Year**

In [None]:
# Mean Price for each (Genre, Year) pair, with Genre moved into the columns so
# that each row is a Year.
price_means = df[['Genre', 'Year', 'Price']].groupby(['Genre', 'Year']).mean()
AvgPricebyGenre = price_means.unstack(level='Genre')
AvgPricebyGenre

In [None]:
# Average the yearly mean prices down each column of the table
# (one value per Genre column).
AvgPricebyGenre.mean()

In [None]:
# Bar chart of the average price table: one group of bars per Year row.
AvgPricebyGenre.plot(
    kind='bar',
    title='Average Price by Genre by Year',
    ylabel='Price in USD',
)
plt.show()

**Average User Rating Distribution by Genre**

In [None]:
# Overlay the User Rating histograms of the two genre subsets on a single axes.
fig, ax = plt.subplots(tight_layout=True)
for genre_frame, genre_label in ((Fiction, 'Fiction'), (NonFiction, 'NonFiction')):
    ax.hist(genre_frame['User Rating'], alpha=0.7, label=genre_label)
ax.set(
    title='Distribution of Average User Rating by Genre',
    xlabel='User Rating',
    ylabel='User Rating Count',
)
ax.legend(loc='best')
plt.show()

**Most Expensive Books (with Their Authors)**

In [None]:
# The ten highest-priced rows in the frame.
rows_by_price_desc = df.sort_values(by='Price', ascending=False)
rows_by_price_desc.head(10)