<h1 align='center'> User Rating Exploratory Data Analysis: <br> Amazon Top 50 Bestselling Books 2009 - 2019 </h1>

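This notebook performs exploratory data analysis on the `User Rating` column of the Amazon Top 50 Bestselling Books dataset: first its overall distribution, then how that distribution varies by genre, year, price quartile, and review-count quartile.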

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Colours used for the overlaid plots. The hex codes are written in their
# listed order and the trailing slice flips them, so '#009473' ends up first
# and '#F5DF4D' last.
palette = [
    '#F5DF4D', '#939597', '#0F4C81', '#FF6F61',
    '#5F4B8B', '#88B04B', '#92A8D1', '#F7CAC9',
    '#955251', '#B163A3', '#009473',
][::-1]

## Import Data

In [None]:
# Load the bestsellers CSV into the working DataFrame and preview the first rows
df = pd.read_csv(
    filepath_or_buffer='../input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv'
)
df.head(n=5)

### Examine Basic Properties

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); by default
# describe() reports on the numeric columns only
df.describe()

In [None]:
# Column names, dtypes and non-null counts - used to spot columns that need retyping
df.info()

### Retype Columns as Needed and Reexamine

In [None]:
# Parse the year values as datetimes so the `.dt` accessor can be used on the
# column in the plotting cells below.
df['Year'] = pd.to_datetime(df['Year'], format='%Y')

# info() writes its report to stdout itself and returns None, so call it
# directly: wrapping it in print() appends a stray "None" line to the output.
df.info()
df.head()

## Visualization

### Histograms

In [None]:
# Set figure size (width, height) in inches
plt.figure(figsize = (10, 6))

# Plot a density-normalised histogram (total bar area = 1) using Seaborn
sns.histplot(df['User Rating'], stat='density', binwidth=0.1, color='#F5DF4D')

# Set label for x-axis
plt.xlabel("User Rating (Bin Size = 0.1)", size=12)

# Set label for y-axis. With stat='density' the bar heights are densities
# (proportion of rows per unit of rating), not percentages, so label them as such.
plt.ylabel("Density", size=12)

# Set title for figure
plt.title("Amazon Best Sellers User Ratings", size=24);

In [None]:
# Set figure size (width, height) in inches
plt.figure(figsize = (10, 6))

# Overlay one density-normalised histogram per genre using Seaborn. Each
# histogram carries its own label so the legend entry is tied to the artist
# rather than to the order in which the plots happen to be drawn.
for genre, colour in (('Fiction', '#F5DF4D'), ('Non Fiction', '#939597')):
    sns.histplot(df.loc[df['Genre'] == genre, 'User Rating'], stat='density', binwidth=0.1,
                 color=colour, label=genre)

# Create plot legend from the labels attached above
plt.legend()

# Set label for x-axis
plt.xlabel("User Rating (Bin Size = 0.1)", size=12)

# Set label for y-axis. With stat='density' the bar heights are densities
# (proportion of rows per unit of rating), not percentages.
plt.ylabel("Density", size=12)

# Set title for figure
plt.title("Amazon Best Sellers User Ratings by Genre", size=24);

In [None]:
# Set figure size (width, height) in inches
plt.figure(figsize = (10, 6))

# Integer calendar year of every row; comparing integers avoids relying on
# pandas implicitly parsing a string such as '2009' into a Timestamp.
book_years = df['Year'].dt.year
years = np.sort(book_years.unique())
for i, year in enumerate(years):
    # Plot histogram using Seaborn; the label ties the legend entry to this artist.
    # The modulo keeps the colour lookup valid if there are more years than colours.
    sns.histplot(df.loc[book_years == year, 'User Rating'], stat='density', binwidth=0.1,
                 color=palette[i % len(palette)], alpha=0.5, label=str(year))

# Create plot legend from the labels attached above
plt.legend()

# Set label for x-axis
plt.xlabel("User Rating (Bin Size = 0.1)", size=12)

# Set label for y-axis. With stat='density' the bar heights are densities
# (proportion of rows per unit of rating), not percentages.
plt.ylabel("Density", size=12)

# Set title for figure
plt.title("Amazon Best Sellers User Ratings by Year", size=24);

In [None]:
# Set figure size (width, height) in inches
plt.figure(figsize = (10, 6))

# Integer calendar year of every row; comparing integers avoids relying on
# pandas implicitly parsing a string such as '2009' into a Timestamp.
book_years = df['Year'].dt.year
years = np.sort(book_years.unique())

# Only the first five years are drawn; `i` is still the position in the full
# year list, so each year gets the same colour as in the all-years plot.
for i, year in enumerate(years[:5]):
    # Plot histogram using Seaborn; the label ties the legend entry to this artist
    sns.histplot(df.loc[book_years == year, 'User Rating'], stat='density', binwidth=0.1,
                 color=palette[i % len(palette)], label=str(year))

# Create plot legend from the labels attached above
plt.legend()

# Set label for x-axis
plt.xlabel("User Rating (Bin Size = 0.1)", size=12)

# Set label for y-axis. With stat='density' the bar heights are densities
# (proportion of rows per unit of rating), not percentages.
plt.ylabel("Density", size=12)

# Set title for figure
plt.title("Amazon Best Sellers User Ratings by Year 2009-2013", size=24);

In [None]:
# Set figure size (width, height) in inches
plt.figure(figsize = (10, 6))

# Integer calendar year of every row; comparing integers avoids relying on
# pandas implicitly parsing a string such as '2014' into a Timestamp.
book_years = df['Year'].dt.year
years = np.sort(book_years.unique())

# Only positions 5-9 of the sorted year list are drawn; start=5 keeps `i` equal
# to the position in the full list so colours match the all-years plot.
for i, year in enumerate(years[5:10], start=5):
    # Plot histogram using Seaborn; the label ties the legend entry to this artist
    sns.histplot(df.loc[book_years == year, 'User Rating'], stat='density', binwidth=0.1,
                 color=palette[i % len(palette)], label=str(year))

# Create plot legend from the labels attached above
plt.legend()

# Set label for x-axis
plt.xlabel("User Rating (Bin Size = 0.1)", size=12)

# Set label for y-axis. With stat='density' the bar heights are densities
# (proportion of rows per unit of rating), not percentages.
plt.ylabel("Density", size=12)

# Set title for figure
plt.title("Amazon Best Sellers User Ratings by Year 2014-2018", size=24);

In [None]:
# Set figure size (width, height) in inches
plt.figure(figsize = (10, 6))

# Integer calendar year of every row; comparing integers avoids relying on
# pandas implicitly parsing a string such as '2017' into a Timestamp.
book_years = df['Year'].dt.year
years = np.sort(book_years.unique())

# Positions 8-10 of the sorted year list (three years, matching the title).
# The previous bounds of 8..12 ran past the end of the list and only worked
# because slicing and enumerate() silently clamp.
for i, year in enumerate(years[8:11], start=8):
    # Plot histogram using Seaborn; the label ties the legend entry to this artist
    sns.histplot(df.loc[book_years == year, 'User Rating'], stat='density', binwidth=0.1,
                 color=palette[i % len(palette)], label=str(year))

# Create plot legend from the labels attached above
plt.legend()

# Set label for x-axis
plt.xlabel("User Rating (Bin Size = 0.1)", size=12)

# Set label for y-axis. With stat='density' the bar heights are densities
# (proportion of rows per unit of rating), not percentages.
plt.ylabel("Density", size=12)

# Set title for figure
plt.title("Amazon Best Sellers User Ratings by Year 2017-2019", size=24);

In [None]:
# Summary statistics for Price alone; the double brackets keep it a one-column
# DataFrame so the result renders as a table
df[['Price']].describe()

In [None]:
# Bin Price into four equal-frequency groups; labels=False stores the integer
# bin number (0 = lowest prices) instead of an interval label.
df['Price Quantile'] = pd.qcut(df['Price'], 4, labels=False)

# Set figure size (width, height) in inches
plt.figure(figsize = (10, 6))

prices = np.sort(df['Price Quantile'].unique())
for i, price in enumerate(prices):
    # Plot histogram using Seaborn; the label ties the legend entry to this artist
    sns.histplot(df.loc[df['Price Quantile'] == price, 'User Rating'], stat='density', binwidth=0.1,
                 color=palette[i % len(palette)], label=str(price))

# Create plot legend from the labels attached above; the title explains what
# the bare bin numbers mean
plt.legend(title="Price quartile (0 = lowest)")

# Set label for x-axis
plt.xlabel("User Rating (Bin Size = 0.1)", size=12)

# Set label for y-axis. With stat='density' the bar heights are densities
# (proportion of rows per unit of rating), not percentages.
plt.ylabel("Density", size=12)

# Set title for figure
plt.title("Amazon Best Sellers User Ratings by Price Quantile", size=24);

In [None]:
# Bin Reviews into four equal-frequency groups; labels=False stores the integer
# bin number (0 = fewest reviews) instead of an interval label.
df['Reviews Quantile'] = pd.qcut(df['Reviews'], 4, labels=False)

# Set figure size (width, height) in inches
plt.figure(figsize = (10, 6))

reviews = np.sort(df['Reviews Quantile'].unique())
for i, review in enumerate(reviews):
    # Plot histogram using Seaborn; the label ties the legend entry to this artist
    sns.histplot(df.loc[df['Reviews Quantile'] == review, 'User Rating'], stat='density', binwidth=0.1,
                 color=palette[i % len(palette)], label=str(review))

# Create plot legend from the labels attached above; the title explains what
# the bare bin numbers mean
plt.legend(title="Reviews quartile (0 = fewest)")

# Set label for x-axis
plt.xlabel("User Rating (Bin Size = 0.1)", size=12)

# Set label for y-axis. With stat='density' the bar heights are densities
# (proportion of rows per unit of rating), not percentages.
plt.ylabel("Density", size=12)

# Set title for figure
plt.title("Amazon Best Sellers User Ratings by Review Quantile", size=24);

In [None]:
# Set figure size (width, height) in inches
plt.figure(figsize = (10, 6))

# Plot boxplot using Seaborn. Passing the ratings as `y` makes the vertical
# orientation explicit, so the ratings run along the y-axis.
sns.boxplot(y=df['User Rating'], color='#F5DF4D')

# Set label for y-axis (the value axis of a vertical box plot; the x-axis
# carries no information for a single box and is left unlabelled)
plt.ylabel("User Rating", size=12)
plt.xlabel("")

# Set title for figure
plt.title("Amazon Best Sellers User Ratings", size=24);

In [None]:
# Box plot of user rating with one box per genre (pandas plotting API)
ax = df.boxplot(column='User Rating', by='Genre', figsize=(10, 6), fontsize=12)

# Label the value axis and blank out the axes title in a single call
ax.set(ylabel='User Rating', title='');

In [None]:
# Replace the datetime 'Year' column with plain integer years before grouping
# the box plot on it. The dtype check makes this cell safe to re-run: once the
# column holds integers, the `.dt` accessor would raise AttributeError.
# NOTE: this still overwrites df['Year'], so earlier cells that use
# df['Year'].dt need the retyping cell to be re-run first.
if pd.api.types.is_datetime64_any_dtype(df['Year']):
    df['Year'] = df['Year'].dt.year

# Box plot of user rating with one box per year; tick labels rotated for legibility
ax = df.boxplot('User Rating', 'Year', figsize=(10,6), fontsize=16, rot=35)
ax.set_ylabel('User Rating')
ax.set_title('');

In [None]:
# Box plot of user rating with one box per price-quantile bin (pandas plotting API)
ax = df.boxplot(column='User Rating', by='Price Quantile', figsize=(10, 6), fontsize=16, rot=35)

# Label the value axis and blank out the axes title in a single call
ax.set(ylabel='User Rating', title='');

In [None]:
# Box plot of user rating with one box per reviews-quantile bin (pandas plotting API)
ax = df.boxplot(column='User Rating', by='Reviews Quantile', figsize=(10, 6), fontsize=16, rot=35)

# Label the value axis and blank out the axes title in a single call
ax.set(ylabel='User Rating', title='');