In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'whitegrid' style to the plots drawn after this point.
sns.set_style('whitegrid')

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# **Lecture du dataset :**

In [None]:
# Location of the CSV file, kept in one named constant so it is easy to change.
DATASET_PATH = '../input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv'

# Read the CSV into the `amazon` DataFrame used by every later cell.
amazon = pd.read_csv(DATASET_PATH)

In [None]:
# Preview the first rows of the dataset.
display(amazon.head())

# Heatmap of the boolean missing-value mask: any highlighted cell is a null.
sns.heatmap(amazon.isnull())
plt.show()

print(amazon.shape, "\n")

# DataFrame.info() writes its report to stdout and returns None, so it must be
# called directly; wrapping it in print() would add a stray "None" line.
amazon.info()
print()

# Summary statistics of the numeric columns, shown with rich display.
display(amazon.describe())

In [None]:
# Show the column labels of the dataset.
amazon.columns

In [None]:
# Number of rows per book title, most frequent first.
name_counts = amazon['Name'].value_counts()
print("Name:", '\n', name_counts)

Il semblerait qu'il y ait des doublons dans le dataset. Clarifions la situation.

In [None]:
# Titles that appear several times in the 'Name' column; show their rows to
# see what distinguishes the repeated entries.
repeated_titles = [
    'Publication Manual of the American Psychological Association, 6th Edition',
    'StrengthsFinder 2.0',
    "Oh, the Places You'll Go!",
]
for title in repeated_titles:
    display(amazon[amazon['Name'] == title].head())

# Drop rows that are identical across all columns and report the shape before
# and after. Reassigning (instead of inplace=True) keeps the cell safe to re-run.
print("shape of the dataset before dropping duplicates is : {}".format(amazon.shape))
amazon = amazon.drop_duplicates()
print("shape of the dataset after dropping duplicates is : {}".format(amazon.shape))

Il y a effectivement des titres répétés dans le dataset ; cependant, ce sont des livres qui ont été publiés à différentes reprises (différentes versions du même livre).
La fonction `drop_duplicates` n'a retiré aucune ligne : on considère donc le dataset comme étant sans doublon.

In [None]:
# Number of rows per genre, as text and as a bar chart.
print("Genre:", '\n', amazon['Genre'].value_counts())

# The column must be passed as x=...: recent seaborn releases accept only
# `data` positionally, so a positional 'Genre' collides with data=amazon.
sns.countplot(x='Genre', data=amazon, palette='Set3')
plt.show()

**Données chiffrées (Prix, Reviews et User Ratings) :**

In [None]:
numbers = ['User Rating', 'Reviews', 'Price']

# One histogram per numeric column.
amazon[numbers].hist(color='salmon', figsize=(20, 10), edgecolor='black', bins=10)
plt.show()


def bin_counts(values, bins=10):
    """Count how many entries of ``values`` fall into each equal-width bin.

    Parameters
    ----------
    values : pandas.Series
        Named numeric column to bin.
    bins : int, default 10
        Number of equal-width bins passed to ``pd.cut``.

    Returns
    -------
    pandas.DataFrame
        Two columns: the bin interval (named after ``values``) and the number
        of entries in it (named ``"<name> Count"``), one row per bin.
    """
    binned = pd.cut(values, bins=bins)
    # observed=False is spelled out so that empty bins are kept with a count
    # of 0; the tables concatenated below then always have the same length.
    counts = binned.groupby(binned, observed=False).size()
    return counts.reset_index(name='{} Count'.format(values.name))


rating_binned = bin_counts(amazon['User Rating'])
Reviews_binned = bin_counts(amazon['Reviews'])
Price_binned = bin_counts(amazon['Price'])

# Show the three bin tables side by side.
display(pd.concat([rating_binned, Reviews_binned, Price_binned], axis=1))

Les années de publication :

In [None]:
# Number of rows per year, as text and as a bar chart.
print("Year:", '\n', amazon['Year'].value_counts())

# The column must be passed as x=...: recent seaborn releases accept only
# `data` positionally, so a positional 'Year' collides with data=amazon.
sns.countplot(x='Year', data=amazon, palette='Set3')
plt.show()

Le dataset ne montre pas de défauts, il y a bien 50 livres par année dans ce dataset, ce qui le rend très homogène.

# **Analyse des données :**

Performance du "Genre" par rapport aux "User Rating" :

In [None]:
# Mean user rating per year, overall and per genre. The means are computed
# once here and reused for both the plots and the summary table.
time = pd.DataFrame(amazon.groupby('Year')['User Rating'].mean())
time_genre = pd.DataFrame(amazon.groupby(['Genre', 'Year'])['User Rating'].mean())

rating = time.reset_index()
rating_genre = time_genre.reset_index()
years = rating['Year'].tolist()

fig, ax = plt.subplots(1, 2, figsize=(14, 5))

# The data is already aggregated, so estimator=None draws the points as they
# are (no confidence band to disable). `markers=True` only has an effect
# together with `style`, hence the explicit matplotlib marker.
sns.lineplot(x='Year', y='User Rating', data=rating, estimator=None, marker='o', ax=ax[0])
ax[0].set_xticks(years)

# hue_order follows the order of appearance in the raw data, which is the
# order seaborn would pick when plotting `amazon` directly.
sns.lineplot(x='Year', y='User Rating', hue='Genre', hue_order=list(amazon['Genre'].unique()),
             data=rating_genre, estimator=None, marker='o', ax=ax[1])
ax[1].set_xticks(years)

plt.show()

# Attach the per-genre means as Series indexed by Year so that they are aligned
# on the year rather than by position (a genre missing a year yields NaN
# instead of a length-mismatch error).
time['Rating Fiction'] = time_genre.xs('Fiction', level='Genre')['User Rating'].round(3)
time['Rating Non Fiction'] = time_genre.xs('Non Fiction', level='Genre')['User Rating'].round(3)
display(time)

Analyse des prix :

In [None]:
# Mean price per year, overall and per genre. The means are computed once here
# and reused for both the plots and the summary table.
time = pd.DataFrame(amazon.groupby('Year')['Price'].mean())
time_genre = pd.DataFrame(amazon.groupby(['Genre', 'Year'])['Price'].mean())

price = time.reset_index()
price_genre = time_genre.reset_index()
years = price['Year'].tolist()

fig, ax = plt.subplots(1, 2, figsize=(20, 5))

# The data is already aggregated, so estimator=None draws the points as they
# are (no confidence band to disable). `markers=True` only has an effect
# together with `style`, hence the explicit matplotlib marker.
sns.lineplot(x='Year', y='Price', data=price, estimator=None, marker='o', ax=ax[0])
ax[0].set_xticks(years)

# hue_order follows the order of appearance in the raw data, which is the
# order seaborn would pick when plotting `amazon` directly.
sns.lineplot(x='Year', y='Price', hue='Genre', hue_order=list(amazon['Genre'].unique()),
             data=price_genre, estimator=None, marker='o', ax=ax[1])
ax[1].set_xticks(years)

plt.show()

# Attach the per-genre means as Series indexed by Year so that they are aligned
# on the year rather than by position (a genre missing a year yields NaN
# instead of a length-mismatch error).
time['Price Fiction'] = time_genre.xs('Fiction', level='Genre')['Price'].round(3)
time['Price Non Fiction'] = time_genre.xs('Non Fiction', level='Genre')['Price'].round(3)
display(time)

Corrélation entre le "Prix" et les "User Rating" :

In [None]:
# Yearly means of the two quantities compared on this chart.
mean_rating = amazon.groupby('Year')['User Rating'].mean()
mean_price = amazon.groupby('Year')['Price'].mean()

fig, ax = plt.subplots(figsize=(7, 5))
ax.set_title('Average Price and Average User Rating')

# Left y-axis: average user rating.
ax.plot(mean_rating)
ax.set_ylabel('User Rating', color='blue')
ax.tick_params('y', colors='blue')

# Right y-axis, sharing the same x-axis: average price.
ax2 = ax.twinx()
ax2.plot(mean_price, color='darkorange')
ax2.set_ylabel('Price', color='darkorange')
ax2.tick_params('y', colors='darkorange')

# One tick per year present in the dataset.
year_ticks = amazon['Year'].value_counts(ascending=True).index
ax.set_xticks(ticks=year_ticks)

plt.show()

In [None]:
def plot_price_and_rating(axis, books, title, year_ticks):
    """Plot the yearly average user rating and price of ``books`` on ``axis``.

    The rating curve uses the left y-axis of ``axis``; the price curve is drawn
    in dark orange on a twin y-axis sharing the same x-axis.

    Parameters
    ----------
    axis : matplotlib.axes.Axes
        Axes receiving the rating curve and the title.
    books : pandas.DataFrame
        Rows to aggregate; must contain 'Year', 'User Rating' and 'Price'.
    title : str
        Title of the panel.
    year_ticks : array-like
        Positions of the x-axis ticks.

    Returns
    -------
    matplotlib.axes.Axes
        The twin axes created for the price curve.
    """
    axis.set_title(title)

    axis.plot(books.groupby('Year')['User Rating'].mean())
    axis.tick_params('y', colors='blue')
    axis.set_ylabel('User Rating', color='blue')

    price_axis = axis.twinx()
    price_axis.plot(books.groupby('Year')['Price'].mean(), color='darkorange')
    price_axis.tick_params('y', colors='darkorange')
    price_axis.set_ylabel('Price', color='darkorange')

    axis.set_xticks(ticks=year_ticks)
    return price_axis


fig, ax = plt.subplots(1, 2, figsize=(20, 5))

fiction = amazon[amazon['Genre'] == 'Fiction']
Nonfiction = amazon[amazon['Genre'] == 'Non Fiction']

# Ticks come from the full dataset so both panels show the same years.
year_ticks = amazon['Year'].value_counts(ascending=True).index

ax2 = plot_price_and_rating(ax[0], fiction, 'Average Price and Average User Rating (Fiction)', year_ticks)
ax3 = plot_price_and_rating(ax[1], Nonfiction, 'Average Price and Average User Rating (Non Fiction)', year_ticks)

plt.show()

In [None]:
# Correlate the numeric columns only: since pandas 2.0, DataFrame.corr() no
# longer drops text columns silently and raises on them instead.
correlations = amazon.select_dtypes(include='number').corr()

# Diverging palette on a fixed [-1, 1] scale so 0 sits at the neutral colour.
sns.heatmap(correlations, vmin=-1, vmax=1, cmap=sns.diverging_palette(20, 220, as_cmap=True), annot=True)
plt.show()

# Quel livre a le meilleur ratio User Rating/Price ?

In [None]:
# Price paid per rating point: the lower the value, the better the deal.
price_per_rating = amazon['Price'] / amazon['User Rating']
amazon['Price/Rating'] = price_per_rating

# Ten rows with the lowest price per rating point.
amazon.sort_values(by='Price/Rating').head(10)

On remarque qu'il existe des livres gratuits dans la liste. On va les retirer et les analyser séparément afin qu'ils ne faussent pas les résultats. Ici, on se concentre sur les livres payants.

In [None]:
# Keep only the books with a non-zero price, then show the ten with the lowest
# 'Price/Rating' value (that column is added to `amazon` by an earlier cell).
is_paid = amazon['Price'] != 0
paid_books = amazon[is_paid]
paid_books.sort_values(by='Price/Rating').head(10)

# Quels sont les livres gratuits avec les meilleurs User Rating ?

In [None]:
# Keep only the books priced at 0, then show the ten with the highest rating.
is_free = amazon['Price'] == 0
free_books = amazon[is_free]
free_books.sort_values(by='User Rating', ascending=False).head(10)

# Relation entre le nombre de reviews et le Rating des livres :

In [None]:
# Number of reviews per rating point, stored as a new column.
reviews_per_rating = amazon['Reviews'] / amazon['User Rating']
amazon['Reviews/Rating'] = reviews_per_rating

# Whole table ordered from the highest to the lowest ratio.
amazon.sort_values(by='Reviews/Rating', ascending=False)