In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
import os
import warnings
warnings.filterwarnings('ignore')
# Print the full path of every file found below the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Read the bestsellers CSV into `df` and preview its first rows
data_path = '/kaggle/input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv'
df = pd.read_csv(data_path)
df.head()

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts)
df.info()

# Ratio of fiction to non fiction books

In [None]:
# Count the books per genre and express each count as a percentage of the total
group_genre = (
    df.groupby('Genre')['Name']
    .count()
    .reset_index()
    .rename(columns={'Name': 'Count'})
)
group_genre['Percentage'] = (group_genre['Count'] / group_genre['Count'].sum() * 100).round(2)

# Rows 0 and 1 of the grouped frame, unpacked once for the drawing calls below.
# The hard-coded captions further down treat row 0 as the fiction row.
first_pct, second_pct = group_genre['Percentage'][0], group_genre['Percentage'][1]
first_count, second_count = group_genre['Count'][0], group_genre['Count'][1]
first_genre, second_genre = group_genre['Genre'][0], group_genre['Genre'][1]
total_pct = first_pct + second_pct

# Text styling shared by every label in this figure
big_bold = dict(fontsize=30, fontweight='bold')

# Figure with a single axes filling the whole canvas
fig = plt.figure(figsize=(12,4))
axes1 = fig.add_axes([0,0,1,1])

# One stacked horizontal bar: the full-width bar is drawn first, the row-0 share on top of it
axes1.barh(y=0, width=total_pct, color='MidnightBlue')
axes1.barh(y=0, width=first_pct, color='LightSteelBlue')

# Percentage labels placed inside each segment
axes1.text(first_pct/2.9, 0, f"{first_pct}%", color='Goldenrod', **big_bold)
axes1.text(total_pct/1.57, 0, f"{second_pct}%", color='Goldenrod', **big_bold)

# Absolute counts, written to the right of the bar
axes1.text(120, 0.3, 'Fiction books:', color='LightSteelBlue', **big_bold)
axes1.text(120, 0.2, f"{first_count}", color='LightSteelBlue', **big_bold)
axes1.text(120, 0, 'Non Fiction books:', color='MidnightBlue', **big_bold)
axes1.text(120, -0.1, f"{second_count}", color='MidnightBlue', **big_bold)

# Genre names above the segments
axes1.text(first_pct/2.9, 0.5, f"{first_genre}", color='Black', **big_bold)
axes1.text(total_pct/1.65, 0.5, f"{second_genre}", color='Black', **big_bold)

# Hide ticks, spines and labels
axes1.axis('off')
fig.show()

In [None]:
# Number of fiction / non fiction books per year, drawn as grouped bars.

# fig
fig = plt.figure(figsize=(12,6))

# axes
axes = fig.add_axes([0,0,1,1])

# countplot
# The palette is keyed by genre so the colours no longer depend on which genre
# happens to appear first in the CSV (a positional list follows row order).
genre_palette = {'Non Fiction': 'MidnightBlue', 'Fiction': 'LightSteelBlue'}
sns.countplot(data=df, x='Year', hue='Genre', palette=genre_palette, ax=axes)

# spines
axes.spines[['right', 'top', 'left', 'bottom']].set_visible(False)

# ticks
axes.set_yticks([])

# legend
axes.get_legend().remove()

# bar's annotate
for bar in axes.patches:
    bar_width = bar.get_width()
    # Newer seaborn releases add zero-size proxy rectangles to `axes.patches`
    # for the legend; they are not bars and must not be labelled.
    if bar_width == 0:
        continue
    bar_height = bar.get_height()
    bar_x, bar_y = bar.get_xy()
    axes.annotate('{:.0f}'.format(bar_height), (bar_x + bar_width/2, bar_y + bar_height*1.02), ha='center')

# labels
axes.set_xlabel('Years', fontsize=16, color='black')
axes.set_ylabel('')

# title (drawn as free text in data coordinates)
axes.text(1.3, 35, 'Number of fiction and non fiction books by year', color='Black', fontsize=24, fontweight='bold')

# conclusion (hard-coded observations; text colour matches the genre colour)
axes.text(12, 25, 'Maximum books was in:', fontsize=16, color='black', fontweight='bold')
axes.text(12, 23.5, '2014;', fontsize=14, color='LightSteelBlue', fontweight='bold')
axes.text(12, 22, '2015.', fontsize=14, color='MidnightBlue', fontweight='bold')
axes.text(12, 20.5, 'Minimum books was in:', fontsize=16, color='black', fontweight='bold')
axes.text(12, 19, '2015;', fontsize=14, color='LightSteelBlue', fontweight='bold')
axes.text(12, 17.5, '2014.', fontsize=14, color='MidnightBlue', fontweight='bold')
fig.show()

# Price by genre

In [None]:
# Price distribution (KDE) per genre, one panel each, with the mean / median /
# min / max marked and a short textual summary.

# Price series per genre, computed once and reused for every artist below
fiction_price = df.loc[df['Genre'] == 'Fiction', 'Price']
non_fiction_price = df.loc[df['Genre'] == 'Non Fiction', 'Price']

# fig
fig = plt.figure(figsize=(12, 4))

# axes (the second panel is placed below the first one, outside the nominal canvas)
axes1 = fig.add_axes([0,0,1,1])
axes2 = fig.add_axes([0,-1.3,1,1])

# Per-panel layout: everything that differs between the two plots lives here.
#   marker_y  - height of the scatter markers
#   label_y   - height of the 'mean'/'median'/... captions and of the title
#   label_dx  - horizontal nudge of each caption relative to its statistic
#   note_x/ys - position of the five summary lines
price_panels = [
    dict(ax=axes1, prices=fiction_price, color='LightSteelBlue',
         marker_y=0.072, label_y=0.075,
         label_dx={'mean': 0, 'median': -6, 'min': -3, 'max': -0.8},
         title_x=30, title='Price of fiction books',
         note_x=40, note_ys=[0.055, 0.05, 0.045, 0.04, 0.035],
         range_note='The largest number of books in range: 3 - 15;'),
    dict(ax=axes2, prices=non_fiction_price, color='MidnightBlue',
         marker_y=0.053, label_y=0.056,
         label_dx={'mean': 0, 'median': -8, 'min': -3.2, 'max': -0.8},
         title_x=37, title='Price of non fiction books',
         note_x=50, note_ys=[0.041, 0.037, 0.033, 0.029, 0.025],
         range_note='The largest number of books in range: 5 - 20;'),
]

for panel in price_panels:
    ax = panel['ax']
    prices = panel['prices']
    # Insertion order (mean, median, min, max) is the drawing order of the guides
    price_stats = {
        'mean': prices.mean(),
        'median': prices.median(),
        'min': prices.min(),
        'max': prices.max(),
    }

    # kde (`fill` replaces the deprecated `shade` argument; passing both is redundant)
    sns.kdeplot(prices, color=panel['color'], fill=True, alpha=0.9, ax=ax)

    # scatter: one marker per statistic, all at the same height
    ax.scatter(x=[price_stats['mean'], price_stats['median'], price_stats['max'], price_stats['min']],
               y=[panel['marker_y']] * 4, color='Goldenrod', lw=3)

    # dotted guide line and caption for each statistic
    for stat_name, stat_value in price_stats.items():
        ax.axvline(stat_value, linestyle=':', color='Goldenrod', ymin=0, ymax=0.9, lw=4)
        ax.text(stat_value + panel['label_dx'][stat_name], panel['label_y'], stat_name,
                color='black', fontsize=14, fontweight='bold')

    # spines
    ax.spines[['right', 'top', 'left', 'bottom']].set_visible(False)

    # labels
    ax.set_xlabel('Price', fontsize=16, color='black')
    ax.set_ylabel('')

    # ticks: every 5 price units starting at the minimum (range() needs integer prices)
    ax.set_xticks(list(range(price_stats['min'], price_stats['max'], 5)))

    # title
    ax.text(panel['title_x'], panel['label_y'], panel['title'], fontsize=24, color='black', fontweight='bold')

    # conclusion (the mean is rounded to 2 decimals in both panels)
    summary_lines = [
        panel['range_note'],
        f"Minimum price - {price_stats['min']};",
        f"Maximum price - {price_stats['max']};",
        f"Mean price - {round(price_stats['mean'], 2)};",
        f"Median - {price_stats['median']}.",
    ]
    for note_y, line in zip(panel['note_ys'], summary_lines):
        ax.text(panel['note_x'], note_y, line, color='black', fontsize=14)

plt.show()

In [None]:
# Mean price per year for fiction and non fiction books, drawn as grouped bars.

# create list with years
years = range(2009,2020, 1)

# create list with mean price for fiction and non fiction
# (one groupby per genre; reindex keeps exactly one entry per year, NaN if a year has no books)
mean_price_fiction_books = (
    df[df['Genre']=='Fiction'].groupby('Year')['Price'].mean().reindex(years).tolist()
)
mean_price_non_fiction_books = (
    df[df['Genre']=='Non Fiction'].groupby('Year')['Price'].mean().reindex(years).tolist()
)

# create df with years and prices
mean_price_fiction_books_df = pd.DataFrame({'Year':years, 'Mean_price':mean_price_fiction_books})
mean_price_non_fiction_books_df = pd.DataFrame({'Year':years, 'Mean_price':mean_price_non_fiction_books})

# fig
fig = plt.figure(figsize=(12,6))

# axes
axes = fig.add_axes([0,0,1,1])

# for bar's width
x = np.arange(len(years))
width = 0.35

# bar (fiction on the left of each tick, non fiction on the right)
axes.bar(x - width/2, height=mean_price_fiction_books_df['Mean_price'], color='LightSteelBlue', width=width)
axes.bar(x + width/2, height=mean_price_non_fiction_books_df['Mean_price'], color='Midnightblue', width=width)

# ticks: fix the tick positions first, then attach one label per position
axes.set_xticks(x)
axes.set_xticklabels(list(years))
axes.set_yticks([])

# labels
axes.set_xlabel('Years', fontsize=16, color='black')
axes.set_ylabel('')

# spines
axes.spines[['right', 'top', 'left', 'bottom']].set_visible(False)

# bar's annotate (dedicated loop names so `x` and `width` above are not overwritten)
for bar in axes.patches:
    bar_width = bar.get_width()
    bar_height = bar.get_height()
    bar_x, bar_y = bar.get_xy()
    axes.annotate('{:.1f}'.format(bar_height), (bar_x + bar_width/2, bar_y + bar_height*1.02),
                  ha='center', fontsize=12, color='black')

# title (drawn as free text in data coordinates)
axes.text(5.6, 21, 'Mean prices of fiction and', color='Black', fontsize=24, fontweight='bold')
axes.text(5.6, 19.5, 'non fiction books by year', color='Black', fontsize=24, fontweight='bold')

# conclusion (hard-coded observations; text colour matches the genre colour)
axes.text(12.5, 18, 'Maximum mean price was in:', fontsize=16, color='black', fontweight='bold')
axes.text(12.5, 17, '2009;', fontsize=14, color='LightSteelBlue', fontweight='bold')
axes.text(12.5, 16, '2014.', fontsize=14, color='MidnightBlue', fontweight='bold')
axes.text(12.5, 14, 'Minimum mean price was in:', fontsize=16, color='black', fontweight='bold')
axes.text(12.5, 13, '2017 and 2018;', fontsize=14, color='LightSteelBlue', fontweight='bold')
axes.text(12.5, 12, '2019.', fontsize=14, color='MidnightBlue', fontweight='bold')

fig.show()

**We observe strong variability in the indicators. Increases and decreases in the mean price of fiction and non-fiction books do not always occur in the same periods.**

**The mean price of non fiction books is higher in all years except 2009.**

In [None]:
# Maximum price per year for fiction and non fiction books, drawn as grouped bars.

# create list with years (defined here so the cell does not rely on an earlier one)
years = range(2009,2020, 1)

# create list with max price for fiction and non fiction
# (one groupby per genre; no temporary named `max`, which would shadow the builtin)
max_price_fiction_books = (
    df[df['Genre']=='Fiction'].groupby('Year')['Price'].max().reindex(years).tolist()
)
max_price_non_fiction_books = (
    df[df['Genre']=='Non Fiction'].groupby('Year')['Price'].max().reindex(years).tolist()
)

# create df with years and prices
max_price_fiction_books_df = pd.DataFrame({'Year':years, 'Max_price':max_price_fiction_books})
max_price_non_fiction_books_df = pd.DataFrame({'Year':years, 'Max_price':max_price_non_fiction_books})

# fig
fig = plt.figure(figsize=(12,6))

# axes
axes = fig.add_axes([0,0,1,1])

# for bar's width
x = np.arange(len(years))
width = 0.35

# bar (fiction on the left of each tick, non fiction on the right)
axes.bar(x - width/2, height=max_price_fiction_books_df['Max_price'], color='LightSteelBlue', width=width)
axes.bar(x + width/2, height=max_price_non_fiction_books_df['Max_price'], color='MidnightBlue', width=width)

# ticks: fix the tick positions first, then attach one label per position
axes.set_xticks(x)
axes.set_xticklabels(list(years))
axes.set_yticks([])

# labels
axes.set_xlabel('Years', fontsize=16, color='black')
axes.set_ylabel('')

# spines
axes.spines[['right', 'top', 'left', 'bottom']].set_visible(False)

# bar's annotate (dedicated loop names so `x` and `width` above are not overwritten)
for bar in axes.patches:
    bar_width = bar.get_width()
    bar_height = bar.get_height()
    bar_x, bar_y = bar.get_xy()
    axes.annotate('{:.1f}'.format(bar_height), (bar_x + bar_width/2, bar_y + bar_height*1.02),
                  ha='center', fontsize=12, color='black')

# title (drawn as free text in data coordinates)
axes.text(5.6, 105, 'Max prices of fiction and', color='Black', fontsize=24, fontweight='bold')
axes.text(5.6, 98, 'non fiction books by year', color='Black', fontsize=24, fontweight='bold')

# conclusion (hard-coded observations; text colour matches the genre colour)
axes.text(12.5, 80, 'Maximum max price was in:', fontsize=16, color='black', fontweight='bold')
axes.text(12.5, 75, '2009;', fontsize=14, color='LightSteelBlue', fontweight='bold')
axes.text(12.5, 70, '2013 and 2014.', fontsize=14, color='MidnightBlue', fontweight='bold')
axes.text(12.5, 65, 'Minimum max price was in:', fontsize=16, color='black', fontweight='bold')
axes.text(12.5, 60, '2015;', fontsize=14, color='LightSteelBlue', fontweight='bold')
axes.text(12.5, 55, '2019.', fontsize=14, color='MidnightBlue', fontweight='bold')
fig.show()

**We also observe high variability in the indicators.**

**Max price of non fiction books is higher in all years except 2009 and 2019.**

**I want to know more about the books with the highest price.**

In [None]:
# Fiction rows whose price equals 82, selected with one combined boolean mask
fiction_at_82 = (df['Genre'] == 'Fiction') & (df['Price'] == 82)
df[fiction_at_82]

**About author**: Stephenie Meyer graduated from Brigham Young University with a degree in English literature, and she lives with her husband and three sons in Arizona.

**About book**: Deeply romantic and extraordinarily suspenseful, Twilight, New Moon, Eclipse, and Breaking Dawn capture the struggle between defying our instincts and satisfying our desires.

**(c) Amazon**

In [None]:
# Non fiction rows whose price equals 105, selected with one combined boolean mask
non_fiction_at_105 = (df['Genre'] == 'Non Fiction') & (df['Price'] == 105)
df[non_fiction_at_105]

**About author**: The American Psychiatric Association (APA) is the main professional organization of psychiatrists and trainee psychiatrists in the United States, and the largest psychiatric organization in the world.

**(c) Wikipedia**

**About book**: This new edition of Diagnostic and Statistical Manual of Mental Disorders (DSM-5), used by clinicians and researchers to diagnose and classify mental disorders, is the product of more than 10 years of effort by hundreds of international experts in all aspects of mental health. Their dedication and hard work have yielded an authoritative volume that defines and classifies mental disorders in order to improve diagnoses, treatment, and research.

**(c) Amazon**

**And so, the analysis shows that the price of non fiction books is higher than the price of fiction books.**

**But I want to know if genre is a statistically significant feature for price. The data are not normally distributed, so I will use the Mann-Whitney U test.**

**H0 - The differences are not statistically significant and are random.**

In [None]:
# Mann-Whitney U test on price: fiction vs non fiction.
# H0: the two price samples come from the same distribution.
fiction_price = df.loc[df['Genre'] == 'Fiction', 'Price']
non_fiction_price = df.loc[df['Genre'] == 'Non Fiction', 'Price']

# `alternative` is spelled out because the default has differed between SciPy
# releases; H0 as stated above calls for the two-sided test.
stat, p = stats.mannwhitneyu(fiction_price, non_fiction_price, alternative='two-sided')
print('p=%.8f' % (p))
print()

# Significance level for the decision
alpha = 0.05
if p > alpha:
    print('Same distribution (fail to reject H0)')
else:
    print('Different distribution (reject H0)')

**H0 is rejected, so we can say that genre is a statistically significant feature for price. Let's visualize the prices of fiction and non fiction books with a boxplot.**

In [None]:
# Horizontal boxplot of price, one box per genre.

# Figure with a single axes filling the whole canvas
fig = plt.figure(figsize=(14,6))
axes = fig.add_axes([0,0,1,1])

# One box per genre; colours are assigned to the genres in plotting order
box_colors = ['MidnightBlue', 'LightSteelBlue']
sns.boxplot(x=df['Price'], y=df['Genre'], palette=box_colors, ax=axes)

# Remove the frame around the plot, one side at a time
for side in ('right', 'top', 'left', 'bottom'):
    axes.spines[side].set_visible(False)

# Axis labels: keep the x label, blank out the y label
axes.set_xlabel('Price', fontsize=16, color='black')
axes.set_ylabel('')

# Title drawn as free text in data coordinates
axes.text(40, -0.5, 'Price by genre', color='Black', fontsize=24, fontweight='bold')

fig.show()

**Boxplot shows the differences between prices of fiction and non fiction books**

**Finally, to wrap up the price analysis, I want to visualize the most expensive books by genre.**

In [None]:
# Two tables listing the most expensive fiction and non fiction books.

# top 5 per genre (DataFrame.head() returns 5 rows by default)
table_columns = ['Author', 'Name', 'Price']
top_fiction = (
    df[df['Genre']=='Fiction']
    .sort_values(by='Price', ascending=False)
    .head()[table_columns]
    .reset_index(drop=True)
)
top_non_fiction = (
    df[df['Genre']=='Non Fiction']
    .sort_values(by='Price', ascending=False)
    .head()[table_columns]
    .reset_index(drop=True)
)

# create lists for tables
# Plain lists of strings: the table reads cells by integer position, which is
# deprecated on a label-indexed pandas Series.
list_fiction = top_fiction.astype(str).values.tolist()
list_non_fiction = top_non_fiction.astype(str).values.tolist()

# color list (one white row, repeated once per table row below)
color_list=[['white', 'white', 'white']]

# fig
fig = plt.figure()

# axes (the second table is placed below the first one, outside the nominal canvas)
axes1 = fig.add_axes([0, 0, 1, 1])
axes2 = fig.add_axes([0, -1, 1, 1])

# tables
table1=axes1.table(cellColours=color_list*len(list_fiction), cellText=list_fiction, cellLoc='left',
                   loc='upper left', colWidths=[0.5,1.4,0.2],
                   colLabels=list(top_fiction.columns), colColours=['LightSteelBlue']*3)
table1.auto_set_font_size(False)
table1.set_fontsize(16)
table1.scale(1.5, 2.7)
# This table holds the fiction rows, so the title must say "fiction"
axes1.text(0.9, 1.1, 'Top 5 most expensive fiction books', color='black', fontsize=20, fontweight='bold')

table2=axes2.table(cellColours=color_list*len(list_non_fiction), cellText=list_non_fiction, cellLoc='left',
                   loc='upper left', colWidths=[0.5,1.4,0.2],
                   colLabels=list(top_non_fiction.columns), colColours=['MidnightBlue']*3)
table2.auto_set_font_size(False)
table2.set_fontsize(16)
table2.scale(1.5, 2.7)
axes2.text(0.9, 1.1, 'Top 5 most expensive non fiction books', color='black', fontsize=20, fontweight='bold')

# axis
axes1.set_axis_off()
axes2.set_axis_off()

fig.show()