In [None]:
# import all essential libraries
import numpy as np                      #for numerical manipulation
import pandas as pd                     #for data frame
import matplotlib.pyplot as plt         #for ploting
import seaborn as sns                   #for style and aesthetics of plots
import csv                              #to read csv files


In [None]:
# Load the bestsellers CSV into a DataFrame and summarise it.
bestsellers_csv = '/kaggle/input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv'
data = pd.read_csv(bestsellers_csv)
data.info()        # prints each column's dtype and non-null count
data.describe()    # summary statistics of the numeric columns (shown as the cell output)

In [None]:
# Left panel: histogram of 'User Rating' as raw counts, drawn with the "dark" style.
sns.set_style("dark")
plt.rcParams.update({
    'font.size': 10,
    'figure.figsize': (9, 5),
    'xtick.minor.width': 0.2,
})
plt.subplot(1, 2, 1)
sns.histplot(data, x="User Rating", stat='count')

# Right panel: histogram of 'Price' as probabilities, drawn with the "white" style.
# The style is switched before the second subplot is created so it applies to that panel.
sns.set_style("white")
plt.rcParams.update({
    'font.size': 10,
    'figure.figsize': (9, 5),
})
plt.subplot(1, 2, 2)
sns.histplot(data, x="Price", stat="probability")

# Rating vs. price scatter with a regression fit and marginal distributions.
sns.jointplot(x="User Rating", y="Price", data=data, kind="reg")

In [None]:
# Empirical cumulative distribution of the user ratings.
sns.displot(data, x="User Rating", kind="ecdf")

In [None]:
# User rating against year, one line per genre.
sns.set_palette("pastel")
plt.title("Rate Change per Year")
sns.lineplot(
    x="Year",
    y="User Rating",
    hue="Genre",
    data=data,
    linewidth=3,
)

In [None]:
# Number of reviews against year, one line per genre.
sns.set_palette("pastel")
plt.title("Review Change per Year")
sns.lineplot(
    x="Year",
    y="Reviews",
    hue="Genre",
    data=data,
    linewidth=3,
)

In [None]:
# Price against year, one line per genre.
sns.set_palette("pastel")
plt.title("Price per Year")
sns.lineplot(
    x="Year",
    y="Price",
    hue="Genre",
    data=data,
    linewidth=3,
)

In [None]:
# Pie chart of how many rows fall into each genre.
sns.set_palette("pastel")
genre = data['Genre'].value_counts()      # rows per genre, most frequent first
plt.title("Genre Category")
plt.pie(genre, labels=genre.index, autopct='%0.1f%%', startangle=90)

In [None]:
# 1. Top 10 bestselling books from 2009 to 2019 (overall / fiction / non-fiction)

# Restrict to the 2009-2019 window (bounds inclusive), then split by genre.
dataTop = data.loc[data['Year'].between(2009, 2019)]
NonFict = dataTop.loc[dataTop['Genre'] == 'Non Fiction']
Fict = dataTop.loc[dataTop['Genre'] == 'Fiction']

# value_counts() counts how many rows carry each title; keep the ten most frequent.
Top10 = dataTop['Name'].value_counts().head(10)
Top10NonFict = NonFict['Name'].value_counts().head(10)
Top10Fict = Fict['Name'].value_counts().head(10)

# Horizontal bars: title on the y axis, number of appearances on the x axis.
ax = sns.barplot(x=Top10, y=Top10.index)
ax.set_title('Top 10 Bestselling Books Overall')
ax.set_xlabel('# Bestseller')

In [None]:
# Horizontal bars of the ten most frequent non-fiction titles.
ax = sns.barplot(x=Top10NonFict, y=Top10NonFict.index)
ax.set_title('Top 10 Bestselling Books (Non Fiction)')
ax.set_xlabel('# Bestseller')


In [None]:
# Horizontal bars of the ten most frequent fiction titles.
ax = sns.barplot(x=Top10Fict, y=Top10Fict.index)
ax.set_title('Top 10 Bestselling Books (Fiction)')
ax.set_xlabel('# Bestseller')

In [None]:
# 2. Top 10 bestselling authors from 2009 to 2019 (overall)
# Counts rows per author in the 2009-2019 subset and keeps the ten most frequent.
TopAuthor = dataTop['Author'].value_counts().head(10)
ax = sns.barplot(x=TopAuthor, y=TopAuthor.index)
ax.set_title('Top 10 Bestselling Author')
ax.set_xlabel('# of Books per Author')

In [None]:
# Ten authors with the most rows in the fiction subset.
TopAuthorFict = Fict['Author'].value_counts().head(10)
ax = sns.barplot(x=TopAuthorFict, y=TopAuthorFict.index)
ax.set_title('Top 10 Bestselling Author (Fiction)')
ax.set_xlabel('# of Books per Author')

In [None]:
# Ten authors with the most rows in the non-fiction subset.
TopAuthorNonFict = NonFict['Author'].value_counts().head(10)
ax = sns.barplot(x=TopAuthorNonFict, y=TopAuthorNonFict.index)
ax.set_title('Top 10 Bestselling Author (Non Fiction)')
ax.set_xlabel('# of Books per Author')

In [None]:
# 3. Year-wise percentage category distribution of books

# Number of rows for every (Year, Genre) pair.
genre_counts = data.groupby(['Year', 'Genre'])[['Genre']].count()

# Share of each genre within its own year, in percent (still floating point here).
genre_share = genre_counts / genre_counts.groupby(level=0).sum() * 100

# Flat table of raw counts: columns Year, Genre, Count.
Group = genre_counts.rename(columns={'Genre': 'Count'}).reset_index()

# Flat table of whole-number percentages: columns Year, Genre, Percentage.
# Round BEFORE casting: astype(int) alone truncates, and floating-point results
# such as 29 / 50 * 100 == 57.99999999999999 would be reported as 57 instead of 58.
GroupPct = (
    genre_share
    .rename(columns={'Genre': 'Percentage'})
    .round()
    .astype(int)
    .reset_index()
)

sns.barplot(x=GroupPct.Year, y=GroupPct.Percentage, hue=GroupPct.Genre, edgecolor='black')
plt.title('Year-wise Book Distribution')

In [None]:
# 4. How many unique books and authors were included in bestsellers list from 2009 to 2019?
UniqueBook = len(dataTop['Name'].unique())        # distinct titles in the 2009-2019 subset
UniqueAuthor = len(dataTop['Author'].unique())    # distinct authors in the 2009-2019 subset
print('The number of Unique books is', UniqueBook)
# The label previously said "books" here as well, mislabelling the author count.
print('The number of Unique authors is', UniqueAuthor)

In [None]:
# 5. Most expensive book and most affordable book

# All rows whose price equals the maximum price in the 2009-2019 subset.
highest_price = dataTop['Price'].max()
MaxPrice = dataTop[dataTop['Price'] == highest_price]

# One row per title, reduced to the three columns that get reported.
MaxBook = MaxPrice.drop_duplicates(subset=['Name'], keep='first')
MaxBook = MaxBook.loc[:, ['Name', 'Author', 'Price']]

# Only the first matching book is reported, even if several share the top price.
title, author, price = (MaxBook.iloc[0, col] for col in range(3))
print(f'The most expensive book is {title!s} by {author!s} priced at ${price!s}')


In [None]:
# All rows whose price equals the minimum price in the 2009-2019 subset.
lowest_price = dataTop['Price'].min()
MinPrice = dataTop.loc[dataTop['Price'] == lowest_price]

# One row per title, reduced to name / author / price; every tied book is printed.
MinBook = MinPrice.drop_duplicates(subset=['Name'], keep='first')
MinBook = MinBook.loc[:, ['Name', 'Author', 'Price']]
print('The least expensive book is', MinBook)

In [None]:
# 6. Highest rated and lowest rated books

# All rows whose rating equals the maximum rating in the 2009-2019 subset, one row per title.
highest_rating = dataTop['User Rating'].max()
MaxRate = dataTop[dataTop['User Rating'] == highest_rating]
MaxRate = MaxRate.drop_duplicates(subset=['Name'], keep='first')

# 'User Rating' is the quantity this cell reports, so it is included in the output;
# it is appended last so the positions of Name / Author / Price are unchanged.
MaxRate = MaxRate[['Name', 'Author', 'Price', 'User Rating']]

print(MaxRate)

In [None]:
# All rows whose rating equals the minimum rating in the 2009-2019 subset, one row per title.
lowest_rating = dataTop['User Rating'].min()
MinRate = dataTop[dataTop['User Rating'] == lowest_rating]
MinRate = MinRate.drop_duplicates(subset=['Name'], keep='first')

# 'User Rating' is the quantity this cell reports, so it is included in the output;
# it is appended last so the positions of Name / Author / Price are unchanged.
MinRate = MinRate[['Name', 'Author', 'Price', 'User Rating']]

print(MinRate)

In [None]:
# 7. Does the title length of the book matter to be a bestseller?

# Character count of every title, in the same row order as `data`.
BookList = data.Name.tolist()
Len = [len(title) for title in BookList]

# Build the working table in a single step. The earlier version re-assigned the
# whole Name / Rate / Review columns on every loop iteration, which repeated the
# same full-column copy once per row without changing the result.
BookLenName = pd.DataFrame({
    'Name': data['Name'],
    'Len': Len,
    'Rate': data['User Rating'],
    'Review': data['Reviews'],
})

# Keep one row per distinct title (the first occurrence in `data`).
BookLenName = BookLenName.drop_duplicates(subset=['Name'], keep='first')

# Number of titles for every (rating, title length) combination.
GroupRate = BookLenName.groupby(by=['Rate', 'Len']).count().reset_index()

sns.relplot(data=BookLenName, x='Len', y='Rate')
plt.title('Title Length Vs. Rating')

In [None]:
# Scatter of title length (x) against number of reviews (y).
sns.relplot(x='Len', y='Review', data=BookLenName)
plt.title('Title Length Vs. Review')

In [None]:
# One bar per rating value; bar height aggregates the title lengths for that rating
# (seaborn's default estimator is the mean).
sns.barplot(x='Rate', y='Len', data=BookLenName)
plt.title('Book Length Vs. Rating Count')