## Introduction

In this notebook we will analyze, clean, and visualize the data.

## Import the relevant libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os

from plotly import express as px, graph_objects as go
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score

## Import data

In [None]:
# Read the bestsellers CSV from the Kaggle input directory and preview the first rows.
books_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv',
)
books_df.head(n=5)

In [None]:
# Report the frame's dimensions and the number of distinct author values,
# one line each.
print(
    'shape of dataframe: {}'.format(books_df.shape),
    'number of authors: {}'.format(len(books_df['Author'].unique())),
    sep='\n',
)

In [None]:
# Print pandas' concise summary of the frame (columns, non-null counts, dtypes).
books_df.info()

## Missing data

In [None]:
# Count missing values per column.
books_df.isna().sum()

## Data Visualization

Number of book genres by year

In [None]:
# One-hot encode the genre, then sum per year to get the number of rows of
# each genre in every year.
genre_year_df = (
    pd.get_dummies(books_df[['Year', 'Genre']])
    .groupby('Year')
    .sum()
)

# One bar trace per genre; the layout below stacks them.
genre_year_fig = go.Figure()
genre_year_fig.add_trace(
    go.Bar(name='Fiction', x=genre_year_df.index, y=genre_year_df['Genre_Fiction'])
)
genre_year_fig.add_trace(
    go.Bar(name='Non Fiction', x=genre_year_df.index, y=genre_year_df['Genre_Non Fiction'])
)
genre_year_fig.update_layout(
    title='Number of book genres by year',
    xaxis_title_text='year',
    yaxis_title_text='count',
    barmode='stack',
)

Author by number of books

In [None]:
# Ten authors with the most rows in the data, in ascending order so the
# largest bar ends up last on the horizontal chart.
author_books_num = books_df['Author'].value_counts(ascending=True).tail(10)
px.bar(
    x=author_books_num,
    y=author_books_num.index,
    color=author_books_num,
    title='Author by number of books',
    width=800,
    orientation='h',
)

Number of books by genre

In [None]:
# Row count per genre, shown as a pie with the second slice pulled out slightly.
genre_num = books_df['Genre'].value_counts()
pie_genre = go.Figure()
pie_genre.add_trace(
    go.Pie(
        values=genre_num,
        labels=genre_num.index,
        pull=[0, 0.05],
    )
)
pie_genre.update_layout(title='books by number of Genre', width=800, height=400)
pie_genre.show()

Author by mean of price

In [None]:
# Mean price per author; keep the ten highest, sorted ascending for the
# horizontal bar chart.
author_mean_p = (
    books_df.groupby('Author')['Price']
    .mean()
    .sort_values()
    .tail(10)
)
px.bar(
    x=author_mean_p,
    y=author_mean_p.index,
    color=author_mean_p,
    title='Author by mean of price',
    orientation='h',
    width=800,
)

Authors by total number of reviews

In [None]:
# we have a different year of publishing here, so we addup the reviews number
# Total review count per author: rows for the same author (across titles and
# years) are summed, and the ten largest totals are kept in ascending order.
# NOTE(review): if a title is listed in several years with the same cumulative
# review count, this sum counts those reviews more than once - confirm against
# the data before interpreting the totals.
books_reviews_num = books_df.groupby('Author').Reviews.sum().sort_values(ascending=True)[-10:]
# The aggregation is per author, so the title says "Author" like the other
# per-author charts in this notebook.
px.bar(x=books_reviews_num,
        y=books_reviews_num.index,
        color=books_reviews_num,
        orientation='h',
        title='Author by number of reviews')

mean book price by year

In [None]:
# Mean price for every year, sorted from cheapest to most expensive.
# No top-N slice is applied: the chart is titled "by year", and the input file
# name indicates 2009-2019 (eleven years), so keeping only ten entries would
# silently hide the cheapest year.
book_price_mean = books_df.groupby('Year').Price.mean().sort_values(ascending=True)
# Turn the year index into text labels (e.g. "2009 year"); presumably so the
# axis shows one discrete label per year instead of a numeric scale.
book_price_mean.index = book_price_mean.index.astype(str) + ' year'
px.bar(x=book_price_mean,
      y=book_price_mean.index,
      color=book_price_mean,
      orientation='h',
      title='mean book price by year')

How satisfied the users were with the books

In [None]:
def return_pull(value, pull=0.05):
    """Build the ``pull`` list for a pie chart so the largest slice(s) stand out.

    Parameters
    ----------
    value : pandas.Series or numpy.ndarray
        Slice sizes. Must be sized, iterable and expose ``.max()``.
    pull : float, default 0.05
        Offset assigned to every element equal to the maximum.

    Returns
    -------
    list
        ``pull`` for each element equal to the maximum of ``value`` and ``0``
        for all others, in the same order as ``value``. Tied maxima are all
        pulled. An empty input yields an empty list.
    """
    # Nothing to emphasise; this also avoids calling .max() on an empty
    # NumPy array, which raises.
    if len(value) == 0:
        return []
    # Compute the maximum once instead of once per element.
    largest = value.max()
    return [pull if v == largest else 0 for v in value]

# Frequency of each user-rating value, drawn as a pie with the most common
# rating pulled out of the circle.
auth_rate_pie = books_df.loc[:, 'User Rating'].value_counts()
pie = go.Figure()
pie.add_trace(
    go.Pie(
        values=auth_rate_pie,
        labels=auth_rate_pie.index,
        pull=return_pull(auth_rate_pie),
    )
)
pie.update_layout(title='How satisfied the users were with the books', width=800)

Books that appear most often in the dataset (top 10)

In [None]:
# How many rows each title has in the data; keep the ten most frequent
# titles, ascending, for the horizontal bar chart.
books_republished = books_df['Name'].value_counts(ascending=True).tail(10)
px.bar(
    x=books_republished,
    y=books_republished.index,
    color=books_republished,
    title='number of books republished',
    orientation='h',
)