In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Build the full path of every file found below the Kaggle input directory,
# then print the paths one per line in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Exploratory Data Analysis
1. Check for null values
2. Genre counting
3. Distribution of Genres corresponding to the year
4. Author having max and min Genres
5. Reviews comparisons among Genres
6. Genres types max and min price
7. Book having maximum and minimum rating.
8. Ratings distribution
9. Price distribution
10. Review Count distribution

**Import necessary libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Path of the bestsellers CSV inside the Kaggle input directory.
csv_path = "/kaggle/input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv"
# Read the whole file into a DataFrame, then preview its first rows.
dataset = pd.read_csv(filepath_or_buffer=csv_path)
dataset.head()

**1. Check null values**

In [None]:
# Count the missing entries in each column (isnull is an alias of isna).
dataset.isnull().sum()

From the output above, we see that there are no null values.

**2. Check which Genre is most common**

In [None]:
# Bar chart with one bar per Genre value, showing how many rows carry it.
sns.countplot(data=dataset, x='Genre')

From the plot above, we can conclude that there are more Non Fiction books than Fiction books.

**3. Which year has maximum Fiction and Non-Fiction books**

In [None]:
# Box plot of the Year column for each Genre.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, and the other box plots in this notebook already use keywords.
sns.boxplot(x='Genre', y='Year', data=dataset)

In [None]:
# (rows, columns) of the subset whose Genre is exactly 'Non Fiction'.
dataset.loc[dataset['Genre'].eq('Non Fiction')].shape

In [None]:
# (rows, columns) of the subset whose Genre is exactly 'Fiction'.
dataset.loc[dataset['Genre'].eq('Fiction')].shape

From the analysis above, we can say:
1. There are more "Non Fiction" books (310) than "Fiction" books (240).
2. Both Non-Fiction and Fiction books are spread evenly across the years, with no outliers.
3. Both genres have their median in the year 2014.

**4. Authors with the most and fewest books in each Genre**

In [None]:
# Preview the first rows again to recall the available columns before the author analysis.
dataset.head()

In [None]:
# Number of distinct authors. nunique() is the pandas built-in for this and,
# unlike len(np.unique(...)), it ignores missing values instead of choking on them.
dataset['Author'].nunique()  ## Total number of authors is 248

In [None]:
# Open a very tall figure so that each author can get its own row on the y-axis.
plt.figure(figsize=(10, 50))
# Horizontal bars per author, split by Genre.
# As there are lots of authors, the graph is still not clear.
sns.countplot(data=dataset, y="Author", hue="Genre")

In [None]:
## Let us plot the per-genre book counts for the first three authors

# Take the first three distinct authors in order of appearance, then keep ALL of
# their rows. Using dataset.head(3) would keep only three rows, so every bar
# would have height 1 regardless of how many books each author really has.
first_three_authors = dataset['Author'].unique()[:3]

# NOTE: the variable name is historical (it holds three authors, not five rows);
# it is kept unchanged in case other cells refer to it.
first_five_dataset = dataset[dataset['Author'].isin(first_three_authors)]

sns.countplot(x="Author", hue="Genre", data=first_five_dataset)

In [None]:
# Split the data into one frame per genre, matching the Genre label exactly.
fiction_data = dataset.loc[dataset['Genre'].eq('Fiction')]
non_fiction_data = dataset.loc[dataset['Genre'].eq('Non Fiction')]

In [None]:
# Number of Fiction rows per author, largest first.
# Jeff Kinney has the most Fiction books.
(
    fiction_data
    .groupby('Author')['Genre']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Number of Fiction rows per author, smallest first.
data = (
    fiction_data
    .groupby('Author')['Genre']
    .count()
    .sort_values(ascending=True)
)
# Keep only the authors that appear exactly once; there are many of them.
data.loc[data.eq(1)]

In [None]:
# Number of Non Fiction rows per author, largest first.
# Gary Chapman has the most Non Fiction books.
(
    non_fiction_data
    .groupby('Author')['Genre']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Number of Non Fiction rows per author, smallest first.
data = (
    non_fiction_data
    .groupby('Author')['Genre']
    .count()
    .sort_values(ascending=True)
)
# Keep only the authors that appear exactly once; there are many of them.
data.loc[data.eq(1)]

So, for Task 4, we conclude that Jeff Kinney is the author with the most Fiction books and Gary Chapman is the author with the most Non Fiction books.

### 5. Reviews comparisons among Genres

In [None]:
# Preview the first rows again to recall the available columns before the review analysis.
dataset.head()

In [None]:
# Narrow the frame to the two columns needed for the review comparison.
reviews_genres_data = dataset.loc[:, ['Reviews', 'Genre']]
# Display the resulting two-column frame.
reviews_genres_data

In [None]:
# Total of the Reviews column for each Genre.
(
    reviews_genres_data
    .groupby('Genre')['Reviews']
    .sum()
)

In [None]:
# Box plot of the Reviews column, one box per Genre.
sns.boxplot(data=reviews_genres_data, x='Genre', y='Reviews')

From the above, we can conclude that Fiction has a total of 3764110 reviews, whereas Non Fiction has 2810195 reviews. The maximum Fiction review count is close to 40000, and the maximum Non Fiction review count is close to 25000.

### 6. Genres types max and min price

In [None]:
# Narrow the frame to the two columns needed for the price comparison.
genre_price_data = dataset.loc[:, ['Genre', 'Price']]
# Display the resulting two-column frame.
genre_price_data

In [None]:
# Highest value of the Price column within each Genre.
(
    genre_price_data
    .groupby('Genre')['Price']
    .max()
)

In [None]:
# Box plot of the Price column, one box per Genre.
sns.boxplot(data=genre_price_data, x='Genre', y='Price')

From the diagram and analysis above, we can conclude that Non-Fiction books cost more than Fiction books.

### 7. Book having maximum and minimum rating.

In [None]:
# List the column labels to look up the exact name of the rating column used below.
dataset.columns

In [None]:
# Narrow the frame to the two columns needed for the rating comparison.
genre_rating_data = dataset.loc[:, ['Genre', 'User Rating']]
# Display the resulting two-column frame.
genre_rating_data

In [None]:
# Highest value of the User Rating column within each Genre.
(
    genre_rating_data
    .groupby('Genre')['User Rating']
    .max()
)

In [None]:
# Lowest value of the User Rating column within each Genre.
(
    genre_rating_data
    .groupby('Genre')['User Rating']
    .min()
)

In [None]:
# Box plot of the User Rating column, one box per Genre.
sns.boxplot(data=genre_rating_data, x='Genre', y='User Rating')

From the analysis above, we conclude that the maximum rating for both genres is 4.9, whereas the minimum rating is 3.3 for Fiction and 4.0 for Non-Fiction. This suggests that people prefer Non-Fiction books.

### 8. Rating distribution

In [None]:
# Histogram of the User Rating column with a KDE curve on a density scale.
# sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal;
# histplot(kde=True, stat="density") is its documented replacement.
sns.histplot(data=dataset, x='User Rating', bins=20, kde=True, stat='density')

From the plot above, we can conclude that the rating distribution is left-skewed.

### 9 . Price Distribution

In [None]:
# Histogram of the Price column with a KDE curve on a density scale.
# sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal;
# histplot(kde=True, stat="density") is its documented replacement.
sns.histplot(data=dataset, x='Price', bins=10, kde=True, stat='density')

From the plot above, we can conclude that the price distribution is right-skewed.

### 10. Review Distribution

In [None]:
# Histogram of the Reviews column with a KDE curve on a density scale.
# sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal;
# histplot(kde=True, stat="density") is its documented replacement.
sns.histplot(data=dataset, x='Reviews', bins=10, kde=True, stat='density')

The review-count distribution is also right-skewed.