# **Import Libraries**

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import scipy

# **Data Preprocessing**

In [None]:
# Load the best-seller listing from the relative input path.
df = pd.read_csv('../input/amazon-best-seller-june-2021-products/Amazon_Best_Seller_2021_June.csv')
# Preview the first rows rather than rendering the whole frame.
df.head()

In [None]:
# Number of distinct values per column (the method must be called to get the counts).
df.nunique()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics for the frame as loaded (before any cleaning).
df.describe()

In [None]:
# Count missing values in every column.
df.isna().sum()

Good, there are no null values.

In [None]:
# The product URL is not used in the analysis, so remove that column.
df = df.drop(columns=['Product Link'])

In [None]:
# Preview the first rows to confirm the column was dropped.
df.head()

In [None]:
# Remove thousands separators so the review counts can later be cast to integers.
df['Reviews Count'] = df['Reviews Count'].replace(to_replace=',', value='', regex=True)

In [None]:
# Preview the first rows to check the cleaned 'Reviews Count' column.
df.head()

In [None]:
# Strip the currency symbol. '$' is the end-of-string anchor when treated as a
# regular expression, so it must be matched literally (regex=False); otherwise
# nothing is removed and the later cast to float fails.
df['Price'] = df['Price'].str.replace('$', '', regex=False)

In [None]:
# Preview the first rows to check the cleaned 'Price' column.
df.head()

In [None]:
# Drop the leading '#' from the rank labels; the pattern is a plain string, so
# match it literally instead of going through the regex engine.
df['Rank'] = df['Rank'].str.replace('#', '', regex=False)
# Preview the first rows rather than rendering the whole frame.
df.head()

In [None]:
# Remove the ' Sellers' suffix so only the number remains; the pattern is a plain
# string, so match it literally instead of going through the regex engine.
df['No of Sellers'] = df['No of Sellers'].str.replace(' Sellers', '', regex=False)
# Preview the first rows rather than rendering the whole frame.
df.head()

In [None]:
# Cast the cleaned text columns to numeric dtypes in a single pass.
df = df.astype({
    'No of Sellers': int,
    'Rank': float,
    'Reviews Count': int,
    'Price': float,
})

In [None]:
# List the distinct category labels in order of first appearance.
pd.unique(df['Category'])

In [None]:
# Assign each category label an integer code (its position in this list) and
# replace the text labels in the frame with those codes.
category_dict = {
    label: code
    for code, label in enumerate([
        'Electronics',
        'Clothing, Shoes & Jewelry',
        'Gift Cards',
        'Books',
        'Video Games',
        'Camera & Photo',
        'Toys & Games',
    ])
}
df['Category'] = df['Category'].map(category_dict)

In [None]:
# Preview the first rows of the fully cleaned frame.
df.head()

# **Data Visualizations**

# **Univariate Analysis**

In [None]:
# Frequency of each distinct rating value, most common first.
df['Rating'].value_counts()

In [None]:
# Histogram of product ratings.
sns.histplot(data=df, x='Rating')

In [None]:
# Share of products at each rating value.
df['Rating'].value_counts().plot(kind='pie')
plt.show()

In [None]:
# Box plot showing the spread and outliers of the ratings.
sns.boxplot(data=df, x='Rating')

As expected, the higher ratings are more frequent in the dataset. Ratings of 4.9 and 5.0 are less frequent than expected, but this is most likely because such ratings are scarce in general.

In [None]:
# Frequency of each distinct seller count, most common first.
df['No of Sellers'].value_counts()

In [None]:
# Histogram of the number of sellers per product.
sns.histplot(data=df, x='No of Sellers')

In [None]:
# Share of products for each seller count.
df['No of Sellers'].value_counts().plot(kind='pie')
plt.show()

In [None]:
# Box plot showing the spread and outliers of the seller counts.
sns.boxplot(data=df, x='No of Sellers')

It is very interesting to see that products sold by a single seller make up the majority of the dataset.

In [None]:
# Frequency of each distinct price, most common first.
df['Price'].value_counts()

In [None]:
# Histogram of product prices.
sns.histplot(data=df, x='Price')

In [None]:
# Share of products at each distinct price.
df['Price'].value_counts().plot(kind='pie')
plt.show()

In [None]:
# Box plot showing the spread and outliers of the prices.
sns.boxplot(data=df, x='Price')

As expected, the cheaper products are more frequent in the dataset. 

In [None]:
# Number of products per category code (see category_dict for the labels).
df['Category'].value_counts()

In [None]:
# Print the label -> code mapping so the integer codes on the x-axis can be read.
print(category_dict)
# Category holds integer codes, so draw one unit-width bar centred on each code
# instead of letting the binning treat the codes as continuous data.
sns.histplot(data=df, x='Category', discrete=True)

In [None]:
# Print the label -> code mapping, then show each category code's share of products.
print(category_dict)
df['Category'].value_counts().plot(kind='pie')
plt.show()

As expected, the dataset is mostly balanced in terms of categories

# **Bivariate Analysis**

In [None]:
# Pairwise scatter plots (with per-variable distributions) for the frame's numeric columns.
sns.pairplot(df)

No interesting trends. 

We are going to analyze the relation of each variable to rank and category.

In [None]:
# Distribution of ratings within each category. Both axes are numeric, so the
# orientation is stated explicitly: Category (y) is the grouping axis, giving one
# horizontal box per category rather than one box per distinct rating.
sns.boxplot(data=df, x='Rating', y='Category', orient='h')

In [None]:
# Summary statistics of the ratings for each category code.
df.groupby('Category')['Rating'].describe()

In [None]:
# Distribution of review counts within each category. Both axes are numeric, so
# the orientation is stated explicitly: Category (y) is the grouping axis, giving
# one horizontal box per category rather than one box per distinct review count.
sns.boxplot(data=df, x='Reviews Count', y='Category', orient='h')

In [None]:
# Summary statistics of the review counts for each category code.
df.groupby('Category')['Reviews Count'].describe()

In [None]:
# Distribution of prices within each category. Both axes are numeric, so the
# orientation is stated explicitly: Category (y) is the grouping axis, giving one
# horizontal box per category rather than one box per distinct price.
sns.boxplot(data=df, x='Price', y='Category', orient='h')

In [None]:
# Summary statistics of the prices for each category code.
df.groupby('Category')['Price'].describe()

In [None]:
# Distribution of seller counts within each category. Both axes are numeric, so
# the orientation is stated explicitly: Category (y) is the grouping axis, giving
# one horizontal box per category rather than one box per distinct seller count.
sns.boxplot(data=df, x='No of Sellers', y='Category', orient='h')

In [None]:
# Summary statistics of the seller counts for each category code.
df.groupby('Category')['No of Sellers'].describe()

In [None]:
# Show the label -> code mapping needed to read the per-category results above.
print(category_dict)

Some interesting trends we see here: the video games category has about twice as many sellers per product as any other category, along with twice the standard deviation; video games also have the highest standard deviation for ratings; gift cards typically cost around $50; and gift cards have more than 4 times the review count of any other category, with the highest standard deviation.

In [None]:
# Distribution of seller counts at each rank. Both axes are numeric, so the
# orientation is stated explicitly: Rank (y) is the grouping axis, matching the
# per-rank summary computed with groupby('Rank').
sns.boxplot(data=df, x='No of Sellers', y='Rank', orient='h')

In [None]:
# Summary statistics of the seller counts for each rank.
df.groupby('Rank')['No of Sellers'].describe()

In [None]:
# Distribution of ratings at each rank. Both axes are numeric, so the orientation
# is stated explicitly: Rank (y) is the grouping axis, matching the per-rank
# summary computed with groupby('Rank').
sns.boxplot(data=df, x='Rating', y='Rank', orient='h')

In [None]:
# Summary statistics of the ratings for each rank.
df.groupby('Rank')['Rating'].describe()

In [None]:
# Distribution of review counts at each rank. Both axes are numeric, so the
# orientation is stated explicitly: Rank (y) is the grouping axis, giving one box
# per rank rather than one box per distinct review count.
sns.boxplot(data=df, x='Reviews Count', y='Rank', orient='h')

In [None]:
# Summary statistics of the review counts for each rank.
df.groupby('Rank')['Reviews Count'].describe()

In [None]:
# Distribution of prices at each rank. Both axes are numeric, so the orientation
# is stated explicitly: Rank (y) is the grouping axis, giving one box per rank
# rather than one box per distinct price.
sns.boxplot(data=df, x='Price', y='Rank', orient='h')

In [None]:
# Summary statistics of the prices for each rank.
df.groupby('Rank')['Price'].describe()

# **Multivariate Analysis**

In [None]:
# Pairwise correlation matrix drawn as an annotated heatmap.
plt.figure(figsize=(20,10))
# Restrict to numeric columns first: any text column still in the frame would make
# DataFrame.corr() raise on recent pandas versions instead of being skipped.
c = df.select_dtypes(include='number').corr()
sns.heatmap(c, cmap='Blues', annot=True)
# Also show the matrix itself as the cell output.
c

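There is not much correlation in the dataset. Rank correlates most strongly with price and reviews count. Category correlates most strongly with price, no of sellers, and reviews count (note that the category codes are arbitrary labels, so these coefficients should be interpreted with caution).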

In [None]:
# Price vs. review count for the top five ranks and for ranks 95-99 only,
# coloured by rank; fit_reg=False leaves out the regression line.
sns.lmplot(
    data=df[df['Rank'].isin([1, 2, 3, 4, 5, 95, 96, 97, 98, 99])],
    x='Price',
    y='Reviews Count',
    hue='Rank',
    fit_reg=False,
)

In [None]:
# Price vs. review count for every product, coloured by category code;
# fit_reg=False leaves out the regression line.
sns.lmplot(
    data=df,
    x='Price',
    y='Reviews Count',
    hue='Category',
    fit_reg=False,
)

In [None]:
# Price vs. number of sellers for every product, coloured by category code;
# fit_reg=False leaves out the regression line.
sns.lmplot(
    data=df,
    x='Price',
    y='No of Sellers',
    hue='Category',
    fit_reg=False,
)

In [None]:
# Review count vs. number of sellers for every product, coloured by category code;
# fit_reg=False leaves out the regression line.
sns.lmplot(
    data=df,
    x='Reviews Count',
    y='No of Sellers',
    hue='Category',
    fit_reg=False,
)

Thanks for reading through this notebook! If you learned something, remember to upvote! :)