In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, fname)
    for folder, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans


In [None]:
# Load the Amazon bestsellers CSV into the working DataFrame.
DATA_PATH = '/kaggle/input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Show the loaded DataFrame as this cell's output.
df

In [None]:
# Visual missing-value check: heatmap of the boolean null mask of df.
sns.set_style('whitegrid')
null_mask = df.isnull()
sns.heatmap(null_mask, cmap='rainbow')

In [None]:
# Box plot of the Reviews column on a square 8x8 figure.
box_figsize = (8, 8)
plt.figure(figsize=box_figsize)
sns.boxplot(data=df, y="Reviews", palette='rainbow')

In [None]:
# Number of distinct authors in the dataset.
df.loc[:, 'Author'].nunique()

In [None]:
# Authors with book price > average price.
# Filter the frame once and plot from that subset. The earlier version passed a
# filtered Author Series together with the unfiltered `data=df`, which only
# lined up x and y through implicit index alignment inside seaborn/pandas.
above_avg = df[df['Price'] > df['Price'].mean()]
new_auth = above_avg['Author']  # kept so any later reference to `new_auth` still works
plt.figure(figsize=(12,40))
ax = sns.barplot(x='Price', y='Author', data=above_avg)
ax.set_title('Authors with book price above the average price');

In [None]:
# Authors with price > 20, split by Genre.
# The threshold is named instead of being a bare literal, and the frame is
# filtered once so x, y and hue all come from the same rows explicitly
# (previously a filtered Series was mixed with the unfiltered `data=df`).
PRICE_THRESHOLD = 20
expensive = df[df['Price'] > PRICE_THRESHOLD]
new_auth = expensive['Author']  # kept so any later reference to `new_auth` still works
plt.figure(figsize=(12,20))
ax = sns.barplot(x='Price', y='Author', data=expensive, hue='Genre', palette='coolwarm')
ax.set_title(f'Authors with book price above {PRICE_THRESHOLD}');

In [None]:
# Distribution of review counts: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated (and slated for removal); `histplot` with
# kde=True and stat='density' is the documented replacement for its default output.
plt.figure(figsize=(10,6))
ax = sns.histplot(df['Reviews'], bins=100, color='red', kde=True, stat='density')
ax.set_title('Distribution of Reviews');

In [None]:
# Joint scatter of Reviews (x) against User Rating (y), coloured by Genre.
sns.jointplot(
    data=df,
    x='Reviews',
    y='User Rating',
    hue='Genre',
    kind='scatter',
)

In [None]:
# Pairwise relationships between the columns of df, coloured by Genre.
sns.pairplot(data=df, palette='viridis', hue='Genre')

In [None]:
# Correlation heatmap of the numeric columns.
# Numeric columns are selected explicitly: on pandas >= 2.0 `df.corr()` no
# longer drops string columns silently and raises on them instead.
plt.figure(figsize=(10,6))
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cmap='coolwarm')

In [None]:
# Summary statistics for every column except Year.
df.drop(columns='Year').describe()

In [None]:
# Overlaid histograms of User Rating, one colour per Genre.
# `size=` was renamed to `height=` in seaborn and the old name was later
# removed, so `height=` is required on current releases.
sns.set_style('darkgrid')
g = sns.FacetGrid(df, hue="Genre", palette='coolwarm', height=6, aspect=2)
g = g.map(plt.hist, 'User Rating', bins=20, alpha=0.7)
g.add_legend();  # without a legend the two hue colours cannot be told apart

In [None]:
# Overlaid histograms of Reviews, one colour per Genre.
# `size=` was renamed to `height=` in seaborn and the old name was later
# removed, so `height=` is required on current releases.
sns.set_style('darkgrid')
g = sns.FacetGrid(df, hue="Genre", palette='coolwarm', height=6, aspect=2)
g = g.map(plt.hist, 'Reviews', bins=20, alpha=0.7)
g.add_legend();  # without a legend the two hue colours cannot be told apart

**Encoding Categorical Data**

In [None]:
# Replace the Genre labels with integer codes; `le` keeps the fitted mapping.
le = LabelEncoder().fit(df['Genre'])
df['Genre'] = le.transform(df['Genre'])

In [None]:
# Print the column dtypes and non-null counts of df (Genre has been label-encoded by now).
df.info()

In [None]:
# Feature matrix for clustering: every column except the Name/Author text fields.
# NOTE(review): the remaining columns are used unscaled; KMeans is distance-based,
# so consider standardising them — confirm whether that is intended.
feature_frame = df.drop(columns=['Name', 'Author'])
X = feature_frame.to_numpy()

In [None]:
# Inspect the feature matrix built from df.
X

# K-Means Clustering

In [None]:
# Elbow method to find the optimal number of clusters:
# fit KMeans for k = 1..10 and plot the within-cluster sum of squares (inertia).
cluster_counts = range(1, 11)
w = []  # inertia (WCSS) for each candidate k, in the same order as cluster_counts
for i in cluster_counts:
    # n_init is pinned to 10 (the historical default) because newer scikit-learn
    # releases default to 'auto', which would change the fitted result.
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=100)
    kmeans.fit(X)
    w.append(kmeans.inertia_)
plt.plot(cluster_counts, w)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Choosing the number of clusters as 4 (presumably read off the elbow plot).
# n_init is pinned to 10 (the historical default) so the clustering does not
# change with the scikit-learn version; random_state fixes the initialisation.
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=100)
y = kmeans.fit_predict(X)  # cluster index for each row of X

In [None]:
# Centroid coordinates of the fitted KMeans model.
kmeans.cluster_centers_

In [None]:
# Cluster index the fitted model assigned to each row of X.
kmeans.labels_

In [None]:
# Two side-by-side scatter plots sharing the User Rating axis, coloured by the
# KMeans cluster label: Reviews vs. rating (left) and Price vs. rating (right).
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(14,6))

ax1.set_title('K Means Review/Rating')
ax1.scatter(df['Reviews'], df['User Rating'], c=kmeans.labels_, cmap='rainbow')
ax1.set_xlabel('Reviews')
ax1.set_ylabel('User Rating')  # shared y-axis, so labelled on the left panel only

ax2.set_title("K Means Price/Rating ")
ax2.scatter(df['Price'], df['User Rating'], c=kmeans.labels_, cmap='rainbow')
ax2.set_xlabel('Price')

plt.show()

In [None]:
# Cluster labels returned by kmeans.fit_predict(X).
y

**Visualizing Clusters**

In [None]:
# Scatter the first two feature columns of X, one colour per cluster, with the
# centroids drawn on top in black.
plt.figure(facecolor='cyan',figsize=(10,6))
sns.set_style('white')

# One colour per cluster index; a loop replaces four copy-pasted scatter calls.
cluster_colors = ['yellow', 'red', 'blue', 'green']
for cluster_id, color in enumerate(cluster_colors):
    members = X[y == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s = 10, c = color,
                label = f'Cluster {cluster_id + 1}')

centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], s = 50, c = 'black', label = 'Centroids')

plt.title('Clusters')
plt.xlabel('Ratings')
# The y-axis shows the second feature column of X; its name is taken from the
# same column selection used to build X, so it is not hard-coded here.
plt.ylabel(df.drop(['Name','Author'],axis=1).columns[1])
plt.legend(loc='best', bbox_to_anchor=(1,0.5,0.25,0.25))
plt.show()