In [None]:
!pip install plotly
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.cluster import KMeans, AgglomerativeClustering
import plotly.express as px

In [None]:
# Read 'wine-clustering.csv' (path is relative to the working directory) into a DataFrame.
df = pd.read_csv('wine-clustering.csv')
# Bare last expression: the notebook renders the frame as a table.
df

### Data Understanding/Visualization

In [None]:
# List the column labels of the loaded frame.
df.columns

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Show the dtype of every column.
df.dtypes

In [None]:
# Grid of pairwise plots across the columns of df.
sns.pairplot(df)

In [None]:
# Annotated correlation heatmap; the figure is resized to 9x9 inches afterwards.
sns.heatmap(data=df.corr(), annot=True)
heatmap_fig = plt.gcf()
heatmap_fig.set_size_inches(9, 9)

In [None]:
# Pairwise correlation matrix of the columns, shown as a table.
df.corr()

In [None]:
# Total_Phenols vs. Flavanoids, colored by the 'Hue' column.
sns.scatterplot(data=df, x='Total_Phenols', y='Flavanoids', hue='Hue')

In [None]:
# Proline vs. Alcohol, colored by the 'Hue' column.
sns.scatterplot(data=df, x='Proline', y='Alcohol', hue='Hue')

In [None]:
# OD280 vs. Flavanoids, colored by the 'Hue' column.
sns.scatterplot(data=df, x='OD280', y='Flavanoids', hue='Hue')

### Pre-processing the data

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Standardize every column of df with StandardScaler's default settings.
x = StandardScaler().fit_transform(df.values)

In [None]:
# Project the standardized data onto its first two principal components.
pca = PCA(n_components=2)
# Note: despite its name, pca_mdl holds the transformed data, not the fitted model.
pca_mdl = pca.fit_transform(x)
# No column names are given, so the frame's columns are the integers 0 and 1.
pca_df = pd.DataFrame(data=pca_mdl)
pca_df

### K-Means Clustering

In [None]:
# Elbow-method sweep: fit K-Means for k = 1..7 and record sqrt(inertia_) per k.
inertia = []
for k in range(1, 8):
    kmeans = KMeans(random_state=1, n_clusters=k)
    kmeans = kmeans.fit(x)  # fit() returns the estimator itself
    root_inertia = np.sqrt(kmeans.inertia_)
    inertia.append(root_inertia)

In [None]:
# Elbow plot for k = 1..7.
# The `inertia` list holds np.sqrt(kmeans.inertia_), not a variance, so the
# y-axis is labelled with what is actually plotted.
plt.plot(range(1, 8), inertia, marker='s')
plt.xlabel('$k$')
# The trailing semicolon belongs on the last expression to suppress its text repr.
plt.ylabel('Square root of inertia (within-cluster SSE)');

In [None]:
# Final model: two clusters, with a fixed random_state for repeatable results.
kmeans = KMeans(random_state=1, n_clusters=2)
kmeans = kmeans.fit(x)  # fit() returns the estimator itself

In [None]:
# `kmeans` was already fitted on x when it was created, so reuse its labels.
# fit_predict(x) would run the whole fit a second time and, with the fixed
# random_state, return these same labels.
y = kmeans.labels_

In [None]:
# Put the cluster labels in a one-column frame and append it to the original columns.
y_df = pd.Series(y, name='Cluster').to_frame()
new_df = pd.concat(objs=[df, y_df], axis=1)
new_df

In [None]:
# Display the two PCA coordinate columns (labelled 0 and 1).
pca_df

In [None]:
# PCA projection colored by the K-Means cluster label.
# pca_df's columns are the integers 0 and 1, which make poor axis labels,
# so the axes are named explicitly.
sns.scatterplot(x=pca_df[0], y=pca_df[1], hue=y)
plt.xlabel('PC1')
plt.ylabel('PC2');

In [None]:
# PCA projection colored by the 'Alcohol' column.
# pca_df's columns are the integers 0 and 1, which make poor axis labels,
# so the axes are named explicitly.
sns.scatterplot(x=pca_df[0], y=pca_df[1], hue=df['Alcohol'])
plt.xlabel('PC1')
plt.ylabel('PC2');

In [None]:
# PCA projection colored by the 'Proanthocyanins' column.
# pca_df's columns are the integers 0 and 1, which make poor axis labels,
# so the axes are named explicitly.
sns.scatterplot(x=pca_df[0], y=pca_df[1], hue=df['Proanthocyanins'])
plt.xlabel('PC1')
plt.ylabel('PC2');

In [None]:
# PCA projection colored by the 'Nonflavanoid_Phenols' column.
# pca_df's columns are the integers 0 and 1, which make poor axis labels,
# so the axes are named explicitly.
sns.scatterplot(x=pca_df[0], y=pca_df[1], hue=df['Nonflavanoid_Phenols'])
plt.xlabel('PC1')
plt.ylabel('PC2');

In [None]:
# PCA projection colored by the 'Flavanoids' column.
# pca_df's columns are the integers 0 and 1, which make poor axis labels,
# so the axes are named explicitly.
sns.scatterplot(x=pca_df[0], y=pca_df[1], hue=df['Flavanoids'])
plt.xlabel('PC1')
plt.ylabel('PC2');

In [None]:
# Interactive view of the clusters in PCA space, with the original features on hover.
# The PCA columns are renamed because their default labels are the integers 0 and 1.
plot_df = pd.concat(
    [new_df, pca_df.rename(columns={0: 'PC1', 1: 'PC2'})],
    axis=1,
)
# Cluster ids are categories, not magnitudes: casting them to str makes Plotly
# Express assign one discrete color per cluster instead of a continuous color scale.
plot_df['Cluster'] = plot_df['Cluster'].astype(str)

fig = px.scatter(
    plot_df,
    x='PC1',
    y='PC2',
    color='Cluster',
    hover_data=['Alcohol', 'Malic_Acid', 'Ash', 'Ash_Alcanity', 'Magnesium',
                'Total_Phenols', 'Flavanoids', 'Nonflavanoid_Phenols',
                'Proanthocyanins', 'Color_Intensity', 'Hue', 'OD280', 'Proline'],
)
fig.show()