https://archive.ics.uci.edu/dataset/109/wine

# Importing Libraries and Dataset

In [None]:
import os

# Walk the Kaggle input directory recursively and print the full path of
# every file found, so the dataset location can be confirmed before loading.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from scipy.cluster.hierarchy import dendrogram, linkage

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Load the wine clustering CSV into a DataFrame; the bare `df` on the last
# line makes the notebook display it.
df = pd.read_csv('/kaggle/input/wine-dataset-for-clustering/wine-clustering.csv')
df

In [None]:
# Check null values: count the missing entries in each column, then show
# only the columns that have at least one (an empty result means no nulls).
null_counts = df.isna().sum()
null_counts[null_counts > 0]

In [None]:
# Standardize every column with StandardScaler so that no single feature
# dominates the distance computations used by the clustering steps below.
sc = StandardScaler()
# fit_transform returns a plain array; wrap it back into a DataFrame so the
# original column names are preserved.
df_scaled = pd.DataFrame(sc.fit_transform(df), columns=df.columns)

# KMeans

### Calculating WCSS (within-cluster sum of squares) values for different numbers of clusters k

In [None]:
# Collect the within-cluster sum of squares (KMeans inertia) for each
# candidate cluster count k = 2..7, using a fixed seed for reproducibility.
wcss = []
for k in range(2, 8):
    kmeans = KMeans(n_clusters=k, random_state=2).fit(df_scaled)
    wcss.append(kmeans.inertia_)

wcss

### Line plot of WCSS vs. number of clusters (WCSS plot)

In [None]:
# Plot WCSS against the number of clusters (same k range as the loop that
# filled `wcss`); the trailing semicolon suppresses the notebook's text output.
sns.lineplot(y=wcss, x=range(2,8));

In [None]:
# Let yellowbrick's KElbowVisualizer fit the KMeans estimator over the k range
# given by the tuple and draw the elbow plot.
# NOTE(review): a (2, 8) tuple is presumably treated as a half-open range
# (k = 2..7), matching the manual WCSS loop — confirm against yellowbrick docs.
kmeansviz = KMeans(random_state=2)
visualizer = KElbowVisualizer(kmeansviz, k=(2,8))
visualizer.fit(df_scaled)
visualizer.show();

## The elbow plot indicates that 3 clusters is the optimum for the given data

### Let's check the Silhouette score as well to find the best number of clusters

In [None]:
# Fit a fresh KMeans model for each candidate cluster count (2..7) and print
# the mean silhouette score of the resulting labels (higher is better).
# random_state is pinned to the same seed used by every other KMeans call in
# this notebook so the printed scores are reproducible between runs and are
# comparable with the k=3 model that is fitted with random_state=2.
for i in range(2,8):
    kmeans_sil_score = KMeans(n_clusters=i, random_state=2)
    kmeans_sil_score.fit(df_scaled)
    # Cluster index assigned to each data point by this fit
    labels = kmeans_sil_score.labels_

    # Calculate the silhouette score
    silhouette_avg = silhouette_score(df_scaled, labels)
    print(f"Clusters: {i}    Silhouette Score: {silhouette_avg}")

### The Silhouette Score also indicates that the optimum number of clusters is 3
### Let's use the Silhouette visualizer to check the clustering for any wrongly assigned data points (the cell below shows k=3)

In [None]:
# Draw the per-sample silhouette plot for a 3-cluster KMeans model.
# The visualizer fits the wrapped estimator itself when .fit() is called.
kmeans_sil = KMeans(n_clusters=3,random_state=2)
visualizer = SilhouetteVisualizer(kmeans_sil)
visualizer.fit(df_scaled)
visualizer.show();

# KMeans k=3

In [None]:
# Final KMeans model with the chosen k=3; attach each row's cluster index to
# the scaled features as a new 'Labels' column and display the result.
kmeans = KMeans(n_clusters=3, random_state=2).fit(df_scaled)
cluster_labels = pd.Series(kmeans.labels_, name='Labels')
df_labeled = pd.concat([df_scaled, cluster_labels], axis=1)
df_labeled

In [None]:
# Report the mean silhouette score of the fitted 3-cluster KMeans model.
print('Silhouette Score with 3 clusters using KMeans:',silhouette_score(df_scaled, kmeans.labels_))

In [None]:
# Cluster centres of the fitted model, one row per cluster, expressed in the
# same (scaled) feature space the model was fitted on.
centroids = kmeans.cluster_centers_
centroids

In [None]:
# Visualize clusters: scaled Alcohol vs Malic_Acid, coloured by KMeans label.
sns.scatterplot(
    x=df_labeled.Alcohol,
    y=df_labeled.Malic_Acid,
    hue=pd.Series(kmeans.labels_),
    palette=['red', 'green', 'blue'],
)
# Overlay each cluster centre as a large black triangle.
# NOTE(review): uses centroid coordinates 0 and 1, which assumes Alcohol and
# Malic_Acid are the first two columns of the data — confirm.
for center in centroids:
    plt.plot(center[0], center[1], color='black', marker='^', ms=20)

In [None]:
# Per-cluster mean of every feature (values are in standardized units, since
# df_labeled was built from the scaled data).
df_labeled.groupby(by='Labels').mean()

In [None]:
# Pairwise scatter plots of all features, coloured by cluster label, to see
# which feature pairs separate the three clusters.
sns.pairplot(df_labeled, hue='Labels', palette=['red','green','blue'])

# AgglomerativeClustering

In [None]:
# Build the hierarchical-clustering linkage matrix on the scaled data using
# Ward's method; it is the input for the dendrogram.
linkage_matrix = linkage(df_scaled, method='ward')

In [None]:
# Draw the dendrogram for the Ward linkage computed above; the trailing
# semicolon suppresses the notebook's text output of the returned dict.
plt.title('Linkage : Wards')
dendrogram(linkage_matrix);

In [None]:
# Agglomerative (hierarchical) clustering into 3 groups on the scaled data,
# followed by its mean silhouette score for comparison with KMeans.
ac = AgglomerativeClustering(n_clusters=3)
labels_ac = ac.fit_predict(df_scaled)

silhouette_avg = silhouette_score(df_scaled, labels_ac)
print(f"Silhouette Score: {silhouette_avg}")

### Since the Silhouette Score for KMeans clustering is better than for Agglomerative clustering, KMeans is the advisable choice for this data

# PCA

In [None]:
# n_components semantics: an integer keeps exactly that many principal
# components; a float between 0 and 1 is treated as a target explained-variance
# ratio, keeping as many components as needed to reach it (here 99%).
pca = PCA(n_components=0.99)
# Fit on the scaled data and project it onto the retained components.
df_pca = pca.fit_transform(df_scaled)

In [None]:
# Eigenvalues: the variance explained by each retained principal component,
# followed by their total (the notebook displays the last expression).
print(pca.explained_variance_)
sum(pca.explained_variance_)

In [None]:
# Fraction of the total variance explained by each retained component, and
# the overall fraction retained (expected to reach the 0.99 target requested).
print(pca.explained_variance_ratio_)
sum(pca.explained_variance_ratio_)

In [None]:
# Running total of the explained-variance ratio: shows how many components
# are needed to reach any given share of the variance.
np.cumsum(pca.explained_variance_ratio_)

In [None]:
# Eigenvectors (principal axes)
# Each row of components_ is one eigenvector: a direction in the scaled
# feature space along which the data varies, ordered by explained variance.
# The corresponding eigenvalue (explained_variance_) gives the magnitude of
# the variance along that direction.
pca.components_

In [None]:
# Display the PCA-projected data (the array returned by fit_transform; one
# column per retained component).
df_pca