In [None]:
#importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import silhouette_score, silhouette_samples, calinski_harabasz_score, davies_bouldin_score
from sklearn.cluster import KMeans

In [None]:
# Load the wine dataset from the Kaggle input directory into a DataFrame.
wine_csv_path = "../input/wine-pca/Wine.csv"
df = pd.read_csv(wine_csv_path)

In [None]:
# Preview the first five rows to check that the file was parsed as expected.
df.head(n=5)

In [None]:
# List the distinct values of the Customer_Segment column before it is dropped.
df["Customer_Segment"].unique()

In [None]:
# Remove the Customer_Segment column so that only feature columns remain.
# A new frame is assigned instead of dropping in place, and errors="ignore"
# keeps the cell re-runnable: the original in-place drop raised KeyError the
# second time it was executed because the column was already gone.
df = df.drop(columns="Customer_Segment", errors="ignore")

In [None]:
# Count how many columns contain at least one missing value.
df.isna().any().sum()

In [None]:
# Show the (rows, columns) dimensions of the frame.
df.shape

In [None]:
# Plot the distribution (histogram + KDE) of every column on a grid of subplots.
# The number of rows is derived from the column count so that no column is
# silently skipped and no IndexError is raised if the frame grows.
n_grid_cols = 3
n_grid_rows = int(np.ceil(len(df.columns) / n_grid_cols))
fig, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize=(14, 12), squeeze=False)
axes_ = ax.ravel()  # flatten the 2-D grid of axes into a 1-D sequence
for i, c in enumerate(df.columns):
    # sns.distplot is deprecated; histplot with kde=True and stat="density"
    # is the documented replacement for a normalized histogram plus KDE curve.
    sns.histplot(df[c], kde=True, stat="density", ax=axes_[i], color='orange')
# Hide the grid cells that have no column to show.
for spare_axis in axes_[len(df.columns):]:
    spare_axis.set_visible(False)
# Lay out the figure once, after all subplots are drawn, instead of once per column.
fig.tight_layout()

In [None]:
# Draw one box plot per column to expose spread and outliers.
# The number of rows is derived from the column count so that no column is
# silently skipped and no IndexError is raised if the frame grows.
n_grid_cols = 3
n_grid_rows = int(np.ceil(len(df.columns) / n_grid_cols))
fig, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize=(14, 12), squeeze=False)
axes_ = ax.ravel()  # flatten the 2-D grid of axes into a 1-D sequence
for i, c in enumerate(df.columns):
    # Pass the values explicitly as x=: the meaning of the first positional
    # argument of sns.boxplot changed between seaborn releases (x vs. data),
    # so relying on it gives version-dependent orientation and warnings.
    sns.boxplot(x=df[c], ax=axes_[i], color='skyblue')
# Hide the grid cells that have no column to show.
for spare_axis in axes_[len(df.columns):]:
    spare_axis.set_visible(False)
# Lay out the figure once, after all subplots are drawn, instead of once per column.
fig.tight_layout()

In [None]:
# Scatter every column against Alcohol to look for pairwise relationships.
# The number of rows is derived from the column count so that no column is
# silently skipped and no IndexError is raised if the frame grows.
n_grid_cols = 3
n_grid_rows = int(np.ceil(len(df.columns) / n_grid_cols))
fig, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize=(14, 12), squeeze=False)
axes_ = ax.ravel()  # flatten the 2-D grid of axes into a 1-D sequence
for i, c in enumerate(df.columns):
    # Refer to both variables by column name now that data=df is supplied,
    # instead of mixing a column name for x with a Series for y.
    sns.scatterplot(data=df, x="Alcohol", y=c, ax=axes_[i])
# Hide the grid cells that have no column to show.
for spare_axis in axes_[len(df.columns):]:
    spare_axis.set_visible(False)
# Lay out the figure once, after all subplots are drawn, instead of once per column.
fig.tight_layout()

The scatterplots above suggest a relationship between Alcohol and both Proline and Color_Intensity. It is also worth noting that ash alcalinity declines marginally as alcohol increases.

In [None]:
# Pairwise Pearson correlation between all columns of the frame.
corr = df.corr(method="pearson")

In [None]:
# Render the correlation matrix as an annotated heatmap on an explicit axes.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap="Blues", ax=heatmap_ax)

In [None]:
# Correlation cut-off used when filtering the correlation matrix in the next cell.
threshold = 0.6

In [None]:
# Show only the strongly correlated pairs; every other cell is masked to NaN.
# The comparison uses the absolute value so that strong negative correlations
# are shown as well, not just the positive ones.
corr[corr.abs() > threshold]

In [None]:
# Columns that are removed from the feature set in the cells below.
col = [
    "Proline",
    "Flavanoids",
    "Proanthocyanins",
    "OD280",
]

In [None]:
# Work on an independent copy. A plain `df_new = df` only creates a second name
# for the same object, so modifying df_new in place would also modify df.
df_new = df.copy()

In [None]:
# Build the reduced feature frame without the columns listed in `col`.
# The result is derived from df and assigned to df_new rather than dropped in
# place: df itself stays intact, and the cell can be re-run without a KeyError
# for columns that were already removed.
df_new = df.drop(columns=col)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler that is fitted on the reduced feature frame below, before clustering.
se = StandardScaler()

In [None]:
# Keep the column names: the scaler output is rebuilt into a DataFrame with them below.
col_n = df_new.columns

In [None]:
# Fit the scaler on df_new and transform it in a single step.
se_ = se.fit_transform(df_new)

In [None]:
# Wrap the scaled values in a DataFrame that carries the original column names.
df_ = pd.DataFrame(data=se_, columns=col_n)

In [None]:
# Seed NumPy's global random number generator.
# NOTE(review): presumably intended to make the K-Means runs below repeatable;
# that only holds if the cells are executed in order from a fresh kernel.
np.random.seed(42)

In [None]:
# Candidate numbers of clusters to evaluate: every k from k_min to k_max inclusive.
k_min, k_max = 2, 15
clusters = range(k_min, k_max + 1)

In [None]:
# One empty result list per evaluation metric; the sweep below appends one value per k.
inertia, sil_score, calinski_score, davies_score = [], [], [], []

In [None]:
# Fit K-Means for every candidate k and record one inertia, silhouette,
# Calinski-Harabasz and Davies-Bouldin value per k.

# (Re)create the result lists here so that re-running this cell never appends a
# second sweep to lists left over from a previous run, which would make them
# longer than `clusters` and break the plots that follow.
inertia, sil_score, calinski_score, davies_score = [], [], [], []

# Cluster on the feature columns only: a later cell appends a "Label" column to
# df_, and it must not leak into the features if this cell is run again.
sweep_features = df_.drop(columns="Label", errors="ignore")

for i in clusters:
    # n_jobs is no longer accepted by KMeans (removed in scikit-learn 1.0), so
    # it is not passed. n_init=10 pins the historical default number of
    # restarts, and random_state makes every fit reproducible on its own.
    kmeans_mod = KMeans(n_clusters=i, init="k-means++", n_init=10, random_state=42).fit(sweep_features)
    sweep_labels = kmeans_mod.labels_
    inertia.append(kmeans_mod.inertia_)
    sil_score.append(silhouette_score(sweep_features, sweep_labels))
    calinski_score.append(calinski_harabasz_score(sweep_features, sweep_labels))
    davies_score.append(davies_bouldin_score(sweep_features, sweep_labels))

In [None]:
# Side-by-side line plots of inertia (left) and silhouette score (right)
# against the number of clusters.
fig, (inertia_ax, silhouette_ax) = plt.subplots(1, 2, figsize=(12, 5))

inertia_ax.plot(clusters, inertia, marker='o', linestyle='--')
inertia_ax.set_xlabel("Clusters")
inertia_ax.set_ylabel("Inertia")
inertia_ax.set_title("Clusters v/s Inertia")

silhouette_ax.plot(clusters, sil_score, marker='o', linestyle='--', color='r')
silhouette_ax.set_xlabel("Clusters")
silhouette_ax.set_ylabel("Silhouette Score")
silhouette_ax.set_title("Clusters v/s Silhouette Score")

fig.tight_layout()

Based on the graphs above, k = 3 looks like the best choice: the silhouette score peaks at 3 clusters. We will still inspect some other metrics to make sure this decision is right.

In [None]:
# Side-by-side line plots of the Calinski-Harabasz score (left) and the
# Davies-Bouldin score (right) against the number of clusters.
fig, (calinski_ax, davies_ax) = plt.subplots(1, 2, figsize=(12, 6))

calinski_ax.plot(clusters, calinski_score, marker='o', linestyle='--', color='g')
calinski_ax.set_xlabel("Clusters")
calinski_ax.set_ylabel("Calinski Score")  # axis label previously misspelled as "Calsinki"
calinski_ax.set_title("Clusters v/s Calinski Score")

davies_ax.plot(clusters, davies_score, marker='o', linestyle='--', color='orange')
davies_ax.set_xlabel("Clusters")
davies_ax.set_ylabel("Davies Score")
davies_ax.set_title("Clusters v/s Davies Score")

fig.tight_layout()

The Calinski-Harabasz index is highest at 3 clusters, which matches the number of clusters suggested by the silhouette score. The Davies-Bouldin index suggests either 3 or 15 clusters, but we will select 3 clusters because both the Calinski-Harabasz score and the silhouette score are highest there.

**For Your Information:**

**1) Calinski-Harabasz Index:** If the ground truth labels are not known, the Calinski-Harabasz index (sklearn.metrics.calinski_harabasz_score) - also known as the Variance Ratio Criterion - can be used to evaluate the model, where a higher Calinski-Harabasz score relates to a model with better defined clusters.

**2) Davies Bouldin Index:** If the ground truth labels are not known, the Davies-Bouldin index (sklearn.metrics.davies_bouldin_score) can be used to evaluate the model, where a lower Davies-Bouldin index relates to a model with better separation between the clusters.

In [None]:
# Final model with the chosen number of clusters (k = 3).
# n_jobs is no longer accepted by KMeans (removed in scikit-learn 1.0), so it is
# not passed. n_init=10 pins the historical default number of restarts, and
# random_state makes the fit reproducible independently of cell execution order.
k_model = KMeans(n_clusters=3, init="k-means++", n_init=10, random_state=42)

In [None]:
# Fit the final model and get one cluster id per row.
# Any "Label" column is excluded first: a later cell appends it to df_, and
# without this guard re-running the cell would cluster on the previous labels.
label_predict = k_model.fit_predict(df_.drop(columns="Label", errors="ignore"))

In [None]:
# Coordinates of the fitted cluster centres, in the same (scaled) space the model was fitted on.
centers = k_model.cluster_centers_

In [None]:
# Single-column frame holding the predicted cluster id of every row.
label_df = pd.DataFrame({"Label": label_predict})

In [None]:
# Attach the cluster labels to the scaled features.
# An existing "Label" column is dropped first so that re-running this cell
# replaces the labels instead of appending a second, duplicate "Label" column.
df_ = pd.concat([df_.drop(columns="Label", errors="ignore"), label_df], axis=1)

In [None]:
# Preview the first five rows of the scaled features together with their label.
df_.head(n=5)

In [None]:
# Translate the 0-based cluster ids into 1-based labels (0 -> 1, 1 -> 2, 2 -> 3).
mapping = {cluster_id: cluster_id + 1 for cluster_id in range(3)}

In [None]:
# Relabel the clusters with the 1-based ids from `mapping`.
# The raw predictions are mapped rather than the current "Label" column, so
# re-running this cell cannot shift already-shifted labels again (which would
# turn 1 into 2, 2 into 3 and 3 into NaN).
df_["Label"] = pd.Series(label_predict, index=df_.index).map(mapping)

In [None]:
# Number of rows assigned to each cluster label.
df_['Label'].value_counts()

In [None]:
# Bar chart of the cluster sizes.
df_["Label"].value_counts().plot.bar()

I am new to machine learning. If you like my work, don't forget to **upvote** it!