# <center style='color:navy'> Internal Cluster Validation: `Dunn index (DI)` </center>

# 1. Import required libraries

In [1]:
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np

# 2. Create dataset 

In [2]:
# Generate 150 synthetic samples with 4 features each. The class labels that
# make_classification returns alongside the features are bound to `_` and
# never used; the fixed random_state keeps the data reproducible.
features, _ = datasets.make_classification(
    n_samples=150,
    n_features=4,
    random_state=32,
)

#### `Internal cluster validation` is applicable in situations where ground truth information is absent. Hence, we discard the labels returned by `make_classification` and keep only the features.

In [3]:
# Confirm the generated array holds 150 samples with 4 features each.
features.shape # (samples, features)

(150, 4)

# 3. Perform preprocessing

In [4]:
# Learn the per-feature scaling statistics first, then apply them, so that
# the fitted `scaler` stays available for later reuse.
scaler = StandardScaler().fit(features)
scaled = scaler.transform(features)

# 4. Create a dataframe

In [5]:
# Wrap the scaled array in a DataFrame (columns keep the default integer
# labels 0..3) and preview the first five rows.
df = pd.DataFrame(data=scaled)
df.head(n=5)

Unnamed: 0,0,1,2,3
0,-0.710192,-0.5098,-0.40841,-0.746329
1,0.939765,-0.539092,-0.717751,0.815132
2,2.099509,0.03246,-0.321335,1.996811
3,-0.476136,0.793959,0.90357,-0.338987
4,-0.684644,-1.358334,-1.29237,-0.842655


In [6]:
# The DataFrame keeps the (samples, features) layout of the scaled array.
df.shape

(150, 4)

# 5. Perform K-Means clustering considering 3 clusters

In [7]:
# Fit a 3-cluster K-Means model on the scaled features; `fit` returns the
# estimator itself, so construction and fitting can be chained. The fixed
# random_state makes the resulting labels reproducible.
kmeans1 = KMeans(n_clusters=3, n_init='auto', random_state=32).fit(df)
preds1 = kmeans1.labels_  # one cluster label per sample

# 6. Perform K-Means clustering considering 4 clusters

In [8]:
# Fit a 4-cluster K-Means model on the same scaled features; `fit` returns
# the estimator itself, so construction and fitting can be chained. The
# fixed random_state makes the resulting labels reproducible.
kmeans2 = KMeans(n_clusters=4, n_init='auto', random_state=32).fit(df)
preds2 = kmeans2.labels_  # one cluster label per sample

# 7. Add two new columns to the dataframe

In [9]:
# Attach both label vectors as trailing columns in a single step.
# NOTE: from this cell on `df` also carries the label columns, so the
# K-Means cells above must not be re-run without first re-creating `df`
# from the scaled features.
df = df.assign(Prediction1=preds1, Prediction2=preds2)
df.head()

Unnamed: 0,0,1,2,3,Prediction1,Prediction2
0,-0.710192,-0.5098,-0.40841,-0.746329,0,3
1,0.939765,-0.539092,-0.717751,0.815132,2,2
2,2.099509,0.03246,-0.321335,1.996811,2,2
3,-0.476136,0.793959,0.90357,-0.338987,1,3
4,-0.684644,-1.358334,-1.29237,-0.842655,0,0


In [10]:
# Show which cluster labels occur in each K-Means result
# (3-cluster run first, then the 4-cluster run).
for label_column in ('Prediction1', 'Prediction2'):
    print(df[label_column].unique())

[0 2 1]
[3 2 0 1]


The two lines above show the unique labels present in the `K-Means` clustering outcomes for 3 and 4 clusters, respectively.

# 8. Extract different clusters and create separate dataframes

In [11]:
# One sub-frame per label (0, 1, 2) of the 3-cluster solution.
C3_1, C3_2, C3_3 = (df[df['Prediction1'] == label] for label in range(3))

In [12]:
# One sub-frame per label (0, 1, 2, 3) of the 4-cluster solution.
C4_1, C4_2, C4_3, C4_4 = (df[df['Prediction2'] == label] for label in range(4))

# 9. Drop the labels from the clusters

In [13]:
# Keep only the feature values of each cluster as a NumPy array.
# The label columns are removed by name rather than by position, so the
# result stays correct even if further columns are later added to `df`.
clus3_1, clus3_2, clus3_3 = (
    cluster.drop(columns=['Prediction1', 'Prediction2']).to_numpy()
    for cluster in (C3_1, C3_2, C3_3)
)

In [14]:
# Keep only the feature values of each cluster as a NumPy array.
# The label columns are removed by name rather than by position, so the
# result stays correct even if further columns are later added to `df`.
clus4_1, clus4_2, clus4_3, clus4_4 = (
    cluster.drop(columns=['Prediction1', 'Prediction2']).to_numpy()
    for cluster in (C4_1, C4_2, C4_3, C4_4)
)

# 10. Create a list of all the clusters

In [15]:
# Bundle the per-cluster feature arrays so that each clustering
# (3 clusters / 4 clusters) can be handed to the scoring function as one list.
C3_list = [clus3_1, clus3_2, clus3_3]
C4_list = [clus4_1, clus4_2, clus4_3, clus4_4]

# 11. Custom function for calculating `Euclidean` distance

In [16]:
def euclidean(x, y):
    """Return the Euclidean (L2) distance between ``x`` and ``y``.

    The element-wise difference is squared, summed over all of its
    elements and square-rooted, so the result is a single scalar.

    Parameters
    ----------
    x, y : array-like supporting ``x - y``
        The two points (e.g. 1-D NumPy arrays of equal length).

    Returns
    -------
    numpy scalar
        ``sqrt(sum((x - y) ** 2))``.
    """
    delta = x - y
    squared_total = np.sum(delta * delta)
    return np.sqrt(squared_total)

# 12. Custom function for calculating `inter-cluster` distance

In [17]:
# inter[a, b] ---> Euclidean distance between sample a of x and sample b of y

def inter_cluster(x, y): # distance between clusters
    """Return the smallest Euclidean distance between a sample of ``x`` and a sample of ``y``.

    This is the minimum of ``euclidean(p, q)`` over every pair ``(p, q)``
    with ``p`` in ``x`` and ``q`` in ``y``, evaluated in one broadcast
    NumPy expression instead of a Python-level double loop.

    Parameters
    ----------
    x, y : array-like, one sample per row
        The two clusters. Both must be non-empty and their samples must
        have the same number of values.

    Returns
    -------
    numpy.float64
        The minimum pairwise distance between the two clusters.

    Raises
    ------
    ValueError
        If either cluster is empty.
    """
    n_x, n_y = len(x), len(y)
    if n_x == 0 or n_y == 0:
        raise ValueError('inter_cluster requires two non-empty clusters.')

    # Flatten every sample to one row so that scalar samples (1-D input) and
    # multi-dimensional samples are treated the same way.
    pts_x = np.asarray(x, dtype=float).reshape(n_x, -1)
    pts_y = np.asarray(y, dtype=float).reshape(n_y, -1)

    # (n_x, 1, d) - (1, n_y, d) broadcasts to all pairwise differences.
    # Memory grows with n_x * n_y * d, which is fine for small clusters.
    diff = pts_x[:, np.newaxis, :] - pts_y[np.newaxis, :, :]
    inter = np.sqrt(np.sum(np.square(diff), axis=-1))
    return np.min(inter)

# 13. Custom function for calculating `intra-cluster` distance

In [18]:
# intra[a, b] ---> Euclidean distance between samples a and b of the same cluster

def intra_cluster(x): # distance within the same cluster
    """Return the largest Euclidean distance between any two samples of ``x``.

    This is the maximum of ``euclidean(p, q)`` over every pair of samples in
    the cluster (its diameter), evaluated in one broadcast NumPy expression
    instead of a Python-level double loop. A single-sample cluster has a
    diameter of 0.

    Parameters
    ----------
    x : array-like, one sample per row
        The cluster. Must be non-empty.

    Returns
    -------
    numpy.float64
        The maximum pairwise distance inside the cluster.

    Raises
    ------
    ValueError
        If the cluster is empty.
    """
    n_samples = len(x)
    if n_samples == 0:
        raise ValueError('intra_cluster requires a non-empty cluster.')

    # Flatten every sample to one row so that scalar samples (1-D input) and
    # multi-dimensional samples are treated the same way.
    pts = np.asarray(x, dtype=float).reshape(n_samples, -1)

    # (n, 1, d) - (1, n, d) broadcasts to all pairwise differences.
    # Memory grows with n * n * d, which is fine for small clusters.
    diff = pts[:, np.newaxis, :] - pts[np.newaxis, :, :]
    intra = np.sqrt(np.sum(np.square(diff), axis=-1))
    return np.max(intra)

# 14. Calculate `Dunn index (DI)`

In [19]:
def dunn(_list_):
    """Return the Dunn index of a clustering.

    The index is the smallest separation between two different clusters
    (``inter_cluster``) divided by the largest cluster diameter
    (``intra_cluster``).

    Parameters
    ----------
    _list_ : sequence of array-like
        One entry per cluster, each holding that cluster's samples
        (one sample per row). At least two clusters are required.

    Returns
    -------
    numpy.float64
        ``min separation / max diameter``. If every cluster has a diameter
        of 0 the division is by zero and NumPy returns ``inf`` (or ``nan``).

    Raises
    ------
    ValueError
        If fewer than two clusters are supplied.
    """
    n_clusters = len(_list_)
    if n_clusters < 2:
        raise ValueError('Dunn index requires at least two clusters.')

    # Each cluster's diameter is computed exactly once.
    max_diameter = max(intra_cluster(cluster) for cluster in _list_)

    # Only genuine cluster pairs enter the minimum, so no placeholder value
    # can cap the separation when all clusters lie more than 1.0 apart.
    # The distance is symmetric, hence each unordered pair is visited once.
    min_separation = min(
        inter_cluster(_list_[a], _list_[b])
        for a in range(n_clusters)
        for b in range(a + 1, n_clusters)
    )

    return min_separation / max_diameter

In [20]:
# Score both clusterings with the same function and print one line each.
for caption, clusters in (
    ('Dunn index (DI) value for 3-clusters:', C3_list),
    ('Dunn index (DI) value for 4-clusters:', C4_list),
):
    print(caption, dunn(clusters))

Dunn index (DI) value for 3-clusters: 0.03291829616950834
Dunn index (DI) value for 4-clusters: 0.04968299613527416


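## A higher value of `Dunn index (DI)` indicates better clustering

The index is the smallest distance between two clusters divided by the largest cluster diameter, so compact and well-separated clusters score higher. Here the 4-cluster solution (DI ≈ 0.050) scores higher than the 3-cluster solution (DI ≈ 0.033).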