# <center style='color:deeppink'> Internal Cluster Validation Using `ClustersFeatures` Module </center>

## 1. Import required libraries

In [1]:
import ClustersFeatures
print('ClustersFeatures version:', ClustersFeatures.__version__)
from ClustersFeatures import ClustersCharacteristics

from sklearn.cluster import KMeans
from sklearn import datasets

import pandas as pd

ClustersFeatures version: 1.0.3


## 2. Create dataset 

In [2]:
# Build a reproducible synthetic dataset (fixed random_state). The second
# return value is bound to `_` and is not used by the clustering cells below.
features, _ = datasets.make_classification(
    n_samples=250,
    n_features=4,
    random_state=1,
)

##### `Internal cluster validation` is applicable in situations where ground truth information is absent. Hence, we discard the labels returned by `make_classification` and use only the features.

In [3]:
# Display the dimensions of the generated array as a sanity check.
features.shape # (samples, features)

(250, 4)

## 3. Perform K-Means clustering considering 2 clusters

In [4]:
# Give the 2-cluster run its own DataFrame built from the feature matrix.
df1 = pd.DataFrame(features)

# Fit K-Means with k=2 on the bare features, i.e. before the label column
# is added to the frame; `fit` returns the estimator itself.
kmeans1 = KMeans(n_clusters=2, n_init='auto', random_state=1).fit(df1)
labels1 = kmeans1.labels_

# Store the cluster assignments in the column later passed as `label_target`.
df1['Labels1'] = labels1
print(df1.shape)
print(df1['Labels1'].unique())

# Wrap the labelled frame so the validation indices can be computed from it.
cc1 = ClustersCharacteristics(df1, label_target='Labels1')
df1.head()

(250, 5)
[1 0]


Unnamed: 0,0,1,2,3,Labels1
0,-0.281532,-0.30315,-1.217345,1.024123,1
1,-0.21818,-0.200283,-1.119217,0.897119,1
2,-0.598217,-0.786243,-1.86578,1.751918,1
3,-1.194223,-2.003848,-1.521355,2.20087,1
4,-0.820275,-2.00325,2.135508,-0.359775,1


## 4. Perform K-Means clustering considering 3 clusters

In [5]:
# Give the 3-cluster run its own DataFrame built from the feature matrix.
df2 = pd.DataFrame(features)

# Fit K-Means with k=3 on the bare features, i.e. before the label column
# is added to the frame; `fit` returns the estimator itself.
kmeans2 = KMeans(n_clusters=3, n_init='auto', random_state=1).fit(df2)
labels2 = kmeans2.labels_

# Store the cluster assignments in the column later passed as `label_target`.
df2['Labels2'] = labels2
print(df2.shape)
print(df2['Labels2'].unique())

# Wrap the labelled frame so the validation indices can be computed from it.
cc2 = ClustersCharacteristics(df2, label_target='Labels2')
df2.head()

(250, 5)
[1 0 2]


Unnamed: 0,0,1,2,3,Labels2
0,-0.281532,-0.30315,-1.217345,1.024123,1
1,-0.21818,-0.200283,-1.119217,0.897119,1
2,-0.598217,-0.786243,-1.86578,1.751918,1
3,-1.194223,-2.003848,-1.521355,2.20087,1
4,-0.820275,-2.00325,2.135508,-0.359775,0


## 5. Calculate `Ball-Hall` index

In [6]:
# Report the Ball-Hall score of each clustering (k=2 first, then k=3),
# rounded to 5 decimals.
for caption, clustering in (
    ('Ball-Hall index for 2-clusters:', cc1),
    ('Ball-Hall index for 3-clusters:', cc2),
):
    print(caption, round(clustering.score_index_ball_hall(), 5))

Ball-Hall index for 2-clusters: 3.04258
Ball-Hall index for 3-clusters: 1.91677


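### Lower value of `Ball-Hall` index means more compact clusters. Note that it tends to decrease as the number of clusters increases, so the usual selection rule is the largest difference between successive values rather than the minimum itself.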

## 6. Calculate `Banfeld-Raftery` index

In [7]:
# Report the Banfeld-Raftery score of each clustering (k=2 first, then k=3),
# rounded to 5 decimals.
for caption, clustering in (
    ('Banfeld-Raftery index for 2-clusters:', cc1),
    ('Banfeld-Raftery index for 3-clusters:', cc2),
):
    print(caption, round(clustering.score_index_banfeld_Raftery(), 5))

Banfeld-Raftery index for 2-clusters: 276.62073
Banfeld-Raftery index for 3-clusters: 160.51897


### Lower value of `Banfeld-Raftery` index means better clustering.

## 7. Calculate `Ray-Turi` index

In [8]:
# Report the Ray-Turi score of each clustering (k=2 first, then k=3),
# rounded to 5 decimals.
for caption, clustering in (
    ('Ray-Turi index for 2-clusters:', cc1),
    ('Ray-Turi index for 3-clusters:', cc2),
):
    print(caption, round(clustering.score_index_ray_turi(), 5))

Ray-Turi index for 2-clusters: 0.33844
Ray-Turi index for 3-clusters: 0.22495


### Lower value of `Ray-Turi` index means better clustering.

## 8. Calculate `Xie-Beni` index 

In [9]:
# Report the Xie-Beni score of each clustering (k=2 first, then k=3),
# rounded to 5 decimals.
for caption, clustering in (
    ('Xie-Beni index for 2-clusters:', cc1),
    ('Xie-Beni index for 3-clusters:', cc2),
):
    print(caption, round(clustering.score_index_xie_beni(), 5))

Xie-Beni index for 2-clusters: 496.54651
Xie-Beni index for 3-clusters: 126.40268


### Lower value of `Xie-Beni` index means better clustering.

## 9. Calculate `C` index

In [10]:
# Report the C index of each clustering (k=2 first, then k=3),
# rounded to 5 decimals.
for caption, clustering in (
    ('C index for 2-clusters:', cc1),
    ('C index for 3-clusters:', cc2),
):
    print(caption, round(clustering.score_index_c(), 5))

C index for 2-clusters: 0.1962
C index for 3-clusters: 0.11912


### Lower value of `C` index means better clustering.

## 10. Calculate `Mclain-Rao` index 

In [11]:
# Report the Mclain-Rao score of each clustering (k=2 first, then k=3),
# rounded to 5 decimals.
for caption, clustering in (
    ('Mclain-Rao index for 2-clusters:', cc1),
    ('Mclain-Rao index for 3-clusters:', cc2),
):
    print(caption, round(clustering.score_index_mclain_rao(), 5))

Mclain-Rao index for 2-clusters: 0.59801
Mclain-Rao index for 3-clusters: 0.48652


### Lower value of `Mclain-Rao` index means better clustering.

## 11. Calculate `Wemmert-Gancarski` index 

In [12]:
# Report the Wemmert-Gancarski score of each clustering (k=2 first, then k=3),
# rounded to 5 decimals.
for caption, clustering in (
    ('Wemmert-Gancarski index for 2-clusters:', cc1),
    ('Wemmert-Gancarski index for 3-clusters:', cc2),
):
    print(caption, round(clustering.score_index_wemmert_gancarski(), 5))

Wemmert-Gancarski index for 2-clusters: 0.50183
Wemmert-Gancarski index for 3-clusters: 0.55066


### Higher value of `Wemmert-Gancarski` index means better clustering.

## 12. Calculate `PBM` index 

In [13]:
# Report the PBM score of each clustering (k=2 first, then k=3),
# rounded to 5 decimals.
for caption, clustering in (
    ('PBM index for 2-clusters:', cc1),
    ('PBM index for 3-clusters:', cc2),
):
    print(caption, round(clustering.score_index_PBM(), 5))

PBM index for 2-clusters: 3.95345
PBM index for 3-clusters: 3.97818


### Higher value of `PBM` index means better clustering.