In [1]:
# Supervised evaluation: Gini impurity of each cluster obtained from K-means (k=3), measured with respect to the true class labels

from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
import numpy as np

# Fetch the Iris data: feature matrix X and the true class labels y
iris = load_iris()
X, y = iris.data, iris.target

# Build the K-means estimator (k=3, fixed seed), then fit it to the features
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(X)

# Group the true class labels by the cluster each sample was assigned to
# during fitting: labels[k] collects the true labels of every sample whose
# K-means cluster index is k. The number of groups is read from the fitted
# model so it always matches the clustering instead of a hard-coded 3.
labels = [[] for _ in range(kmeans.n_clusters)]
for cluster_id, true_label in zip(kmeans.labels_, y):
    labels[cluster_id].append(true_label)

# Calculate the Gini impurity of each cluster with respect to the true labels:
#     gini = 1 - sum_j p_j**2
# where p_j is the fraction of the cluster's samples that belong to class j.
# A score of 0 means the cluster is pure (a single class); larger values mean
# the cluster mixes several classes.
classes = np.unique(y)  # loop-invariant, so computed once up front
gini_scores = []
for cluster_labels in labels:
    # Convert to an ndarray so `== cls` is explicitly element-wise rather than
    # depending on NumPy's reflected comparison against a Python list.
    cluster_labels = np.asarray(cluster_labels)
    cluster_size = cluster_labels.size
    if cluster_size > 0:
        p_i = np.array([np.sum(cluster_labels == cls) for cls in classes]) / cluster_size
        gini_score = 1 - np.sum(p_i ** 2)
    else:
        # An empty cluster has no class mixture; report it as pure.
        gini_score = 0.0
    gini_scores.append(gini_score)

print("Gini scores for the clusters:", gini_scores)

Gini scores for the clusters: [0.34963579604578565, 0.0, 0.09972299168975085]


In [None]:
#Cluster 1 has the highest Gini score (about 0.35), indicating that it is the least homogeneous (most mixed) cluster with respect to the true labels of the samples. This means that its samples are spread over more than one class.
#Cluster 2 has a Gini score of 0, which means that all the samples in this cluster belong to the same class.
#Cluster 3 has a lower Gini score than Cluster 1 (about 0.10 versus 0.35), indicating that it is more homogeneous with respect to the true labels of the samples, though not perfectly pure like Cluster 2.