In [1]:
import warnings
from sklearn.metrics import confusion_matrix
import kagglehub
from tqdm import tqdm
import numpy as np
# Download latest version
path = kagglehub.dataset_download("ziadhanyai/fashion-six-classes")
print("Path to dataset files:", path)

Path to dataset files: C:\Users\allex\.cache\kagglehub\datasets\ziadhanyai\fashion-six-classes\versions\1


In [2]:
import os

dataset_path = os.path.join(path, "Fashion")
class_to_int = {}
for idx, class_name in enumerate(os.listdir(dataset_path)):
    class_to_int[class_name] = idx
class_to_int

{'Accessories': 0,
 'Formal': 1,
 'Hoodie': 2,
 'Pants': 3,
 'Shoes': 4,
 'T-Shirt': 5}

In [3]:
import pandas as pd
import cv2 as cv
warnings.filterwarnings("ignore", category=UserWarning)  # Disable UserWarnings
tqdm.pandas()

df = pd.DataFrame(columns=["ImageName", "OriginalImage", "Label"])

for class_name, class_idx in class_to_int.items():
    # put images with labels into dataframe
    img_dir_class = os.path.join(dataset_path, class_name)
    for img_path in tqdm(os.listdir(img_dir_class)):
        full_img_path = os.path.join(img_dir_class, img_path)
        if img_path.endswith(".jpg") or img_path.endswith(".png") or img_path.endswith(".jpeg"):
            img = cv.imread(full_img_path)
            img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
            img = cv.resize(img, (224, 224), interpolation=cv.INTER_CUBIC)
            df.loc[len(df)] = [img_path, img, class_name]

100%|██████████| 1140/1140 [00:21<00:00, 53.90it/s]
100%|██████████| 1064/1064 [00:14<00:00, 71.02it/s]
100%|██████████| 1089/1089 [00:16<00:00, 66.13it/s]
100%|██████████| 1001/1001 [00:16<00:00, 59.56it/s]
100%|██████████| 1050/1050 [00:17<00:00, 58.35it/s]
100%|██████████| 1063/1063 [00:16<00:00, 64.54it/s]


# Image Representation A (Simple flattening of Gray Scaled Images)

In [4]:
df["FlattenedImage"] = df["OriginalImage"].progress_apply(lambda x: cv.cvtColor(x, cv.COLOR_RGB2GRAY).flatten() / 255.0)

100%|██████████| 6343/6343 [00:00<00:00, 6591.72it/s]


# Convert String labels to idx

In [5]:
df['LabelIdx'] = df['Label'].progress_apply(lambda x: class_to_int.get(x, class_to_int.get(np.nan)))

100%|██████████| 6343/6343 [00:00<00:00, 1553182.92it/s]


In [6]:
df.iloc[0]

ImageName                                               Image_1.png
OriginalImage     [[[255, 255, 255], [255, 255, 255], [255, 255,...
Label                                                   Accessories
FlattenedImage    [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...
LabelIdx                                                          0
Name: 0, dtype: object

# Image Representation B (ResNet embeddings)

In [7]:
import torch
from torchvision import models, transforms

# Load pretrained ResNet model
resnet = models.resnet18(pretrained=True)
resnet.eval()  # Set to evaluation mode
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])
# Define image transformation
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Function to extract features from image
def get_resnet_features(image_array):
    image_tensor = transform(image_array).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():  # No need to track gradients
        features = resnet(image_tensor)  # Get features from the ResNet model
    return features.squeeze().numpy()  # Remove batch dimension and convert to numpy array

# Apply ResNet feature extraction on the "Image" column
df["ResNetFeatures"] = df["OriginalImage"].progress_apply(lambda x: get_resnet_features(x))

100%|██████████| 6343/6343 [01:21<00:00, 77.94it/s]


In [8]:
import numpy as np
from sklearn.model_selection import train_test_split

X_train_flatten, X_val_flatten, y_train_flatten, y_val_flatten = train_test_split(np.array(df['FlattenedImage'].tolist()), df['LabelIdx'], test_size=0.2, random_state=42)
X_train_resnet, X_val_resnet, y_train_resnet, y_val_resnet = train_test_split(np.array(df['ResNetFeatures'].tolist()), df['LabelIdx'], test_size=0.2, random_state=42)

In [9]:
X_train_flatten.shape

(5074, 50176)

In [10]:
from sklearn.decomposition import PCA

# Initialize PCA with the number of components you want to keep.
# For example, to reduce to 50 components:
pca = PCA(n_components=50)

# Fit the PCA on the training data and transform it
X_train_flatten_pca = pca.fit_transform(X_train_flatten)

# Optionally, you can also transform the validation set with the same PCA model
X_val_flatten_pca = pca.transform(X_val_flatten)

# Check the new shape
print("New training data shape:", X_train_flatten_pca.shape)
print("New validation data shape:", X_val_flatten_pca.shape)

New training data shape: (5074, 50)
New validation data shape: (1269, 50)


# DBSCAN

In [11]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import adjusted_rand_score, silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Function to evaluate DBSCAN performance for a given combination of hyperparameters
def evaluate_dbscan(model, X, y):
    y_pred = model.labels_
    
    # Ignore noise points (label -1)
    mask = y_pred != -1
    X_filtered = X[mask]
    y_filtered = y[mask]
    y_pred_filtered = y_pred[mask]
    
    nr_of_removed_labels = len(y) - len(y_filtered)
    print(f"Removed {nr_of_removed_labels} out of {len(y)} as noise (-1 label)")
    if nr_of_removed_labels > len(y) / 2: # half of labels
        print(f"HALF OF DATA IS REMOVED!")
        return -1, -1, -1, -1, -1
    
    silhouette_avg = -1
    davies_bouldin = -1
    calinski_harabasz = -1
    ari = -1
    purity = -1
    
    if len(np.unique(y_pred_filtered)) > 1:
        # Silhouette Coefficient (higher is better)
        silhouette_avg = silhouette_score(X_filtered, y_pred_filtered)
        print(f"Silhouette Coefficient: {silhouette_avg:.4f}")
        
        # Davies-Bouldin Index (lower is better)
        davies_bouldin = davies_bouldin_score(X_filtered, y_pred_filtered)
        print(f"Davies-Bouldin Index: {davies_bouldin:.4f}")
        
        # Calinski-Harabasz Index (higher is better)
        calinski_harabasz = calinski_harabasz_score(X_filtered, y_pred_filtered)
        print(f"Calinski-Harabasz Index: {calinski_harabasz:.4f}")
        
        # Optionally, print ARI for comparison with true labels
        ari = adjusted_rand_score(y_filtered, y_pred_filtered)
        print(f"Adjusted Rand Index (ARI): {ari:.4f}")
        
        cm = confusion_matrix(y, y_pred)
        purity = np.sum(np.amax(cm, axis=0)) / np.sum(cm)
        print(f"Cluster Purity: {purity:.4f}")
        
        print(f"Number of clusters detected: {len(np.unique(y_pred_filtered))}")
    else:
        print("Metrics cannot be computed (only 1 cluster found)")
    
    return silhouette_avg, davies_bouldin, calinski_harabasz, ari, purity

# DBSCAN + Representation A

In [12]:
from sklearn.model_selection import ParameterGrid

# Define the hyperparameter grid for tuning
param_grid = {
    'eps': [30, 35, 40, 45, 50, 60, 70 ,80 ,90, 100, 125, 150, 175],  # Range for eps
    'min_samples': [3, 5, 7, 10, 15]  # Range for min_samples
}

# Convert the grid into parameter combinations
grid = ParameterGrid(param_grid)

# Iterate over each parameter combination and evaluate
best_silhouette_avg = -1  # Track the best ARI score
best_params = None  # Track the best hyperparameters
results_dbscan_respresentation_A = {
    "min_samples": [],
    "eps": [],
    "silhouette_avg": [],
    "davies_bouldin": [],
    "calinski_harabasz": [],
    "ari": [],
    "purity": [],
}

for params in grid:
    eps = params['eps']
    min_samples = params['min_samples']
    
    print(f"Evaluating: eps={eps}, min_samples={min_samples}")

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit_predict(X_train_flatten_pca)
    silhouette_avg, davies_bouldin, calinski_harabasz, ari, purity = evaluate_dbscan(dbscan, X_train_flatten_pca, y_train_flatten)
    
    results_dbscan_respresentation_A['min_samples'].append(min_samples)
    results_dbscan_respresentation_A['eps'].append(eps)
    results_dbscan_respresentation_A['silhouette_avg'].append(silhouette_avg)
    results_dbscan_respresentation_A['davies_bouldin'].append(davies_bouldin)
    results_dbscan_respresentation_A['calinski_harabasz'].append(calinski_harabasz)
    results_dbscan_respresentation_A['ari'].append(ari)
    results_dbscan_respresentation_A['purity'].append(purity)
    print()
    
    if silhouette_avg > best_silhouette_avg:
        best_silhouette_avg = silhouette_avg
        best_params = params

print(f"\nBest Silhouette: {best_silhouette_avg} with parameters: {best_params}")

Evaluating: eps=30, min_samples=3
Removed 2480 out of 5074 as noise (-1 label)
Silhouette Coefficient: -0.2357
Davies-Bouldin Index: 1.0712
Calinski-Harabasz Index: 6.0762
Adjusted Rand Index (ARI): 0.0004
Cluster Purity: 0.2708
Number of clusters detected: 106

Evaluating: eps=30, min_samples=5
Removed 2847 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=30, min_samples=7
Removed 3086 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=30, min_samples=10
Removed 3264 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=30, min_samples=15
Removed 3442 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=35, min_samples=3
Removed 1535 out of 5074 as noise (-1 label)
Silhouette Coefficient: -0.2982
Davies-Bouldin Index: 1.0124
Calinski-Harabasz Index: 5.4091
Adjusted Rand Index (ARI): 0.0007
Cluster Purity: 0.2266
Number of clusters detected: 58

Evaluating: eps=35, min_samples=5
Removed 1756 out of

In [31]:
df_results_A = pd.DataFrame(results_dbscan_respresentation_A)
df_results_A.sort_values(by='purity', ascending=False, inplace=True)
df_results_A

Unnamed: 0,min_samples,eps,silhouette_avg,davies_bouldin,calinski_harabasz,ari,purity
0,3,30,-0.235692,1.071215,6.076191,0.000382,0.270792
5,3,35,-0.298233,1.012393,5.409059,0.000742,0.226646
6,5,35,-0.120713,1.185718,8.797441,0.001101,0.217580
7,7,35,-0.016447,1.374017,11.472542,0.001303,0.214426
8,10,35,-0.068112,1.408700,13.131018,-0.000134,0.209894
...,...,...,...,...,...,...,...
34,15,70,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000
35,3,80,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000
36,5,80,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000
37,7,80,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000


# DBSCAN + Representation B

In [15]:
from sklearn.model_selection import ParameterGrid

# Define the hyperparameter grid for tuning
param_grid = {
    'eps': [0.5, 1, 2, 5, 7, 10, 15, 20, 30, 35, 40, 45, 50, 60, 70 ,80 ,90, 100, 125, 150, 175],  # Range for eps
    'min_samples': [3, 5, 7, 10, 15]  # Range for min_samples
}

# Convert the grid into parameter combinations
grid = ParameterGrid(param_grid)

# Iterate over each parameter combination and evaluate
best_silhouette_avg = -1  # Track the best ARI score
best_params = None  # Track the best hyperparameters
results_dbscan_respresentation_B = {
    "min_samples": [],
    "eps": [],
    "silhouette_avg": [],
    "davies_bouldin": [],
    "calinski_harabasz": [],
    "ari": [],
    "purity": [],
    }

for params in grid:
    eps = params['eps']
    min_samples = params['min_samples']
    
    print(f"Evaluating: eps={eps}, min_samples={min_samples}")

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit_predict(X_train_resnet)
    silhouette_avg, davies_bouldin, calinski_harabasz, ari, purity = evaluate_dbscan(dbscan, X_train_resnet, y_train_resnet)
    
    results_dbscan_respresentation_B['min_samples'].append(min_samples)
    results_dbscan_respresentation_B['eps'].append(eps)
    results_dbscan_respresentation_B['silhouette_avg'].append(silhouette_avg)
    results_dbscan_respresentation_B['davies_bouldin'].append(davies_bouldin)
    results_dbscan_respresentation_B['calinski_harabasz'].append(calinski_harabasz)
    results_dbscan_respresentation_B['ari'].append(ari)
    results_dbscan_respresentation_B['purity'].append(purity)
    
    print()
    
    if silhouette_avg > best_silhouette_avg:
        best_silhouette_avg = silhouette_avg
        best_params = params

print(f"\nBest Silhouette: {best_silhouette_avg} with parameters: {best_params}")

Evaluating: eps=0.5, min_samples=3
Removed 4966 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=0.5, min_samples=5
Removed 5069 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=0.5, min_samples=7
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=0.5, min_samples=10
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=0.5, min_samples=15
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=3
Removed 4925 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=5
Removed 5064 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=7
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=10
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=15
Removed 5074 out 

In [30]:
df_results_B = pd.DataFrame(results_dbscan_respresentation_B)
df_results_B.sort_values(by='purity', ascending=False, inplace=True)
df_results_B

Unnamed: 0,min_samples,eps,silhouette_avg,davies_bouldin,calinski_harabasz,ari,purity
30,3,15.0,-0.030981,1.220836,12.644343,0.100769,0.333859
31,5,15.0,0.023677,1.497789,29.797664,0.087065,0.308435
32,7,15.0,0.094270,1.556802,56.914344,0.073944,0.296413
37,7,20.0,0.200925,1.479313,14.409107,0.000120,0.196689
38,10,20.0,0.199423,1.364804,10.839183,0.000085,0.196492
...,...,...,...,...,...,...,...
33,10,15.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000
29,15,10.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000
28,10,10.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000
27,7,10.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000


# KMEANS

In [17]:
from sklearn.metrics import adjusted_rand_score, silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Function to evaluate DBSCAN performance for a given combination of hyperparameters
def evaluate_kmeans(model, X, y):
    y_pred = model.labels_
    
    silhouette_avg = -1
    davies_bouldin = -1
    calinski_harabasz = -1
    ari = -1
    purity = -1
    
    if len(np.unique(y_pred)) > 1:
        # Silhouette Coefficient (higher is better)
        silhouette_avg = silhouette_score(X, y_pred)
        print(f"Silhouette Coefficient: {silhouette_avg:.4f}")
        
        # Davies-Bouldin Index (lower is better)
        davies_bouldin = davies_bouldin_score(X, y_pred)
        print(f"Davies-Bouldin Index: {davies_bouldin:.4f}")
        
        # Calinski-Harabasz Index (higher is better)
        calinski_harabasz = calinski_harabasz_score(X, y_pred)
        print(f"Calinski-Harabasz Index: {calinski_harabasz:.4f}")
        
        # Optionally, print ARI for comparison with true labels
        ari = adjusted_rand_score(y, y_pred)
        print(f"Adjusted Rand Index (ARI): {ari:.4f}")
        
        cm = confusion_matrix(y, y_pred)
        purity = np.sum(np.amax(cm, axis=0)) / np.sum(cm)
        print(f"Cluster Purity: {purity:.4f}")
    else:
        print("Metrics cannot be computed (only 1 cluster found)")
    
    return silhouette_avg, davies_bouldin, calinski_harabasz, ari, purity

# K-Means + Representation A

In [18]:
from sklearn.cluster import KMeans
from sklearn.model_selection import ParameterGrid

# Define parameter grid for KMeans
param_grid = {
    'n_clusters': range(2, 24, 1),  # testing different number of clusters
    'init': ['k-means++', 'random'],  # testing different initialization methods
    'max_iter': [25, 50, 100, 200, 300, 500, 1000],  # different maximum iterations
    'tol': [1e-4, 1e-3],  # different tolerance values
}

# Convert the grid into parameter combinations
grid = ParameterGrid(param_grid)

# Iterate over each parameter combination and evaluate
best_silhouette_avg = -1  # Track the best ARI score
best_params = None  # Track the best hyperparameters
results_kmeans_respresentation_A = {
    "n_clusters": [],
    "init": [],
    "max_iter": [],
    "tol": [],
    "silhouette_avg": [],
    "davies_bouldin": [],
    "calinski_harabasz": [],
    "ari": [],
    "purity": [],
    }

for params in grid:
    n_clusters = params['n_clusters']
    init = params['init']
    max_iter = params['max_iter']
    tol = params['tol']
    
    print(f"Evaluating: n_clusters={n_clusters}, init={init}, max_iter={max_iter}, tol={tol}")

    kmeans = KMeans(n_clusters=n_clusters, init=init, max_iter=max_iter, tol=tol)
    kmeans.fit_predict(X_train_flatten_pca)
    silhouette_avg, davies_bouldin, calinski_harabasz, ari, purity = evaluate_kmeans(kmeans, X_train_flatten_pca, y_train_flatten)
    
    results_kmeans_respresentation_A['n_clusters'].append(n_clusters)
    results_kmeans_respresentation_A['init'].append(init)
    results_kmeans_respresentation_A['max_iter'].append(max_iter)
    results_kmeans_respresentation_A['tol'].append(tol)
    results_kmeans_respresentation_A['silhouette_avg'].append(silhouette_avg)
    results_kmeans_respresentation_A['davies_bouldin'].append(davies_bouldin)
    results_kmeans_respresentation_A['calinski_harabasz'].append(calinski_harabasz)
    results_kmeans_respresentation_A['ari'].append(ari)
    results_kmeans_respresentation_A['purity'].append(purity)
    
    print()
    
    if silhouette_avg > best_silhouette_avg:
        best_silhouette_avg = silhouette_avg
        best_params = params

print(f"\nBest Silhouette: {best_silhouette_avg} with parameters: {best_params}")

Evaluating: n_clusters=2, init=k-means++, max_iter=25, tol=0.0001
Silhouette Coefficient: 0.2619
Davies-Bouldin Index: 1.4934
Calinski-Harabasz Index: 2070.0413
Adjusted Rand Index (ARI): 0.0131
Cluster Purity: 0.2184

Evaluating: n_clusters=2, init=k-means++, max_iter=25, tol=0.001
Silhouette Coefficient: 0.2619
Davies-Bouldin Index: 1.4936
Calinski-Harabasz Index: 2070.0349
Adjusted Rand Index (ARI): 0.0132
Cluster Purity: 0.2184

Evaluating: n_clusters=3, init=k-means++, max_iter=25, tol=0.0001
Silhouette Coefficient: 0.2078
Davies-Bouldin Index: 1.7065
Calinski-Harabasz Index: 1634.4388
Adjusted Rand Index (ARI): 0.0248
Cluster Purity: 0.2562

Evaluating: n_clusters=3, init=k-means++, max_iter=25, tol=0.001
Silhouette Coefficient: 0.2076
Davies-Bouldin Index: 1.7058
Calinski-Harabasz Index: 1634.4227
Adjusted Rand Index (ARI): 0.0249
Cluster Purity: 0.2562

Evaluating: n_clusters=4, init=k-means++, max_iter=25, tol=0.0001
Silhouette Coefficient: 0.1586
Davies-Bouldin Index: 2.1276


In [29]:
df_kmeans_results_A = pd.DataFrame(results_kmeans_respresentation_A)
df_kmeans_results_A.sort_values(by='purity', ascending=False, inplace=True)
df_kmeans_results_A

Unnamed: 0,n_clusters,init,max_iter,tol,silhouette_avg,davies_bouldin,calinski_harabasz,ari,purity
42,23,k-means++,25,0.0001,0.101953,2.161823,374.244634,0.055878,0.430824
394,23,random,50,0.0001,0.106054,2.200156,374.732096,0.056897,0.430627
128,22,k-means++,100,0.0001,0.100749,2.222972,383.562667,0.055354,0.429641
568,22,random,500,0.0001,0.102258,2.165721,389.138204,0.057638,0.429247
174,23,k-means++,200,0.0001,0.098677,2.222514,373.647268,0.057478,0.429050
...,...,...,...,...,...,...,...,...,...
440,2,random,200,0.0001,0.262297,1.491061,2070.071705,0.012782,0.217580
484,2,random,300,0.0001,0.262297,1.491061,2070.071705,0.012782,0.217580
485,2,random,300,0.0010,0.262297,1.491061,2070.071705,0.012782,0.217580
528,2,random,500,0.0001,0.262297,1.491061,2070.071705,0.012782,0.217580


# K-Means + Representation B

In [20]:
from sklearn.cluster import KMeans
from sklearn.model_selection import ParameterGrid

# Define parameter grid for KMeans
param_grid = {
    'n_clusters': range(2, 24, 1),  # testing different number of clusters
    'init': ['k-means++', 'random'],  # testing different initialization methods
    'max_iter': [25, 50, 100, 200, 300, 500, 1000],  # different maximum iterations
    'tol': [1e-4, 1e-3],  # different tolerance values
}

# Convert the grid into parameter combinations
grid = ParameterGrid(param_grid)

# Iterate over each parameter combination and evaluate
best_silhouette_avg = -1  # Track the best ARI score
best_params = None  # Track the best hyperparameters
results_kmeans_respresentation_B = {
    "n_clusters": [],
    "init": [],
    "max_iter": [],
    "tol": [],
    "silhouette_avg": [],
    "davies_bouldin": [],
    "calinski_harabasz": [],
    "ari": [],
    "purity": [],
    }

for params in grid:
    n_clusters = params['n_clusters']
    init = params['init']
    max_iter = params['max_iter']
    tol = params['tol']
    
    print(f"Evaluating: n_clusters={n_clusters}, init={init}, max_iter={max_iter}, tol={tol}")

    kmeans = KMeans(n_clusters=n_clusters, init=init, max_iter=max_iter, tol=tol)
    kmeans.fit_predict(X_train_resnet)
    silhouette_avg, davies_bouldin, calinski_harabasz, ari, purity = evaluate_kmeans(kmeans, X_train_resnet, y_train_resnet)
    
    results_kmeans_respresentation_B['n_clusters'].append(n_clusters)
    results_kmeans_respresentation_B['init'].append(init)
    results_kmeans_respresentation_B['max_iter'].append(max_iter)
    results_kmeans_respresentation_B['tol'].append(tol)
    results_kmeans_respresentation_B['silhouette_avg'].append(silhouette_avg)
    results_kmeans_respresentation_B['davies_bouldin'].append(davies_bouldin)
    results_kmeans_respresentation_B['calinski_harabasz'].append(calinski_harabasz)
    results_kmeans_respresentation_B['ari'].append(ari)
    results_kmeans_respresentation_B['purity'].append(purity)
    print()
    
    if silhouette_avg > best_silhouette_avg:
        best_silhouette_avg = silhouette_avg
        best_params = params

print(f"\nBest Silhouette: {best_silhouette_avg} with parameters: {best_params}")

Evaluating: n_clusters=2, init=k-means++, max_iter=25, tol=0.0001
Silhouette Coefficient: 0.1156
Davies-Bouldin Index: 3.0747
Calinski-Harabasz Index: 483.5906
Adjusted Rand Index (ARI): 0.2160
Cluster Purity: 0.3183

Evaluating: n_clusters=2, init=k-means++, max_iter=25, tol=0.001
Silhouette Coefficient: 0.1159
Davies-Bouldin Index: 3.0726
Calinski-Harabasz Index: 483.5920
Adjusted Rand Index (ARI): 0.2154
Cluster Purity: 0.3179

Evaluating: n_clusters=3, init=k-means++, max_iter=25, tol=0.0001
Silhouette Coefficient: 0.0642
Davies-Bouldin Index: 3.1442
Calinski-Harabasz Index: 370.4124
Adjusted Rand Index (ARI): 0.2804
Cluster Purity: 0.4028

Evaluating: n_clusters=3, init=k-means++, max_iter=25, tol=0.001
Silhouette Coefficient: 0.0607
Davies-Bouldin Index: 3.5126
Calinski-Harabasz Index: 355.6024
Adjusted Rand Index (ARI): 0.2564
Cluster Purity: 0.3855

Evaluating: n_clusters=4, init=k-means++, max_iter=25, tol=0.0001
Silhouette Coefficient: 0.0956
Davies-Bouldin Index: 2.9914
Cali

In [28]:
df_kmeans_results_B = pd.DataFrame(results_kmeans_respresentation_B)
df_kmeans_results_B.sort_values(by='purity', ascending=False, inplace=True)
df_kmeans_results_B[:20]

Unnamed: 0,n_clusters,init,max_iter,tol,silhouette_avg,davies_bouldin,calinski_harabasz,ari,purity
258,21,k-means++,500,0.0001,0.047118,3.16344,122.77612,0.307243,0.854158
123,19,k-means++,100,0.001,0.043323,3.254476,130.151052,0.328952,0.851399
126,21,k-means++,100,0.0001,0.044256,3.335295,121.426942,0.309203,0.849428
476,20,random,200,0.0001,0.043663,3.307852,125.129874,0.315505,0.84864
432,20,random,100,0.0001,0.04561,3.246292,125.939433,0.308132,0.846472
562,19,random,500,0.0001,0.04488,3.236919,131.220019,0.316601,0.846078
522,21,random,300,0.0001,0.048697,3.246206,122.277157,0.309966,0.846078
260,22,k-means++,500,0.0001,0.045142,3.272612,118.170643,0.298442,0.844698
42,23,k-means++,25,0.0001,0.041459,3.440719,112.848314,0.269866,0.844501
391,21,random,50,0.001,0.043556,3.390932,121.712619,0.297742,0.844501


In [None]:
from matplotlib import pyplot as plt

from sklearn.cluster import KMeans
import seaborn as sns

# Fit the K-Means model on training data
kmeans = KMeans(n_clusters=6, random_state=42)  # Example with 3 clusters
kmeans.fit(X_train_resnet)

# Get the cluster labels for each sample
cluster_labels = kmeans.labels_

# Create a confusion matrix comparing the true labels (y_train_resnet) and predicted cluster labels
cm = confusion_matrix(y_train_resnet, cluster_labels)

sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=np.unique(cluster_labels), yticklabels=np.unique(y_train_resnet))
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# Compute cluster purity
purity = np.sum(np.amax(cm, axis=0)) / np.sum(cm)

print(f"Cluster Purity: {purity}")