In [1]:
import warnings

import kagglehub
from tqdm import tqdm

# Download latest version
path = kagglehub.dataset_download("ziadhanyai/fashion-six-classes")
print("Path to dataset files:", path)

Path to dataset files: C:\Users\allex\.cache\kagglehub\datasets\ziadhanyai\fashion-six-classes\versions\1


In [2]:
import os

dataset_path = os.path.join(path, "Fashion")
class_to_int = {}
for idx, class_name in enumerate(os.listdir(dataset_path)):
    class_to_int[class_name] = idx
class_to_int

{'Accessories': 0,
 'Formal': 1,
 'Hoodie': 2,
 'Pants': 3,
 'Shoes': 4,
 'T-Shirt': 5}

In [3]:
import pandas as pd
import cv2 as cv
warnings.filterwarnings("ignore", category=UserWarning)  # Disable UserWarnings
tqdm.pandas()

df = pd.DataFrame(columns=["ImageName", "OriginalImage", "Label"])

for class_name, class_idx in class_to_int.items():
    # put images with labels into dataframe
    img_dir_class = os.path.join(dataset_path, class_name)
    for img_path in tqdm(os.listdir(img_dir_class)):
        full_img_path = os.path.join(img_dir_class, img_path)
        if img_path.endswith(".jpg") or img_path.endswith(".png") or img_path.endswith(".jpeg"):
            img = cv.imread(full_img_path)
            img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
            img = cv.resize(img, (224, 224), interpolation=cv.INTER_CUBIC)
            df.loc[len(df)] = [img_path, img, class_name]

100%|██████████| 1140/1140 [00:21<00:00, 53.93it/s]
100%|██████████| 1064/1064 [00:15<00:00, 70.83it/s]
100%|██████████| 1089/1089 [00:16<00:00, 66.29it/s]
100%|██████████| 1001/1001 [00:17<00:00, 57.49it/s]
100%|██████████| 1050/1050 [00:18<00:00, 57.68it/s]
100%|██████████| 1063/1063 [00:16<00:00, 62.60it/s]


In [4]:
df.iloc[0]

ImageName                                              Image_1.png
OriginalImage    [[[255, 255, 255], [255, 255, 255], [255, 255,...
Label                                                  Accessories
Name: 0, dtype: object

# Image Representation A (Simple flattening of Gray Scaled Images)

In [5]:
df["FlattenedImage"] = df["OriginalImage"].progress_apply(lambda x: cv.cvtColor(x, cv.COLOR_RGB2GRAY).flatten() / 255.0)

100%|██████████| 6343/6343 [00:01<00:00, 6224.77it/s]


# Image Representation B (ResNet embeddings)

In [6]:
import torch
from torchvision import models, transforms

# Load pretrained ResNet model
resnet = models.resnet18(pretrained=True)
resnet.eval()  # Set to evaluation mode
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])
# Define image transformation
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Function to extract features from image
def get_resnet_features(image_array):
    image_tensor = transform(image_array).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():  # No need to track gradients
        features = resnet(image_tensor)  # Get features from the ResNet model
    return features.squeeze().numpy()  # Remove batch dimension and convert to numpy array

# Apply ResNet feature extraction on the "Image" column
df["ResNetFeatures"] = df["OriginalImage"].progress_apply(lambda x: get_resnet_features(x))

100%|██████████| 6343/6343 [01:17<00:00, 81.37it/s]


In [7]:
import numpy as np
from sklearn.model_selection import train_test_split

X_train_flatten, X_val_flatten, y_train_flatten, y_val_flatten = train_test_split(np.array(df['FlattenedImage'].tolist()), df['Label'], test_size=0.2, random_state=42)
X_train_resnet, X_val_resnet, y_train_resnet, y_val_resnet = train_test_split(np.array(df['ResNetFeatures'].tolist()), df['Label'], test_size=0.2, random_state=42)

# DBSCAN

In [46]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import adjusted_rand_score, silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Function to evaluate DBSCAN performance for a given combination of hyperparameters
def evaluate_dbscan(model, X, y):
    y_pred = model.labels_
    
    # Ignore noise points (label -1)
    mask = y_pred != -1
    X_filtered = X[mask]
    y_filtered = y[mask]
    y_pred_filtered = y_pred[mask]
    
    nr_of_removed_labels = len(y) - len(y_filtered)
    print(f"Removed {nr_of_removed_labels} out of {len(y)} as noise (-1 label)")
    if nr_of_removed_labels > len(y) / 2: # half of labels
        print(f"HALF OF DATA IS REMOVED!")
        return -1
    silhouette_avg = None
    if len(np.unique(y_pred_filtered)) > 1:
        # Silhouette Coefficient (higher is better)
        silhouette_avg = silhouette_score(X_filtered, y_pred_filtered)
        print(f"Silhouette Coefficient: {silhouette_avg:.4f}")
        
        # Davies-Bouldin Index (lower is better)
        davies_bouldin = davies_bouldin_score(X_filtered, y_pred_filtered)
        print(f"Davies-Bouldin Index: {davies_bouldin:.4f}")
        
        # Calinski-Harabasz Index (higher is better)
        calinski_harabasz = calinski_harabasz_score(X_filtered, y_pred_filtered)
        print(f"Calinski-Harabasz Index: {calinski_harabasz:.4f}")
        
        # Optionally, print ARI for comparison with true labels
        ari = adjusted_rand_score(y_filtered, y_pred_filtered)
        print(f"Adjusted Rand Index (ARI): {ari:.4f}")
        
        print(f"Number of clusters detected: {len(np.unique(y_pred_filtered))}")
    else:
        print("Metrics cannot be computed (only 1 cluster found)")
    
    if silhouette_avg is not None:
        return silhouette_avg
    else:
        return -1

# DBSCAN + Representation A

In [47]:
from sklearn.model_selection import ParameterGrid

# Define the hyperparameter grid for tuning
param_grid = {
    'eps': [30, 35, 40, 45, 50, 60, 70 ,80 ,90, 100, 125, 150, 175],  # Range for eps
    'min_samples': [3, 5, 7, 10, 15]  # Range for min_samples
}

# Convert the grid into parameter combinations
grid = ParameterGrid(param_grid)

# Iterate over each parameter combination and evaluate
best_silhouette_avg = -1  # Track the best ARI score
best_params = None  # Track the best hyperparameters

for params in grid:
    eps = params['eps']
    min_samples = params['min_samples']
    
    print(f"Evaluating: eps={eps}, min_samples={min_samples}")
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit_predict(X_train_flatten)
    silhouette_avg = evaluate_dbscan(dbscan, X_train_flatten, y_train_flatten)
    print()
    
    if silhouette_avg > best_silhouette_avg:
        best_silhouette_avg = silhouette_avg
        best_params = params

print(f"\nBest ARI: {best_silhouette_avg} with parameters: {best_params}")

Evaluating: eps=30, min_samples=3
Removed 4333 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=30, min_samples=5
Removed 4646 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=30, min_samples=7
Removed 4721 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=30, min_samples=10
Removed 4731 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=30, min_samples=15
Removed 4742 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=35, min_samples=3
Removed 4059 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=35, min_samples=5
Removed 4364 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=35, min_samples=7
Removed 4442 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=35, min_samples=10
Removed 4478 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=35, min_samples=15
Removed 4514 out 

# DBSCAN + Representation B

In [49]:
from sklearn.model_selection import ParameterGrid

# Define the hyperparameter grid for tuning
param_grid = {
    'eps': [0.5, 1, 2, 3, 4, 5, 7, 10, 12.5, 15, 20, 30],  # Range for eps
    'min_samples': [3, 5, 7, 10, 15]  # Range for min_samples
}

# Convert the grid into parameter combinations
grid = ParameterGrid(param_grid)

# Iterate over each parameter combination and evaluate
best_silhouette_avg = -1  # Track the best ARI score
best_params = None  # Track the best hyperparameters

for params in grid:
    eps = params['eps']
    min_samples = params['min_samples']
    
    print(f"Evaluating: eps={eps}, min_samples={min_samples}")
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit_predict(X_train_resnet)
    silhouette_avg = evaluate_dbscan(dbscan, X_train_resnet, y_train_resnet)
    print()
    
    if silhouette_avg > best_silhouette_avg:
        best_silhouette_avg = silhouette_avg
        best_params = params

print(f"\nBest ARI: {best_silhouette_avg} with parameters: {best_params}")

Evaluating: eps=0.5, min_samples=3
Removed 4966 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=0.5, min_samples=5
Removed 5069 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=0.5, min_samples=7
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=0.5, min_samples=10
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=0.5, min_samples=15
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=3
Removed 4925 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=5
Removed 5064 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=7
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=10
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=15
Removed 5074 out 

# K-Means

In [54]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=6, random_state=42)
kmeans.fit(X_train_flatten)
y_pred = kmeans.predict(X_val_flatten)
print(y_pred)

[5 3 0 ... 4 3 2]


In [55]:
set(y_pred)

{0, 1, 2, 3, 4, 5}

# K-Means + Representation A

In [63]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for KMeans
param_grid = {
    'n_clusters': range(2, 11, 1),  # testing different number of clusters
    'init': ['k-means++', 'random'],  # testing different initialization methods
    'max_iter': [300, 500, 1000],  # different maximum iterations
    'tol': [1e-4, 1e-3],  # different tolerance values
}

# Initialize KMeans
kmeans = KMeans(random_state=42)

# GridSearchCV - with kmeans as the estimator and param_grid as hyperparameters to tune
grid_search = GridSearchCV(estimator=kmeans, param_grid=param_grid, cv=2, verbose=1, n_jobs=4)

# Perform grid search on training data
grid_search.fit(X_train_resnet)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Use the best model to predict on validation data
y_pred = best_model.predict(X_val_resnet)

# Print the best parameters found
print("Best parameters found:", best_params)

Fitting 2 folds for each of 108 candidates, totalling 216 fits
Best parameters found: {'init': 'random', 'max_iter': 300, 'n_clusters': 10, 'tol': 0.001}


In [None]:
param_grid = {
    'n_clusters': np.arange(2, 11),  # testing different number of clusters
    'init': ['k-means++', 'random'],  # testing different initialization methods
    'max_iter': [300, 500, 1000],  # different maximum iterations
    'tol': [1e-4, 1e-3],  # different tolerance values
}

grid = ParameterGrid(param_grid)

best_silhouette_avg = -1
best_params = None

for params in grid:
    n_clusters = params['n_clusters']
    
    print(f"Evaluating: n_clusters={n_clusters}")
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(X_train_flatten)
    cluster_labels = kmeans.predict(X_train_flatten)
    print()

# K-Means + Representation B