In [1]:
import warnings

import cv2
from sklearn.metrics import confusion_matrix
import kagglehub
from tqdm import tqdm
import numpy as np
# Download latest version
path = kagglehub.dataset_download("ziadhanyai/fashion-six-classes")
print("Path to dataset files:", path)

Path to dataset files: C:\Users\allex\.cache\kagglehub\datasets\ziadhanyai\fashion-six-classes\versions\1


In [2]:
import os

dataset_path = os.path.join(path, "Fashion")
class_to_int = {}
for idx, class_name in enumerate(os.listdir(dataset_path)):
    class_to_int[class_name] = idx
class_to_int

{'Accessories': 0,
 'Formal': 1,
 'Hoodie': 2,
 'Pants': 3,
 'Shoes': 4,
 'T-Shirt': 5}

In [3]:
import pandas as pd
import cv2 as cv
warnings.filterwarnings("ignore", category=UserWarning)  # Disable UserWarnings
tqdm.pandas()

df = pd.DataFrame(columns=["ImageName", "OriginalImage", "Label"])

for class_name, class_idx in class_to_int.items():
    # put images with labels into dataframe
    img_dir_class = os.path.join(dataset_path, class_name)
    for img_path in tqdm(os.listdir(img_dir_class)):
        full_img_path = os.path.join(img_dir_class, img_path)
        if img_path.endswith(".jpg") or img_path.endswith(".png") or img_path.endswith(".jpeg"):
            img = cv.imread(full_img_path)
            img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
            img = cv.resize(img, (224, 224), interpolation=cv.INTER_CUBIC)
            df.loc[len(df)] = [img_path, img, class_name]

100%|██████████| 1140/1140 [00:21<00:00, 53.59it/s]
100%|██████████| 1064/1064 [00:15<00:00, 70.92it/s]
100%|██████████| 1089/1089 [00:16<00:00, 66.47it/s]
100%|██████████| 1001/1001 [00:16<00:00, 59.05it/s]
100%|██████████| 1050/1050 [00:18<00:00, 57.91it/s]
100%|██████████| 1063/1063 [00:16<00:00, 63.38it/s]


# Image Representation A (Simple flattening of Gray Scaled Images)

In [4]:
df["FlattenedImage"] = df["OriginalImage"].progress_apply(lambda x: cv.cvtColor(x, cv.COLOR_RGB2GRAY).flatten() / 255.0)

100%|██████████| 6343/6343 [00:00<00:00, 7253.12it/s]


# Convert String labels to idx

In [5]:
df['LabelIdx'] = df['Label'].progress_apply(lambda x: class_to_int.get(x, class_to_int.get(np.nan)))

100%|██████████| 6343/6343 [00:00<00:00, 1407643.93it/s]


In [6]:
df.iloc[0]

ImageName                                               Image_1.png
OriginalImage     [[[255, 255, 255], [255, 255, 255], [255, 255,...
Label                                                   Accessories
FlattenedImage    [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...
LabelIdx                                                          0
Name: 0, dtype: object

# Image Representation B (ResNet embeddings)

In [7]:
import torch
from torchvision import models, transforms

# Load pretrained ResNet model
resnet = models.resnet18(pretrained=True)
resnet.eval()  # Set to evaluation mode
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])
# Define image transformation
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Function to extract features from image
def get_resnet_features(image_array):
    image_tensor = transform(image_array).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():  # No need to track gradients
        features = resnet(image_tensor)  # Get features from the ResNet model
    return features.squeeze().numpy()  # Remove batch dimension and convert to numpy array

# Apply ResNet feature extraction on the "Image" column
df["ResNetFeatures"] = df["OriginalImage"].progress_apply(lambda x: get_resnet_features(x))

100%|██████████| 6343/6343 [01:18<00:00, 81.31it/s]


In [8]:
import numpy as np
from sklearn.model_selection import train_test_split

X_train_flatten, X_val_flatten, y_train_flatten, y_val_flatten = train_test_split(np.array(df['FlattenedImage'].tolist()), df['LabelIdx'], test_size=0.2, random_state=42)
X_train_resnet, X_val_resnet, y_train_resnet, y_val_resnet = train_test_split(np.array(df['ResNetFeatures'].tolist()), df['LabelIdx'], test_size=0.2, random_state=42)

In [9]:
X_train_flatten.shape

(5074, 50176)

In [10]:
from sklearn.decomposition import PCA

# Initialize PCA with the number of components you want to keep.
# For example, to reduce to 50 components:
pca = PCA(n_components=50)

# Fit the PCA on the training data and transform it
X_train_flatten_pca = pca.fit_transform(X_train_flatten)

# Optionally, you can also transform the validation set with the same PCA model
X_val_flatten_pca = pca.transform(X_val_flatten)

# Check the new shape
print("New training data shape:", X_train_flatten_pca.shape)
print("New validation data shape:", X_val_flatten_pca.shape)

New training data shape: (5074, 50)
New validation data shape: (1269, 50)


# DBSCAN

In [11]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Function to evaluate DBSCAN performance for a given combination of hyperparameters
def evaluate_dbscan(model, X, y):
    y_pred = model.labels_
    
    # Ignore noise points (label -1)
    mask = y_pred != -1
    X_filtered = X[mask]
    y_filtered = y[mask]
    y_pred_filtered = y_pred[mask]
    
    nr_of_removed_labels = len(y) - len(y_filtered)
    print(f"Removed {nr_of_removed_labels} out of {len(y)} as noise (-1 label)")
    if nr_of_removed_labels > len(y) / 2: # half of labels
        print(f"HALF OF DATA IS REMOVED!")
        return -1, -1
    
    silhouette_avg = -1
    purity = -1
    
    if len(np.unique(y_pred_filtered)) > 1:
        # Silhouette Coefficient (higher is better)
        silhouette_avg = silhouette_score(X_filtered, y_pred_filtered)
        print(f"Silhouette Coefficient: {silhouette_avg:.4f}")
        
        cm = confusion_matrix(y, y_pred)
        purity = np.sum(np.amax(cm, axis=0)) / np.sum(cm)
        print(f"Cluster Purity: {purity:.4f}")
        
        print(f"Number of clusters detected: {len(np.unique(y_pred_filtered))}")
    else:
        print("Metrics cannot be computed (only 1 cluster found)")
    
    return silhouette_avg, purity

# DBSCAN + Representation A

In [23]:
from sklearn.model_selection import ParameterGrid

# Define the hyperparameter grid for tuning
param_grid = {
    'eps': [0.5, 1, 2, 5, 7, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 30, 40, 45, 50, 60, 70 ,80 ,90, 100, 125, 150, 175],  # Range for eps
    'min_samples': [3, 5, 7, 10, 15]  # Range for min_samples
}

# Convert the grid into parameter combinations
grid = ParameterGrid(param_grid)

# Iterate over each parameter combination and evaluate
best_silhouette_avg = -1  # Track the best ARI score
best_params = None  # Track the best hyperparameters
results_dbscan_respresentation_A = {
    "min_samples": [],
    "eps": [],
    "silhouette_avg": [],
    "purity": [],
}

for params in grid:
    eps = params['eps']
    min_samples = params['min_samples']
    
    print(f"Evaluating: eps={eps}, min_samples={min_samples}")

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit_predict(X_train_flatten_pca)
    silhouette_avg, purity = evaluate_dbscan(dbscan, X_train_flatten_pca, y_train_flatten)
    
    results_dbscan_respresentation_A['min_samples'].append(min_samples)
    results_dbscan_respresentation_A['eps'].append(eps)
    results_dbscan_respresentation_A['silhouette_avg'].append(silhouette_avg)
    results_dbscan_respresentation_A['purity'].append(purity)
    print()
    
    if silhouette_avg > best_silhouette_avg:
        best_silhouette_avg = silhouette_avg
        best_params = params

print(f"\nBest Silhouette: {best_silhouette_avg} with parameters: {best_params}")

Evaluating: eps=0.5, min_samples=3
Removed 4844 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=0.5, min_samples=5
Removed 5048 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=0.5, min_samples=7
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=0.5, min_samples=10
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=0.5, min_samples=15
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=3
Removed 4819 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=5
Removed 5047 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=7
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=10
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=15
Removed 5074 out 

In [25]:
df_results_A = pd.DataFrame(results_dbscan_respresentation_A)
df_results_A.sort_values(by='purity', ascending=False, inplace=True)
df_results_A[:10]

Unnamed: 0,min_samples,eps,silhouette_avg,purity
80,3,30.0,-0.2356,0.270792
85,3,40.0,-0.205407,0.200828
86,5,40.0,-0.050286,0.191959
88,10,40.0,0.186212,0.189988
87,7,40.0,0.18141,0.188412
90,3,45.0,-0.161926,0.186441
91,5,45.0,0.065727,0.182302
92,7,45.0,0.342114,0.180528
95,3,50.0,-0.053863,0.178952
96,5,50.0,0.093621,0.17836


# DBSCAN + Representation B

In [21]:
from sklearn.model_selection import ParameterGrid

# Define the hyperparameter grid for tuning
param_grid = {
    'eps': [0.5, 1, 2, 5, 7, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 30, 40],  # Range for eps
    'min_samples': [3, 5, 7, 10, 15]  # Range for min_samples
}

# Convert the grid into parameter combinations
grid = ParameterGrid(param_grid)

# Iterate over each parameter combination and evaluate
best_silhouette_avg = -1  # Track the best ARI score
best_params = None  # Track the best hyperparameters
results_dbscan_respresentation_B = {
    "min_samples": [],
    "eps": [],
    "silhouette_avg": [],
    "purity": [],
    }

for params in grid:
    eps = params['eps']
    min_samples = params['min_samples']
    
    print(f"Evaluating: eps={eps}, min_samples={min_samples}")

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit_predict(X_train_resnet)
    silhouette_avg, purity = evaluate_dbscan(dbscan, X_train_resnet, y_train_resnet)
    
    results_dbscan_respresentation_B['min_samples'].append(min_samples)
    results_dbscan_respresentation_B['eps'].append(eps)
    results_dbscan_respresentation_B['silhouette_avg'].append(silhouette_avg)
    results_dbscan_respresentation_B['purity'].append(purity)
    
    print()
    
    if silhouette_avg > best_silhouette_avg:
        best_silhouette_avg = silhouette_avg
        best_params = params

print(f"\nBest Silhouette: {best_silhouette_avg} with parameters: {best_params}")

Evaluating: eps=0.5, min_samples=3
Removed 4966 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=0.5, min_samples=5
Removed 5069 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=0.5, min_samples=7
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=0.5, min_samples=10
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=0.5, min_samples=15
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=3
Removed 4925 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=5
Removed 5064 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=7
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=10
Removed 5074 out of 5074 as noise (-1 label)
HALF OF DATA IS REMOVED!

Evaluating: eps=1, min_samples=15
Removed 5074 out 

In [22]:
df_results_B = pd.DataFrame(results_dbscan_respresentation_B)
df_results_B.sort_values(by='purity', ascending=False, inplace=True)
df_results_B[:10]

Unnamed: 0,min_samples,eps,silhouette_avg,purity
45,3,15.0,-0.030981,0.333859
46,5,15.0,0.023677,0.308435
47,7,15.0,0.09427,0.296413
54,15,16.0,0.155289,0.264091
50,3,16.0,-0.094427,0.25877
51,5,16.0,0.037005,0.24931
52,7,16.0,0.041877,0.247536
55,3,17.0,-0.074581,0.243989
56,5,17.0,0.129115,0.233149
60,3,18.0,-0.041344,0.23039


# KMEANS

In [16]:
from sklearn.metrics import silhouette_score

# Function to evaluate DBSCAN performance for a given combination of hyperparameters
def evaluate_kmeans(model, X, y):
    y_pred = model.labels_
    
    silhouette_avg = -1
    purity = -1
    
    if len(np.unique(y_pred)) > 1:
        # Silhouette Coefficient (higher is better)
        silhouette_avg = silhouette_score(X, y_pred)
        print(f"Silhouette Coefficient: {silhouette_avg:.4f}")
        
        cm = confusion_matrix(y, y_pred)
        purity = np.sum(np.amax(cm, axis=0)) / np.sum(cm)
        print(f"Cluster Purity: {purity:.4f}")
    else:
        print("Metrics cannot be computed (only 1 cluster found)")
    
    return silhouette_avg, purity

# K-Means + Representation A

In [17]:
from sklearn.cluster import KMeans
from sklearn.model_selection import ParameterGrid

# Define parameter grid for KMeans
param_grid = {
    'n_clusters': range(2, 24, 1),  # testing different number of clusters
    'init': ['k-means++', 'random'],  # testing different initialization methods
    'max_iter': [25, 50, 100, 200, 300, 500, 1000],  # different maximum iterations
    'tol': [1e-4, 1e-3],  # different tolerance values
}

# Convert the grid into parameter combinations
grid = ParameterGrid(param_grid)

# Iterate over each parameter combination and evaluate
best_silhouette_avg = -1  # Track the best ARI score
best_params = None  # Track the best hyperparameters
results_kmeans_respresentation_A = {
    "n_clusters": [],
    "init": [],
    "max_iter": [],
    "tol": [],
    "silhouette_avg": [],
    "purity": [],
    }

for params in grid:
    n_clusters = params['n_clusters']
    init = params['init']
    max_iter = params['max_iter']
    tol = params['tol']
    
    print(f"Evaluating: n_clusters={n_clusters}, init={init}, max_iter={max_iter}, tol={tol}")

    kmeans = KMeans(n_clusters=n_clusters, init=init, max_iter=max_iter, tol=tol)
    kmeans.fit_predict(X_train_flatten_pca)
    silhouette_avg, purity = evaluate_kmeans(kmeans, X_train_flatten_pca, y_train_flatten)
    
    results_kmeans_respresentation_A['n_clusters'].append(n_clusters)
    results_kmeans_respresentation_A['init'].append(init)
    results_kmeans_respresentation_A['max_iter'].append(max_iter)
    results_kmeans_respresentation_A['tol'].append(tol)
    results_kmeans_respresentation_A['silhouette_avg'].append(silhouette_avg)
    results_kmeans_respresentation_A['purity'].append(purity)
    
    print()
    
    if silhouette_avg > best_silhouette_avg:
        best_silhouette_avg = silhouette_avg
        best_params = params

print(f"\nBest Silhouette: {best_silhouette_avg} with parameters: {best_params}")

Evaluating: n_clusters=2, init=k-means++, max_iter=25, tol=0.0001
Silhouette Coefficient: 0.2623
Cluster Purity: 0.2176

Evaluating: n_clusters=2, init=k-means++, max_iter=25, tol=0.001
Silhouette Coefficient: 0.2619
Cluster Purity: 0.2184

Evaluating: n_clusters=3, init=k-means++, max_iter=25, tol=0.0001
Silhouette Coefficient: 0.2077
Cluster Purity: 0.2560

Evaluating: n_clusters=3, init=k-means++, max_iter=25, tol=0.001
Silhouette Coefficient: 0.2079
Cluster Purity: 0.2564

Evaluating: n_clusters=4, init=k-means++, max_iter=25, tol=0.0001
Silhouette Coefficient: 0.1603
Cluster Purity: 0.2266

Evaluating: n_clusters=4, init=k-means++, max_iter=25, tol=0.001
Silhouette Coefficient: 0.1563
Cluster Purity: 0.2627

Evaluating: n_clusters=5, init=k-means++, max_iter=25, tol=0.0001
Silhouette Coefficient: 0.1266
Cluster Purity: 0.2803

Evaluating: n_clusters=5, init=k-means++, max_iter=25, tol=0.001
Silhouette Coefficient: 0.1560
Cluster Purity: 0.2755

Evaluating: n_clusters=6, init=k-mea

In [18]:
df_kmeans_results_A = pd.DataFrame(results_kmeans_respresentation_A)
df_kmeans_results_A.sort_values(by='purity', ascending=False, inplace=True)
df_kmeans_results_A[:10]

Unnamed: 0,n_clusters,init,max_iter,tol,silhouette_avg,purity
218,23,k-means++,300,0.0001,0.095457,0.43516
612,22,random,1000,0.0001,0.108735,0.434371
219,23,k-means++,300,0.001,0.094267,0.431021
482,23,random,200,0.0001,0.104799,0.430824
437,22,random,100,0.001,0.100449,0.427473
42,23,k-means++,25,0.0001,0.097179,0.426882
394,23,random,50,0.0001,0.094701,0.426882
566,21,random,500,0.0001,0.104975,0.426882
87,23,k-means++,50,0.001,0.093356,0.426488
217,22,k-means++,300,0.001,0.097981,0.426094


# K-Means + Representation B

In [19]:
from sklearn.cluster import KMeans
from sklearn.model_selection import ParameterGrid

# Define parameter grid for KMeans
param_grid = {
    'n_clusters': range(2, 24, 1),  # testing different number of clusters
    'init': ['k-means++', 'random'],  # testing different initialization methods
    'max_iter': [25, 50, 100, 200, 300, 500, 1000],  # different maximum iterations
    'tol': [1e-4, 1e-3],  # different tolerance values
}

# Convert the grid into parameter combinations
grid = ParameterGrid(param_grid)

# Iterate over each parameter combination and evaluate
best_silhouette_avg = -1  # Track the best ARI score
best_params = None  # Track the best hyperparameters
results_kmeans_respresentation_B = {
    "n_clusters": [],
    "init": [],
    "max_iter": [],
    "tol": [],
    "silhouette_avg": [],
    "purity": [],
    }

for params in grid:
    n_clusters = params['n_clusters']
    init = params['init']
    max_iter = params['max_iter']
    tol = params['tol']
    
    print(f"Evaluating: n_clusters={n_clusters}, init={init}, max_iter={max_iter}, tol={tol}")

    kmeans = KMeans(n_clusters=n_clusters, init=init, max_iter=max_iter, tol=tol)
    kmeans.fit_predict(X_train_resnet)
    silhouette_avg, purity = evaluate_kmeans(kmeans, X_train_resnet, y_train_resnet)
    
    results_kmeans_respresentation_B['n_clusters'].append(n_clusters)
    results_kmeans_respresentation_B['init'].append(init)
    results_kmeans_respresentation_B['max_iter'].append(max_iter)
    results_kmeans_respresentation_B['tol'].append(tol)
    results_kmeans_respresentation_B['silhouette_avg'].append(silhouette_avg)
    results_kmeans_respresentation_B['purity'].append(purity)
    print()
    
    if silhouette_avg > best_silhouette_avg:
        best_silhouette_avg = silhouette_avg
        best_params = params

print(f"\nBest Silhouette: {best_silhouette_avg} with parameters: {best_params}")

Evaluating: n_clusters=2, init=k-means++, max_iter=25, tol=0.0001
Silhouette Coefficient: 0.1159
Cluster Purity: 0.3179

Evaluating: n_clusters=2, init=k-means++, max_iter=25, tol=0.001
Silhouette Coefficient: 0.1156
Cluster Purity: 0.3183

Evaluating: n_clusters=3, init=k-means++, max_iter=25, tol=0.0001
Silhouette Coefficient: 0.1055
Cluster Purity: 0.4428

Evaluating: n_clusters=3, init=k-means++, max_iter=25, tol=0.001
Silhouette Coefficient: 0.0650
Cluster Purity: 0.4011

Evaluating: n_clusters=4, init=k-means++, max_iter=25, tol=0.0001
Silhouette Coefficient: 0.0787
Cluster Purity: 0.5203

Evaluating: n_clusters=4, init=k-means++, max_iter=25, tol=0.001
Silhouette Coefficient: 0.0951
Cluster Purity: 0.5286

Evaluating: n_clusters=5, init=k-means++, max_iter=25, tol=0.0001
Silhouette Coefficient: 0.0796
Cluster Purity: 0.5877

Evaluating: n_clusters=5, init=k-means++, max_iter=25, tol=0.001
Silhouette Coefficient: 0.0796
Cluster Purity: 0.5877

Evaluating: n_clusters=6, init=k-mea

In [27]:
df_kmeans_results_B = pd.DataFrame(results_kmeans_respresentation_B)
df_kmeans_results_B.sort_values(by='purity', ascending=False, inplace=True)
df_kmeans_results_B[:10]

Unnamed: 0,n_clusters,init,max_iter,tol,silhouette_avg,purity
79,19,k-means++,50,0.001,0.044988,0.849626
86,23,k-means++,50,0.0001,0.04601,0.84864
344,20,random,25,0.0001,0.045894,0.847261
521,20,random,300,0.001,0.048387,0.846669
299,19,k-means++,1000,0.001,0.046628,0.846275
389,20,random,50,0.001,0.045995,0.845684
257,20,k-means++,500,0.001,0.046335,0.845487
520,20,random,300,0.0001,0.051625,0.844501
606,19,random,1000,0.0001,0.045897,0.843319
610,21,random,1000,0.0001,0.045745,0.842925


# SVM + Representation B

In [15]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold

svm = SVC(verbose=1, probability=True)

param_grid = {
    'C': [0.1, 0.5, 1, 2, 3, 5, 7, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

# GridSearchCV to find the best parameters
grid_search = GridSearchCV(svm, param_grid, cv=4, n_jobs=4, scoring="f1_weighted", verbose=1)
grid_search.fit(X_train_resnet, y_train_resnet)

# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Fitting 4 folds for each of 32 candidates, totalling 128 fits
[LibSVM]Best parameters: {'C': 7, 'kernel': 'rbf'}
Best cross-validation score: 0.9415768195580522
