In [1]:
"""
This file generates tables that show how the upper bound can enable early sopping of ASW based optimisation loops.

Notes: 
    - The unlabeled datasets are available at https://archive.ics.uci.edu/
    - The labeled datasets are available at https://github.com/deric/clustering-benchmark/tree/master/src/main/resources/datasets/real-world 
"""

'\nThis file generates tables that show how the upper bound can enable early sopping of ASW based optimisation loops.\n\nNotes: \n    - The unlabeled datasets are available at https://archive.ics.uci.edu/\n    - The labeled datasets are available at https://github.com/deric/clustering-benchmark/tree/master/src/main/resources/datasets/real-world \n'

In [2]:
import os 
import numpy as np
import utils 

In [3]:
logger = utils.get_logger(__name__)

In [4]:
def table_row(data: np.ndarray, dataset: str, k_range: range = range(2, 16), epsilon: float = 0.15, metric: str = "euclidean"):

    logger.info(f"\nDistance metric: {metric}")

    n = data.shape[0]

    if n <= 300:
        k_range = range(2, 31)
        epsilon = 0.35

    ub_dict = utils.get_upper_bound(data=data, metric=metric)

    dissimilarity_matrix = utils.data_to_distance_matrix(data=data, metric=metric)

    kmeans_dict = utils.asw_optimization(algorithm=utils.algorithm_kmeans,
                        data=data,
                        k_range=k_range,
                        asw_metric=metric,
                        ub_reference=ub_dict["ub"],
                        epsilon=epsilon
                    )

    kmedoids_dict = utils.asw_optimization(algorithm=utils.algorithm_kmedoids,
                                             data=dissimilarity_matrix,
                                             k_range=k_range,
                                             asw_metric="precomputed",
                                             ub_reference=ub_dict["ub"],
                                             epsilon=epsilon
                                             )
    
    kmeans_wcre = (ub_dict["ub"] - kmeans_dict["best_score"]) / ub_dict["ub"]
    kmedoids_wcre = (ub_dict["ub"] - kmedoids_dict["best_score"]) / ub_dict["ub"]

    return [
        dataset,
        str(k_range),
        str(epsilon),
        f"${kmeans_dict['best_score']:.3f}$",
        f"${kmedoids_dict['best_score']:.3f}$",
        ub_dict["ub"],
        f"{len(utils.Counter(kmeans_dict['best_labels']))}",
        f"{len(utils.Counter(kmedoids_dict['best_labels']))}",
        kmeans_wcre,
        kmedoids_wcre,
        str(kmeans_dict["stopped_early"]),
        str(kmedoids_dict["stopped_early"]),
    ]

In [5]:
def table(dataset_list: list, data_type: str = "unlabeled"):
    """
    Print table in terminal.
    """

    headers = [
        "Dataset",
        "K cand.",
        "epsilon",
        "Best ASW Kmeans",
        "Best ASW Kmedoids",
        "UB",
        "Best K Kmeans",
        "Best K Kmedoids",
        "Worst case relative error Kmeans",
        "Worst case relative error Kmedoids",
        "Early stop Kmeans",
        "Early stop Kmeans",
    ]

    lines = []

    # Format header
    header_line = "| " + " | ".join(headers) + " |"
    lines.append(header_line)
    separator = "| " + " | ".join(["---"] * len(headers)) + " |"
    lines.append(separator)

    for dataset in dataset_list:

        if data_type == "unlabeled":
            if dataset == "conference_papers":
                data = utils.load_unlabeled_data(dataset=dataset, transpose=True)
            else:
                data = utils.load_unlabeled_data(dataset=dataset)
            row = table_row(data=data, dataset=dataset)

        elif data_type == "labeled":
            data = dataset["X"]
            row = table_row(data=data, dataset=dataset["name"])

        lines.append(
            " & ".join(
                f"${cell:.3f}$" if type(cell) is not str else f"{cell}" for cell in row
            )
            + " \\\ "
        )

    # Print table to terminal
    print("\nTABLE\n")
    for line in lines:
        print(line)

In [6]:
# -------------------------------------------------
# Unlabeled Datasets
# -------------------------------------------------
dataset_list = [
        "religious_texts",
        "ceramic", 
        "conference_papers", 
        "rna", 
    ]

table(dataset_list=dataset_list)

2025-09-03 15:39:20 | utils | INFO | ==== Running dataset: religious_texts ====

2025-09-03 15:39:20 | utils | INFO | Data shape: (590, 8266)
2025-09-03 15:39:20 | utils | INFO | Data shape (zeros removed): (589, 8266)
2025-09-03 15:39:20 | __main__ | INFO | 
Distance metric: euclidean
2025-09-03 15:39:20 | utils | INFO | Computing upper bound
2025-09-03 15:39:20 | utils | INFO | UB: 0.8463322667605195
2025-09-03 15:39:21 | utils | INFO | Optimizing ASW
100%|██████████| 14/14 [00:02<00:00,  4.82it/s]
2025-09-03 15:39:23 | utils | INFO | Optimizing ASW
  0%|          | 0/14 [00:00<?, ?it/s]2025-09-03 15:39:26 | utils | INFO | Stopping early!
  0%|          | 0/14 [00:02<?, ?it/s]
2025-09-03 15:39:26 | utils | INFO | ==== Running dataset: ceramic ====

2025-09-03 15:39:26 | utils | INFO | Data shape: (88, 17)
2025-09-03 15:39:26 | utils | INFO | Data shape (zeros removed): (88, 17)
2025-09-03 15:39:26 | __main__ | INFO | 
Distance metric: euclidean
2025-09-03 15:39:26 | utils | INFO | Co


TABLE

| Dataset | K cand. | epsilon | Best ASW Kmeans | Best ASW Kmedoids | UB | Best K Kmeans | Best K Kmedoids | Worst case relative error Kmeans | Worst case relative error Kmedoids | Early stop Kmeans | Early stop Kmeans |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
religious_texts & range(2, 16) & 0.15 & $0.428$ & $0.846$ & $0.846$ & 3 & 2 & $0.495$ & $0.000$ & False & True \\ 
ceramic & range(2, 31) & 0.35 & $0.584$ & $0.584$ & $0.855$ & 2 & 2 & $0.317$ & $0.317$ & True & True \\ 
conference_papers & range(2, 16) & 0.15 & $0.067$ & $0.384$ & $0.439$ & 2 & 2 & $0.847$ & $0.126$ & False & True \\ 
rna & range(2, 16) & 0.15 & $0.228$ & $0.230$ & $0.419$ & 6 & 8 & $0.457$ & $0.452$ & False & False \\ 





In [7]:
# -------------------------------------------------
# Labeled Datasets
# -------------------------------------------------
dataset_dir = "data/labeled/real_world"

datasets = []
for fname in os.listdir(dataset_dir):
    if fname.endswith(".arff"):
        try:
            path = os.path.join(dataset_dir, fname)
            _, X, _ = utils.load_arff_as_distance_matrix(path, scale=True)

            datasets.append({
                "name": fname.replace(".arff", ""),
                "X": X,
            })
        except:
            continue

dataset_list = datasets[:20]  # pick a subset

logger.info("Datasets processed!")

2025-09-03 16:01:39 | __main__ | INFO | Datasets processed!


In [8]:
table(dataset_list=dataset_list, data_type="labeled")

2025-09-03 16:01:39 | __main__ | INFO | 
Distance metric: euclidean
2025-09-03 16:01:39 | utils | INFO | Computing upper bound
2025-09-03 16:01:39 | utils | INFO | UB: 0.7079522637070917
2025-09-03 16:01:39 | utils | INFO | Optimizing ASW
100%|██████████| 14/14 [00:00<00:00, 59.19it/s]
2025-09-03 16:01:39 | utils | INFO | Optimizing ASW
  0%|          | 0/14 [00:00<?, ?it/s]2025-09-03 16:01:42 | utils | INFO | Stopping early!
  0%|          | 0/14 [00:02<?, ?it/s]
2025-09-03 16:01:42 | __main__ | INFO | 
Distance metric: euclidean
2025-09-03 16:01:42 | utils | INFO | Computing upper bound
2025-09-03 16:01:42 | utils | INFO | UB: 0.8538590675108088
2025-09-03 16:01:42 | utils | INFO | Optimizing ASW
100%|██████████| 14/14 [00:00<00:00, 59.51it/s]
2025-09-03 16:01:42 | utils | INFO | Optimizing ASW
  0%|          | 0/14 [00:00<?, ?it/s]2025-09-03 16:01:43 | utils | INFO | Stopping early!
  0%|          | 0/14 [00:00<?, ?it/s]
2025-09-03 16:01:43 | __main__ | INFO | 
Distance metric: eucl


TABLE

| Dataset | K cand. | epsilon | Best ASW Kmeans | Best ASW Kmedoids | UB | Best K Kmeans | Best K Kmedoids | Worst case relative error Kmeans | Worst case relative error Kmedoids | Early stop Kmeans | Early stop Kmeans |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
wdbc & range(2, 16) & 0.15 & $0.345$ & $0.661$ & $0.708$ & 2 & 2 & $0.513$ & $0.067$ & False & True \\ 
ecoli & range(2, 16) & 0.15 & $0.398$ & $0.836$ & $0.854$ & 4 & 2 & $0.534$ & $0.021$ & False & True \\ 
wine & range(2, 31) & 0.35 & $0.285$ & $0.295$ & $0.626$ & 3 & 5 & $0.545$ & $0.529$ & False & False \\ 
wisc & range(2, 16) & 0.15 & $0.574$ & $0.574$ & $0.844$ & 2 & 2 & $0.320$ & $0.320$ & False & False \\ 
iono & range(2, 16) & 0.15 & $0.293$ & $0.413$ & $0.691$ & 6 & 2 & $0.575$ & $0.402$ & False & False \\ 
zoo & range(2, 31) & 0.35 & $0.547$ & $0.547$ & $0.839$ & 30 & 22 & $0.348$ & $0.347$ & True & True \\ 
iris & range(2, 31) & 0.35 & $0.580$ & $0.580$ & $0.878$ & 2 & 2 & $0


