# Globals

In [None]:
import copy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wandb

In [None]:
# Dataset identifiers iterated over by the validation-partition analyses below.
VALIDATION_DATASETS = ["imagenet", "imagenette", "imagewoof"]
# Model identifiers grouped under the ResNet-50 family; matched against the
# "model" column of the runs table.
RESNET50_MODELS = [
    "resnet50",
    "mocov3_resnet50",
    "vicreg_resnet50",
    "dino_resnet50",
    "clip_RN50",
]
# Model identifiers grouped under the ViT-B/16 family.
VITB16_MODELS = [
    "vitb16",
    "mocov3_vit_base",
    "timm_vit_base_patch16_224.mae",
    "dino_vitb16",
    "clip_vitb16",
]
# Clustering algorithm names; each one is also a key of DEFAULT_PARAMS.
CLUSTERERS = [
    "KMeans",
    "AgglomerativeClustering",
    "AffinityPropagation",
    "SpectralClustering",
    "HDBSCAN",
    "OPTICS",
]
# Independent copy of the full clusterer list (not an alias of CLUSTERERS).
ALL_CLUSTERERS = copy.deepcopy(CLUSTERERS)
# Distance metric names swept over in the HDBSCAN metric comparison.
DISTANCE_METRICS = [
    "euclidean",
    "l1",
    "chebyshev",
    "cosine",
    "arccos",
    "braycurtis",
    "canberra",
]

In [None]:
# Matplotlib line-style string used for each dataset when plotting.
DATASET2LS = {
    "imagenet": "-.",
    "imagenette": "--",
    "imagewoof": ":",
}

In [None]:
# Default run-config values used as filters when selecting runs.
# "all" holds defaults shared by every clusterer; the remaining keys hold the
# clusterer-specific defaults. The two dicts are merged as
# dict(DEFAULT_PARAMS["all"], **DEFAULT_PARAMS[clusterer]) in the cells below.
DEFAULT_PARAMS = {
    "all": {
        # The string "None" (not the None object) is matched by select_rows
        # against both missing entries and the literal string "None".
        "dim_reducer": "None",
        "dim_reducer_man": "None",
        "zscore": False,
        "normalize": False,
        "zscore2": False,
        "ndim_correction": False,
    },
    "KMeans": {"clusterer": "KMeans"},
    "AffinityPropagation": {
        "clusterer": "AffinityPropagation",
        "affinity_damping": 0.5,
        "affinity_conv_iter": 15,
    },
    "SpectralClustering": {
        "clusterer": "SpectralClustering",
        "spectral_assigner": "kmeans",
    },
    "AgglomerativeClustering": {
        "clusterer": "AgglomerativeClustering",
        "distance_metric": "euclidean",
        "aggclust_linkage": "ward",
    },
    "HDBSCAN": {
        "clusterer": "HDBSCAN",
        "hdbscan_method": "eom",
        "min_samples": 5,
        "max_samples": 0.2,
        "distance_metric": "euclidean",
    },
    "OPTICS": {
        "clusterer": "OPTICS",
        "optics_method": "xi",
        "optics_xi": 0.05,
        "distance_metric": "euclidean",
    },
}

### Utility functions

In [None]:
import matplotlib.colors

def categorical_cmap(nc, nsc, cmap="tab10", continuous=False):
    """
    Build a listed colormap of ``nc`` base colours with ``nsc`` shades each.

    The base colours are taken from ``cmap``: by integer index when
    ``continuous`` is false, or sampled evenly over [0, 1] when it is true.
    For each base colour, ``nsc`` shades are generated in HSV space by
    ramping saturation linearly towards 0.25 and value linearly towards 1.
    Entry ``i * nsc + j`` of the result is shade ``j`` of base colour ``i``.

    Raises ``ValueError`` if ``nc`` exceeds the number of colours in ``cmap``.

    https://stackoverflow.com/a/47232942/1960959
    """
    base = plt.get_cmap(cmap)
    if nc > base.N:
        raise ValueError("Too many categories for colormap.")
    samples = np.linspace(0, 1, nc) if continuous else np.arange(nc, dtype=int)
    shades = np.zeros((nc * nsc, 3))
    for idx, rgba in enumerate(base(samples)):
        # Drop the alpha channel before converting to HSV.
        hue, sat, val = matplotlib.colors.rgb_to_hsv(rgba[:3])
        hsv_ramp = np.empty((nsc, 3))
        hsv_ramp[:, 0] = hue
        hsv_ramp[:, 1] = np.linspace(sat, 0.25, nsc)
        hsv_ramp[:, 2] = np.linspace(val, 1, nsc)
        start = idx * nsc
        shades[start : start + nsc, :] = matplotlib.colors.hsv_to_rgb(hsv_ramp)
    return matplotlib.colors.ListedColormap(shades)

In [None]:
categorical_cmap(len(RESNET50_MODELS), len(VALIDATION_DATASETS))

In [None]:
def select_rows(df, filters, allow_missing=True):
    """
    Return the rows of ``df`` which satisfy every entry of ``filters``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to select rows from.
    filters : dict
        Maps a column name to the value it must hold. The keys ``"dataset"``
        and ``"clusterer"`` are aliases for the columns ``"dataset_name"``
        and ``"clusterer_name"``. A value of ``None`` or ``"None"`` matches
        rows where the column is missing or holds the string ``"None"``.
        Any other value matches rows equal to the value or to ``str(value)``.
    allow_missing : bool, default=True
        For values other than ``None``/``"None"``, whether rows with a
        missing entry in the column also count as a match.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` matching all filters (boolean-mask selection).

    Raises
    ------
    KeyError
        If a filtered column is not present in ``df``.
    """
    column_aliases = {"dataset": "dataset_name", "clusterer": "clusterer_name"}
    select = np.ones(len(df), dtype=bool)
    for col, val in filters.items():
        column = df[column_aliases.get(col, col)]
        if val is None or val == "None":
            select_i = pd.isna(column) | (column == "None")
        else:
            select_i = (column == val) | (column == str(val))
            if allow_missing:
                select_i = select_i | pd.isna(column)
        # Combine as plain NumPy booleans so the running mask stays positional
        # and does not depend on pandas dispatching an in-place ndarray op.
        select &= select_i.to_numpy(dtype=bool, na_value=False)
    return df[select]

In [None]:
def find_differing_columns(df, cols=None):
    """
    List the columns of ``df`` which hold more than one distinct value.

    Only the columns named in ``cols`` are examined (all columns of ``df``
    when ``cols`` is None); names absent from ``df`` are ignored. A missing
    value counts as a distinct value. The result follows the iteration order
    of ``cols``.
    """
    candidates = df.columns if cols is None else cols
    return [
        name
        for name in candidates
        if name in df.columns and df[name].nunique(dropna=False) > 1
    ]

In [None]:
def filter2command(*filters, partition="val"):
    """
    Build the ``sbatch`` command line for the run described by ``filters``.

    The filter dicts are merged left to right (later dicts override earlier
    ones). Entries whose value is None are omitted. The keys ``zscore``,
    ``normalize``, ``zscore2`` and ``ndim_correction`` are emitted as on/off
    flags; every other key becomes ``--key-name=value`` with underscores
    replaced by dashes.

    The job array index is 100 for the "val" partition, 1 for "test" and 0
    otherwise. 20G of memory is requested for AgglomerativeClustering on
    imagenet and 4G in every other case.
    """
    merged = {}
    for mapping in filters:
        merged.update(mapping)

    model = merged.get("model", "")
    dataset = merged.get("dataset", "")
    clusterer = merged.get("clusterer", "")

    heavy = dataset == "imagenet" and clusterer == "AgglomerativeClustering"
    mem = 20 if heavy else 4
    seed = {"val": 100, "test": 1}.get(partition, 0)

    # (flag when disabled, flag when enabled) for the plain boolean options.
    toggles = {
        "zscore": (" --no-zscore", " --zscore"),
        "normalize": ("", " --normalize"),
        "ndim_correction": (" --no-ndim-correction", " --ndim-correction"),
    }

    pieces = [
        f"sbatch --array={seed} --mem={mem}G",
        f' --job-name="zsc-{model}-{dataset}-{clusterer}"',
        f" slurm/cluster.slrm --partition={partition}",
    ]
    for key, value in merged.items():
        if value is None:
            continue
        if key in toggles:
            off_flag, on_flag = toggles[key]
            disabled = value == "False" or not value
            pieces.append(off_flag if disabled else on_flag)
        elif key == "zscore2":
            if value == "False" or not value:
                pieces.append(" --no-zscore2")
            elif value == "average":
                pieces.append(" --azscore2")
            else:
                pieces.append(" --zscore2")
        else:
            pieces.append(f" --{key.replace('_', '-')}={value}")
    return "".join(pieces)

# Hyperparameter searches on val data

### Fetch results

In [None]:
# Project is specified by <entity/project-name>
# Only runs whose config has partition "val" and whose state matches the
# filter below are requested.
# NOTE(review): confirm the capitalisation the wandb API expects for "state".
api = wandb.Api()
runs = api.runs(
    "uoguelph_mlrg/zs-ssl-clustering",
    filters={"state": "Finished", "config.partition": "val"},
)
# Number of matching runs (shown as the cell output).
len(runs)

In [None]:
# Collect the summary metrics, config and name of every fetched run.
summary_list, config_list, name_list = [], [], []
for run in runs:
    # .summary contains the output keys/values for metrics like accuracy.
    #  We call ._json_dict to omit large files
    summary_list.append(run.summary._json_dict)
    # .config contains the hyperparameters.
    #  We remove special values that start with _.
    config_list.append({k: v for k, v in run.config.items() if not k.startswith("_")})
    # .name is the human-readable name of the run.
    name_list.append(run.name)

# Flatten each run into one record: name, then config entries, then summary
# entries (a summary key overrides a config key of the same name). Keys
# starting with "_" are dropped, except for the summary's "_timestamp".
rows = []
config_keys = set()
summary_keys = set()
for summary, config, name in zip(summary_list, config_list, name_list):
    row = {"name": name}
    # config was already stripped of "_"-prefixed keys when it was collected.
    row.update(config)
    row.update({k: v for k, v in summary.items() if not k.startswith("_")})
    if "_timestamp" in summary:
        row["_timestamp"] = summary["_timestamp"]
    rows.append(row)
    config_keys.update(config)
    summary_keys.update(summary)

runs_df = pd.DataFrame.from_records(rows)
print(len(runs_df))

# Handle changed default value for spectral_assigner after config arg was introduced
if "spectral_assigner" not in runs_df.columns:
    runs_df["spectral_assigner"] = None
# spectral_assigner is only meaningful for SpectralClustering runs: blank it
# everywhere else, and fill in "kmeans" where a SpectralClustering run has none.
select = runs_df["clusterer_name"] != "SpectralClustering"
runs_df.loc[select, "spectral_assigner"] = None
select = (runs_df["clusterer_name"] == "SpectralClustering") & pd.isna(
    runs_df["spectral_assigner"]
)
runs_df.loc[select, "spectral_assigner"] = "kmeans"

# Runs without a recorded zscore2 value are treated as zscore2=False.
if "zscore2" not in runs_df.columns:
    runs_df["zscore2"] = False
runs_df.loc[pd.isna(runs_df["zscore2"]), "zscore2"] = False

# Runs without a recorded ndim_correction value are treated as False.
if "ndim_correction" not in runs_df.columns:
    runs_df["ndim_correction"] = False
runs_df.loc[pd.isna(runs_df["ndim_correction"]), "ndim_correction"] = False

In [None]:
# Drop these keys from config_keys, which is later passed to
# find_differing_columns when reporting how duplicate runs differ.
config_keys = config_keys.difference(
    {"workers", "memory_avail_GB", "memory_total_GB", "memory_slurm"}
)

In [None]:
# Remove entries without an AMI metric
runs_df = runs_df[~runs_df["AMI"].isna()]
# Number of remaining runs (shown as the cell output).
len(runs_df)

In [None]:
runs_df

In [None]:
config_keys

In [None]:
summary_keys

## EDA

In [None]:
# Example selection: resnet50 / imagenette / KMeans runs which used PCA.
# allow_missing=False requires each of these columns to be present and equal.
sdf = select_rows(
    runs_df,
    {
        "model": "resnet50",
        "dataset": "imagenette",
        "clusterer": "KMeans",
        "dim_reducer": "PCA",
    },
    allow_missing=False,
)
sdf

In [None]:
sdf["reduced_dim"]

In [None]:
# AMI against reduced dimension for the selection above, on a log x-axis.
# Sorting first makes the line run left to right.
sdf = sdf.sort_values("reduced_dim")
plt.plot(sdf["reduced_dim"], sdf["AMI"])
plt.xscale("log")

In [None]:
plt.plot(sdf["pca_explained_ratio"], sdf["AMI"])

In [None]:
# pca_explained_ratio against reduced dimension for the selection above.
sdf = sdf.sort_values("reduced_dim")
plt.plot(sdf["reduced_dim"], sdf["pca_explained_ratio"])
plt.xlabel("Dimension")
plt.ylabel("Variance explained")
plt.title("imagenette: resnet50")

In [None]:
cmap = categorical_cmap(len(RESNET50_MODELS), len(VALIDATION_DATASETS))

In [None]:
cmap

## PCA dim vs variance

In [None]:
# Plot pca_explained_ratio against reduced_dim for z-scored KMeans + PCA runs,
# with one line per (model, dataset) pair.
models = RESNET50_MODELS + VITB16_MODELS
# One base colour per model, one shade per dataset.
cmap = categorical_cmap(len(models), len(VALIDATION_DATASETS))
clusterer = "KMeans"
plt.figure(figsize=(10, 8))
# i indexes cmap and advances once per (model, dataset) pair.
i = 0
for model in models:
    for dataset in VALIDATION_DATASETS:
        # Settings which must be present and match exactly.
        filter = {
            "model": model,
            "dataset": dataset,
            "clusterer": clusterer,
            "dim_reducer": "PCA",
            "zscore": True,
        }
        sdf = select_rows(runs_df, filter, allow_missing=False)
        # Remaining defaults not already pinned by `filter`; runs with a
        # missing value for these are accepted too.
        filter2 = dict(DEFAULT_PARAMS["all"], **DEFAULT_PARAMS[clusterer])
        filter2 = {k: v for k, v in filter2.items() if k not in filter}
        sdf = select_rows(sdf, filter2, allow_missing=True)
        sdf = sdf.sort_values("reduced_dim")
        plt.plot(
            sdf["reduced_dim"],
            sdf["pca_explained_ratio"],
            label=f"{dataset}: {model}",
            c=cmap(i),
        )
        i += 1

plt.xlabel("Number of dimensions")
plt.ylabel("Cummulative variance fraction explained")
plt.legend()
plt.show()

## Dim count (loose)

In [None]:
# For each clusterer, plot AMI against reduced_dim for z-scored PCA runs,
# with one line per (model, dataset) pair and a log-scaled x-axis.
models = RESNET50_MODELS + VITB16_MODELS
# One base colour per model, one shade per dataset.
cmap = categorical_cmap(len(models), len(VALIDATION_DATASETS))
for clusterer in CLUSTERERS:
    plt.figure(figsize=(10, 8))
    # i indexes cmap and advances once per (model, dataset) pair.
    i = 0
    for model in models:
        for dataset in VALIDATION_DATASETS:
            # Settings which must be present and match exactly.
            filter = {
                "model": model,
                "dataset": dataset,
                "clusterer": clusterer,
                "dim_reducer": "PCA",
                "zscore": True,
            }
            sdf = select_rows(runs_df, filter, allow_missing=False)
            # Remaining defaults not already pinned by `filter`; runs with a
            # missing value for these are accepted too.
            filter2 = dict(DEFAULT_PARAMS["all"], **DEFAULT_PARAMS[clusterer])
            filter2 = {k: v for k, v in filter2.items() if k not in filter}
            sdf = select_rows(sdf, filter2, allow_missing=True)
            sdf = sdf.sort_values("reduced_dim")
            plt.plot(
                sdf["reduced_dim"],
                sdf["AMI"],
                label=f"{dataset}: {model}",
                c=cmap(i),
            )
            i += 1

    plt.xlabel("Dimension")
    plt.ylabel("AMI")
    plt.title(clusterer)
    plt.xscale("log")
    plt.legend()
    plt.show()

In [None]:
# For each clusterer, plot AMI against pca_explained_ratio for z-scored PCA
# runs, with one line per (model, dataset) pair.
models = RESNET50_MODELS + VITB16_MODELS
# One base colour per model, one shade per dataset.
cmap = categorical_cmap(len(models), len(VALIDATION_DATASETS))
for clusterer in CLUSTERERS:
    plt.figure(figsize=(10, 8))
    # i indexes cmap and advances once per (model, dataset) pair.
    i = 0
    for model in models:
        for dataset in VALIDATION_DATASETS:
            # Settings which must be present and match exactly.
            filter = {
                "model": model,
                "dataset": dataset,
                "clusterer": clusterer,
                "dim_reducer": "PCA",
                "zscore": True,
            }
            sdf = select_rows(runs_df, filter, allow_missing=False)
            # Remaining defaults not already pinned by `filter`; runs with a
            # missing value for these are accepted too.
            filter2 = dict(DEFAULT_PARAMS["all"], **DEFAULT_PARAMS[clusterer])
            filter2 = {k: v for k, v in filter2.items() if k not in filter}
            sdf = select_rows(sdf, filter2, allow_missing=True)
            sdf = sdf.sort_values("pca_explained_ratio")
            plt.plot(
                sdf["pca_explained_ratio"],
                sdf["AMI"],
                label=f"{dataset}: {model}",
                c=cmap(i),
            )
            i += 1

    plt.xlabel("Kept variance explained ratio")
    plt.ylabel("AMI")
    plt.title(clusterer)
    plt.legend()
    plt.show()

In [None]:
# For each clusterer, plot AMI against reduced_dim for runs which used UMAP
# as dim_reducer_man, with one line per (model, dataset) pair.
models = RESNET50_MODELS + VITB16_MODELS
# One base colour per model, one shade per dataset.
cmap = categorical_cmap(len(models), len(VALIDATION_DATASETS))
for clusterer in CLUSTERERS:
    plt.figure(figsize=(10, 8))
    # i indexes cmap and advances once per (model, dataset) pair.
    i = 0
    for model in models:
        for dataset in VALIDATION_DATASETS:
            # Settings which must be present and match exactly.
            filter = {
                "model": model,
                "dataset": dataset,
                "clusterer": clusterer,
                "dim_reducer_man": "UMAP",
            }
            sdf = select_rows(runs_df, filter, allow_missing=False)
            # Keep runs whose dim_reducer_man_metric is "euclidean" or missing.
            sdf = select_rows(
                sdf, {"dim_reducer_man_metric": "euclidean"}, allow_missing=True
            )
            # Remaining defaults not already pinned by `filter`; runs with a
            # missing value for these are accepted too.
            filter2 = dict(DEFAULT_PARAMS["all"], **DEFAULT_PARAMS[clusterer])
            filter2 = {k: v for k, v in filter2.items() if k not in filter}
            sdf = select_rows(sdf, filter2, allow_missing=True)
            sdf = sdf.sort_values("reduced_dim")
            plt.plot(
                sdf["reduced_dim"],
                sdf["AMI"],
                label=f"{dataset}: {model}",
                c=cmap(i),
            )
            i += 1

    plt.xlabel("Dimensions")
    plt.ylabel("AMI")
    plt.title(clusterer)
    plt.xscale("log")
    plt.legend()
    plt.show()

## HDBSCAN

### Max samples

In [None]:
# Plot AMI against max_samples for HDBSCAN runs, with one line per
# (model, dataset) pair that has at least one non-missing max_samples value.
models = RESNET50_MODELS + VITB16_MODELS
# One base colour per model, one shade per dataset.
cmap = categorical_cmap(len(models), len(VALIDATION_DATASETS))
clusterer = "HDBSCAN"
plt.figure(figsize=(10, 8))
# i indexes cmap; it is incremented at the top of the inner loop, so it
# starts at -1 and pairs without data still consume their colour slot.
i = -1
for model in models:
    for dataset in VALIDATION_DATASETS:
        i += 1
        filter = {
            "model": model,
            "dataset": dataset,
            "clusterer": clusterer,
        }
        sdf = select_rows(runs_df, filter, allow_missing=False)
        # Remaining defaults not already pinned by `filter`, except for
        # max_samples, which is the swept variable here.
        filter2 = dict(DEFAULT_PARAMS["all"], **DEFAULT_PARAMS[clusterer])
        filter2 = {k: v for k, v in filter2.items() if k not in filter}
        filter2 = {k: v for k, v in filter2.items() if k not in ["max_samples"]}
        sdf = select_rows(sdf, filter2, allow_missing=True)
        sdf = sdf.sort_values("max_samples")
        if len(sdf) > 0 and sum(~pd.isna(sdf["max_samples"])) > 0:
            plt.plot(
                sdf["max_samples"],
                sdf["AMI"],
                label=f"{dataset}: {model}",
                c=cmap(i),
            )

plt.xlabel("Max samples per cluster")
plt.ylabel("AMI")
plt.title(clusterer)
plt.legend()
plt.show()

### EOM vs leaf and distance metric

In [None]:
# Collect the AMI of HDBSCAN runs for every combination of ResNet-50 model,
# dataset, cluster selection method and distance metric.
cmap = categorical_cmap(len(RESNET50_MODELS), len(VALIDATION_DATASETS))
clusterer = "HDBSCAN"
methods = ["eom", "leaf"]
metrics = DISTANCE_METRICS

# data[i_model, i_dataset, i_method, i_metric]; NaN marks "no result".
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data = np.nan * np.ones(
    (len(RESNET50_MODELS), len(VALIDATION_DATASETS), len(methods), len(metrics))
)
# sbatch commands for the combinations which have no result yet.
cmds = []
for i_model, model in enumerate(RESNET50_MODELS):
    for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
        for i_method, method in enumerate(methods):
            for i_metric, metric in enumerate(metrics):
                # The cosine metric is not searched; its entries stay NaN.
                if metric == "cosine":
                    continue
                # Settings which must be present and match exactly.
                filter = {
                    "model": model,
                    "dataset": dataset,
                    "clusterer": clusterer,
                    "distance_metric": metric,
                    "hdbscan_method": method,
                }
                if method == "eom":
                    filter["max_samples"] = 0.25
                sdf = select_rows(runs_df, filter, allow_missing=False)
                # Remaining defaults not already pinned by `filter`; runs with
                # a missing value for these are accepted too.
                filter2 = dict(DEFAULT_PARAMS["all"], **DEFAULT_PARAMS[clusterer])
                filter2 = {k: v for k, v in filter2.items() if k not in filter}
                sdf = select_rows(sdf, filter2, allow_missing=True)
                if len(sdf) < 1:
                    print("No data for", filter)
                    cmds.append(filter2command(filter, filter2))
                    continue
                if len(sdf) > 1:
                    # Report duplicates only when their AMI values disagree.
                    if sum(sdf["AMI"] != sdf.iloc[0]["AMI"]) > 0:
                        print()
                        print("More than one result with AMIs:", list(sdf["AMI"]))
                        print(f"for search {filter} and {filter2}")
                        dif_cols = find_differing_columns(sdf, config_keys)
                        print(f"columns which differ: {dif_cols}")
                        print()
                # Duplicate runs are summarised by their median AMI.
                data[i_model, i_dataset, i_method, i_metric] = np.median(sdf["AMI"])

for cmd in cmds:
    print(cmd)

In [None]:
np.mean(np.mean(data, axis=1), axis=0)

In [None]:
np.nanmean(np.nanmean(data, axis=1), axis=0)

In [None]:
data[0, 0, 0, :]

In [None]:
# For each HDBSCAN method, scatter the collected AMI values per distance
# metric, with one marker series per (model, dataset) pair.
max_data = np.nanmax(data)
# Shared y-limits, padded by 5% of the largest AMI on each side.
YLIM = [-0.05 * max_data, 1.05 * max_data]
for i_method, method in enumerate(methods):
    plt.figure(figsize=(10, 8))
    ax = plt.axes()
    # i indexes cmap and advances once per (model, dataset) pair.
    i = 0
    for i_model, model in enumerate(RESNET50_MODELS):
        for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
            plt.plot(
                data[i_model, i_dataset, i_method, :],
                "x",
                label=f"{dataset}: {model}",
                c=cmap(i),
            )
            i += 1
    # plt.legend()
    ax.set_xticks(np.arange(len(metrics)), metrics)
    plt.ylim(YLIM)
    plt.title(method)
    plt.show()

In [None]:
# For each HDBSCAN method, draw grouped bars of the collected AMI values:
# one group per distance metric, one bar per (model, dataset) pair.
width = 0.05
max_data = np.nanmax(data)
# Shared y-limits, padded by 5% of the largest AMI on each side.
YLIM = [-0.05 * max_data, 1.05 * max_data]
for i_method, method in enumerate(methods):
    plt.figure(figsize=(15, 8))
    ax = plt.axes()
    # i indexes cmap and sets the bar offset within each group.
    i = 0
    for i_model, model in enumerate(RESNET50_MODELS):
        for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
            plt.bar(
                np.arange(len(metrics)) + i * width,
                data[i_model, i_dataset, i_method, :],
                width=width,
                label=f"{dataset}: {model}",
                color=cmap(i),
            )
            i += 1
    # plt.legend()
    # Bars are centred on their x position, so a group of i bars at offsets
    # 0 .. (i - 1) * width has its centre at (i - 1) * width / 2.
    ax.set_xticks(np.arange(len(metrics)) + width * (i - 1) / 2, metrics)
    plt.ylim(YLIM)
    plt.title(method)
    plt.show()

## Dimensionality selection (proper)

In [None]:
# Clusterers considered by the dimensionality-selection cells below.
CLUSTERERS = [
    "KMeans",
    "AgglomerativeClustering",
    "AffinityPropagation",
    "SpectralClustering",
    "HDBSCAN",
    "OPTICS",
]

In [None]:
# Pin the HDBSCAN max_samples default used by the searches below.
DEFAULT_PARAMS["HDBSCAN"]["max_samples"] = 0.2

In [None]:
# Collect the AMI of z-scored PCA runs for each clusterer, model, dataset and
# requested number of reduced dimensions (ndim_reduced).
models = RESNET50_MODELS + VITB16_MODELS
axis_values = [2, 5, 10, 20, 50, 100, 200, 500]

# data_pca[i_clusterer, i_model, i_dataset, i_value]; NaN marks "no result".
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data_pca = np.nan * np.ones(
    (len(CLUSTERERS), len(models), len(VALIDATION_DATASETS), len(axis_values))
)

# sbatch commands for the combinations which have no result yet.
cmds = []
for i_clusterer, clusterer in enumerate(CLUSTERERS):
    for i_model, model in enumerate(models):
        for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
            for i_value, axis_value in enumerate(axis_values):
                # Settings which must be present and match exactly.
                filter = {
                    "model": model,
                    "dataset": dataset,
                    "clusterer": clusterer,
                    "aggclust_dist_thresh": None,
                    "dim_reducer": "PCA",
                    "zscore": True,
                    "ndim_reduced": axis_value,
                }
                sdf = select_rows(runs_df, filter, allow_missing=False)
                # Remaining defaults not already pinned by `filter`; these
                # must match exactly as well.
                filter2 = dict(DEFAULT_PARAMS["all"], **DEFAULT_PARAMS[clusterer])
                filter2 = {k: v for k, v in filter2.items() if k not in filter}
                sdf = select_rows(sdf, filter2, allow_missing=False)
                if len(sdf) < 1:
                    # Missing results for these combinations are silently
                    # skipped instead of being reported or queued.
                    if dataset == "imagenet" and clusterer in [
                        "AffinityPropagation",
                        "SpectralClustering",
                    ]:
                        continue
                    if clusterer in ["SpectralClustering"]:
                        continue
                    print("No data for", filter)
                    cmds.append(filter2command(filter, filter2))
                    continue
                if len(sdf) > 1:
                    # Report duplicates only when their AMI values disagree.
                    if sum(sdf["AMI"] != sdf.iloc[0]["AMI"]) > 0:
                        print()
                        print("More than one result with AMIs:", list(sdf["AMI"]))
                        print(f"for search {filter} and {filter2}")
                        dif_cols = find_differing_columns(sdf, config_keys)
                        print(f"columns which differ: {dif_cols}")
                        print()
                # Duplicate runs are summarised by their median AMI.
                data_pca[i_clusterer, i_model, i_dataset, i_value] = np.median(
                    sdf["AMI"]
                )

for cmd in cmds:
    print(cmd)

In [None]:
# Collect the AMI of z-scored PCA runs for each clusterer, model, dataset and
# requested pca_variance setting.
axis_values = pca_var_values = [0.75, 0.8, 0.85, 0.90, 0.95, 0.98, 0.99]

# data_pca_var[i_clusterer, i_model, i_dataset, i_value]; NaN marks "no result".
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data_pca_var = np.nan * np.ones(
    (len(CLUSTERERS), len(models), len(VALIDATION_DATASETS), len(axis_values))
)

# sbatch commands for the combinations which have no result yet.
cmds = []
for i_clusterer, clusterer in enumerate(CLUSTERERS):
    for i_model, model in enumerate(models):
        for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
            for i_value, axis_value in enumerate(axis_values):
                # Settings which must be present and match exactly.
                filter = {
                    "model": model,
                    "dataset": dataset,
                    "clusterer": clusterer,
                    "aggclust_dist_thresh": None,
                    "dim_reducer": "PCA",
                    "zscore": True,
                    "pca_variance": axis_value,
                }
                sdf = select_rows(runs_df, filter, allow_missing=False)
                # Remaining defaults not already pinned by `filter`; these
                # must match exactly as well.
                filter2 = dict(DEFAULT_PARAMS["all"], **DEFAULT_PARAMS[clusterer])
                filter2 = {k: v for k, v in filter2.items() if k not in filter}
                sdf = select_rows(sdf, filter2, allow_missing=False)
                if len(sdf) < 1:
                    # Missing results for these combinations are silently
                    # skipped instead of being reported or queued.
                    if dataset == "imagenet" and clusterer in [
                        "AffinityPropagation",
                        "SpectralClustering",
                    ]:
                        continue
                    if clusterer in ["SpectralClustering"]:
                        continue
                    print("No data for", filter)
                    cmds.append(filter2command(filter, filter2))
                    continue
                if len(sdf) > 1:
                    # Report duplicates only when their AMI values disagree.
                    if sum(sdf["AMI"] != sdf.iloc[0]["AMI"]) > 0:
                        print()
                        print("More than one result with AMIs:", list(sdf["AMI"]))
                        print(f"for search {filter} and {filter2}")
                        dif_cols = find_differing_columns(sdf, config_keys)
                        print(f"columns which differ: {dif_cols}")
                        print()
                # Duplicate runs are summarised by their median AMI.
                data_pca_var[i_clusterer, i_model, i_dataset, i_value] = np.median(
                    sdf["AMI"]
                )

for cmd in cmds:
    print(cmd)

In [None]:
# Collect the AMI of UMAP runs for each clusterer, model, dataset and
# requested number of reduced dimensions (ndim_reduced_man).
axis_values = [2, 5, 10, 20, 50, 100, 200, 500]

# data_umap[i_clusterer, i_model, i_dataset, i_value]; NaN marks "no result".
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data_umap = np.nan * np.ones(
    (len(CLUSTERERS), len(models), len(VALIDATION_DATASETS), len(axis_values))
)

# sbatch commands for the combinations which have no result yet.
cmds = []
for i_clusterer, clusterer in enumerate(CLUSTERERS):
    for i_model, model in enumerate(models):
        for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
            for i_value, axis_value in enumerate(axis_values):
                # Settings which must be present and match exactly.
                filter = {
                    "model": model,
                    "dataset": dataset,
                    "clusterer": clusterer,
                    "aggclust_dist_thresh": None,
                    "dim_reducer_man": "UMAP",
                    "ndim_reduced_man": axis_value,
                }
                sdf = select_rows(runs_df, filter, allow_missing=False)
                # Remaining defaults not already pinned by `filter`; these
                # must match exactly as well.
                filter2 = dict(DEFAULT_PARAMS["all"], **DEFAULT_PARAMS[clusterer])
                filter2 = {k: v for k, v in filter2.items() if k not in filter}
                sdf = select_rows(sdf, filter2, allow_missing=False)
                if len(sdf) < 1:
                    # Missing results for these combinations are silently
                    # skipped instead of being reported or queued.
                    if dataset == "imagenet" and clusterer in [
                        "AffinityPropagation",
                        "SpectralClustering",
                    ]:
                        continue
                    if clusterer in ["SpectralClustering"]:
                        continue
                    print("No data for", filter)
                    cmds.append(filter2command(filter, filter2))
                    continue
                if len(sdf) > 1:
                    # Report duplicates only when their AMI values disagree.
                    if sum(sdf["AMI"] != sdf.iloc[0]["AMI"]) > 0:
                        print()
                        print("More than one result with AMIs:", list(sdf["AMI"]))
                        print(f"for search {filter} and {filter2}")
                        dif_cols = find_differing_columns(sdf, config_keys)
                        print(f"columns which differ: {dif_cols}")
                        print()
                # Duplicate runs are summarised by their median AMI.
                data_umap[i_clusterer, i_model, i_dataset, i_value] = np.median(
                    sdf["AMI"]
                )

for cmd in cmds:
    print(cmd)

In [None]:
# Collect the baseline AMI (no dimensionality reduction) for each clusterer,
# model and dataset, both without and with z-scoring.
# data_base[i_clusterer, i_model, i_dataset, i_zscore] where the last axis is
# 0 = no z-score, 1 = z-score; NaN marks "no result".
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data_base = np.nan * np.ones(
    (len(CLUSTERERS), len(models), len(VALIDATION_DATASETS), 2)
)

# sbatch commands for the combinations which have no result yet.
cmds = []
for i_clusterer, clusterer in enumerate(CLUSTERERS):
    for i_model, model in enumerate(models):
        for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
            # Each z-score setting is looked up in its own loop iteration, so
            # a missing non-z-scored result no longer skips the z-scored one.
            # Runs with a missing value are accepted only for the
            # non-z-scored selection (allow_missing=True).
            for i_zscore, (zscore, allow_missing) in enumerate(
                [(False, True), (True, False)]
            ):
                filter = {
                    "model": model,
                    "dataset": dataset,
                    "clusterer": clusterer,
                    "aggclust_dist_thresh": None,
                    "zscore": zscore,
                    "dim_reducer": "None",
                    "dim_reducer_man": "None",
                }
                sdf = select_rows(runs_df, filter, allow_missing=allow_missing)
                # Remaining defaults not already pinned by `filter`; these
                # must match exactly.
                filter2 = dict(DEFAULT_PARAMS["all"], **DEFAULT_PARAMS[clusterer])
                filter2 = {k: v for k, v in filter2.items() if k not in filter}
                sdf = select_rows(sdf, filter2, allow_missing=False)
                if len(sdf) < 1:
                    # Missing results for these combinations are silently
                    # skipped instead of being reported or queued.
                    if dataset == "imagenet" and clusterer in [
                        "AffinityPropagation",
                        "SpectralClustering",
                    ]:
                        continue
                    if clusterer in ["SpectralClustering"]:
                        continue
                    print("No data for", filter)
                    cmds.append(filter2command(filter, filter2))
                    continue
                if len(sdf) > 1:
                    # Report duplicates only when their AMI values disagree.
                    if sum(sdf["AMI"] != sdf.iloc[0]["AMI"]) > 0:
                        print()
                        print("More than one result with AMIs:", list(sdf["AMI"]))
                        print(f"for search {filter} and {filter2}")
                        dif_cols = find_differing_columns(sdf, config_keys)
                        print(f"columns which differ: {dif_cols}")
                        print()
                # Duplicate runs are summarised by their median AMI.
                data_base[i_clusterer, i_model, i_dataset, i_zscore] = np.median(
                    sdf["AMI"]
                )

for cmd in cmds:
    print(cmd)

In [None]:
np.nanmax(data_pca)

In [None]:
np.nanmax(data_umap)

In [None]:
np.nanmax(data_base)

In [None]:
data_pca.shape

In [None]:
# For every (clusterer, model) pair, draw one figure comparing four options:
# original embeddings, z-scored embeddings, best PCA result and best UMAP
# result. Each dataset gets a group of four bars, followed by a group with
# the weighted mean over datasets.

# Weight imagenet twice as much as imagenette and imagewoof
# weights_val = np.ones((len(VALIDATION_DATASETS), ), dtype=int)
weights_val = np.array([2 if d == "imagenet" else 1 for d in VALIDATION_DATASETS])

# One base colour per option, one shade per dataset.
n_shades = len(VALIDATION_DATASETS)
cmap = categorical_cmap(4, n_shades)
width = 0.2

# data_pca[i_clusterer, i_model, i_dataset, i_value]
# Best AMI over the searched dimensionalities (NaN entries are ignored).
best_pca = np.nanmax(data_pca, axis=-1)
best_umap = np.nanmax(data_umap, axis=-1)

# Option names used in the per-dataset bar labels.
dataset_bar_labels = [
    "original embeddings",
    "zscored embeddings",
    "PCA (best)",
    "UMAP (best)",
]

for i_clusterer, clusterer in enumerate(CLUSTERERS):
    if clusterer in ["SpectralClustering"]:  # , "OPTICS"]:
        continue
    for i_model, model in enumerate(models):
        plt.figure(figsize=(8, 4))
        ax = plt.axes()
        # my_data[i_dataset, i_option]; np.nan is used because the np.NaN
        # alias was removed in NumPy 2.0.
        my_data = np.nan * np.ones((len(VALIDATION_DATASETS), 4))
        for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
            values = [
                data_base[i_clusterer, i_model, i_dataset, 0],
                data_base[i_clusterer, i_model, i_dataset, 1],
                best_pca[i_clusterer, i_model, i_dataset],
                best_umap[i_clusterer, i_model, i_dataset],
            ]
            for i, (value, option) in enumerate(zip(values, dataset_bar_labels)):
                plt.bar(
                    i_dataset + i * width,
                    value,
                    width=width,
                    label=f"{dataset}: {option}",
                    color=cmap(i * n_shades + i_dataset),
                )
                my_data[i_dataset, i] = value

        my_weights = weights_val
        if clusterer in ["AffinityPropagation", "SpectralClustering"]:
            # Disregard imagenet results as it has too many samples to run
            my_data = my_data[1:]
            my_weights = my_weights[1:]

        # Weighted mean over datasets: one value per option.
        my_data = np.average(my_data, axis=0, weights=my_weights)
        hs = []
        labels = [
            "original embeddings",
            "zscored-embeddings",
            "PCA (best)",
            "UMAP (best)",
        ]
        # The mean group is drawn in the slot after the last dataset group.
        for i, label in enumerate(labels):
            hs.append(
                plt.bar(
                    len(VALIDATION_DATASETS) + i * width,
                    my_data[i],
                    width=width,
                    label=label,
                    color=cmap(i * n_shades),
                )
            )
        # Centre each tick under its group of len(labels) centre-aligned bars.
        ax.set_xticks(
            np.arange(len(VALIDATION_DATASETS) + 1) + width * (len(labels) - 1) / 2,
            VALIDATION_DATASETS + ["mean"],
        )
        plt.ylabel("AMI")
        try:
            best_option = labels[np.nanargmax(my_data)]
        except ValueError:
            # np.nanargmax raises ValueError when every mean is NaN.
            best_option = "n/a"
        plt.title(f"{clusterer}, {model} [{best_option}]")
        plt.ylim([-0.05, 1.05])
        # plt.legend(handles=hs)
        plt.show()

In [None]:
# For every (clusterer, model) pair: plot AMI against the number of reduced
# dimensions for PCA and UMAP, mark the best point of the weighted-mean curves,
# and record the chosen reducer in dim_choices_rows.
dim_choices_rows = []
# Margin by which a non-plotted alternative (original embeddings, z-scored
# embeddings, or PCA-by-variance) must beat the best plotted PCA/UMAP point
# before it replaces that point as the recorded choice.
eps = 1e-3
# Weight imagenet twice as much as imagenette and imagewoof
# weights_val = np.ones((len(VALIDATION_DATASETS), ), dtype=int)
weights_val = np.array([2 if d == "imagenet" else 1 for d in VALIDATION_DATASETS])

# cmap = categorical_cmap(len(models), len(CLUSTERERS))
cmap = categorical_cmap(4, len(VALIDATION_DATASETS))

# x-axis values (number of dimensions) for the PCA and UMAP curves
axis_values = [2, 5, 10, 20, 50, 100, 200, 500]
# axis_values = [2, 5, 10, 20, 50, 100, 200]

for i_clusterer, clusterer in enumerate(CLUSTERERS):
    if clusterer in ["SpectralClustering"]:
        continue
    for i_model, model in enumerate(models):
        my_data_p = data_pca[i_clusterer, i_model]  # [:, :-1]
        my_data_u = data_umap[i_clusterer, i_model]  # [:, :-1]
        my_data_pvar = data_pca_var[i_clusterer, i_model]
        my_data_base = data_base[i_clusterer, i_model]
        plt.figure()
        # indiv
        for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
            if dataset == "imagenet" and clusterer in [
                "AffinityPropagation",
                "SpectralClustering",
            ]:
                # No imagenet results as it has too many samples
                continue
            ls = DATASET2LS.get(dataset) + "o"
            plt.plot(
                axis_values,
                my_data_p[i_dataset],
                ls,
                markersize=5,
                color=cmap(2 * 3 + 2),
            )
            plt.plot(
                axis_values,
                my_data_u[i_dataset],
                ls,
                markersize=5,
                color=cmap(3 * 3 + 2),
            )
        # mean
        if clusterer in ["AffinityPropagation", "SpectralClustering"]:
            # No imagenet results as it has too many samples
            # (dropping index 0 relies on imagenet being the first dataset)
            my_data_p = my_data_p[1:]
            my_data_u = my_data_u[1:]
            my_data_pvar = my_data_pvar[1:]
            my_data_base = my_data_base[1:]
            my_weights = weights_val[1:]
        else:
            my_weights = weights_val
        mu_data_p = np.average(my_data_p, axis=0, weights=my_weights)
        mu_data_u = np.average(my_data_u, axis=0, weights=my_weights)
        mu_data_pvar = np.average(my_data_pvar, axis=0, weights=my_weights)
        # After averaging: entry 0 is the original embeddings, entry 1 the
        # z-scored embeddings (see the "full" / "fullzscore" labels below)
        my_data_base = np.average(my_data_base, axis=0, weights=my_weights)
        plt.plot(axis_values, mu_data_p, "o-", markersize=5, color=cmap(2 * 3))
        plt.plot(axis_values, mu_data_u, "o-", markersize=5, color=cmap(3 * 3))
        if (
            np.isnan(mu_data_p).all()
            or np.isnan(mu_data_u).all()
            or np.isnan(mu_data_pvar).all()
        ):
            # Nothing to select from; show what we have and move on
            plt.title(f"{clusterer}, {model} [MISSING DATA]")
            plt.xscale("log")
            plt.xlabel("Num dimensions")
            plt.ylabel("AMI")
            plt.ylim([-0.05, 1.05])
            plt.show()
            continue
        best_pca_i = np.nanargmax(mu_data_p)
        best_umap_i = np.nanargmax(mu_data_u)
        best_pvar_i = np.nanargmax(mu_data_pvar)
        best_pca_ami = mu_data_p[best_pca_i]
        best_umap_ami = mu_data_u[best_umap_i]
        best_pvar_ami = mu_data_pvar[best_pvar_i]
        # Pick the better of the two plotted curves (ties go to UMAP)
        if best_pca_ami > best_umap_ami:
            best_reducer = "PCA"
            best_d = axis_values[best_pca_i]
            best_ami_plot = best_pca_ami
        else:
            best_reducer = "UMAP"
            best_d = axis_values[best_umap_i]
            best_ami_plot = best_umap_ami
        plt.plot(best_d, best_ami_plot, "kx")
        row = {
            "clusterer": clusterer,
            "model": model,
            "reducer": best_reducer,
            "dim": best_d,
        }
        # Best AMI over every option, including those not plotted here.
        # Only NaN-free candidates are compared: the builtin max() returns an
        # order-dependent result when NaNs are present (a leading NaN wins),
        # which would silently drop that option from the comparison.
        candidates = [best_pca_ami, best_umap_ami, best_pvar_ami]
        candidates.extend(my_data_base[~np.isnan(my_data_base)])
        best_ami = max(candidates)
        if best_ami <= best_ami_plot:
            extra_str = "best"
        else:
            extra_str = f"< {best_ami:.3f} from"
            # A trailing "*" marks an alternative that wins by at least eps and
            # therefore replaces the plotted choice in the recorded row
            if best_ami == my_data_base[0]:
                extra_str += " full"
                if best_ami >= best_ami_plot + eps:
                    row = {"clusterer": clusterer, "model": model, "reducer": "OG"}
                    extra_str += "*"
            if best_ami == my_data_base[1]:
                extra_str += " fullzscore"
                if best_ami >= best_ami_plot + eps:
                    row = {
                        "clusterer": clusterer,
                        "model": model,
                        "reducer": "zscore-only",
                    }
                    extra_str += "*"
            if best_ami == best_pvar_ami:
                extra_str += f" PCA var={pca_var_values[best_pvar_i]}"
                if best_ami >= best_ami_plot + eps:
                    # NOTE: here "dim" holds the retained-variance value from
                    # pca_var_values, not a number of dimensions
                    row = {
                        "clusterer": clusterer,
                        "model": model,
                        "reducer": "PCA",
                        "dim": pca_var_values[best_pvar_i],
                    }
                    extra_str += "*"
        plt.title(
            f"{clusterer}, {model}"
            f"  [{best_reducer} {best_d}: AMI={best_ami_plot:.3f} ({extra_str})]"
        )
        plt.xscale("log")
        plt.xlabel("Num dimensions")
        plt.ylabel("AMI")
        plt.ylim([-0.05, 1.05])
        plt.show()
        dim_choices_rows.append(row)

In [None]:
# Tabulate the recorded choice (reducer and, where present, dim) for each
# (clusterer, model) pair
df_dim_choices = pd.DataFrame.from_records(dim_choices_rows)

In [None]:
# Display the table of dimensionality-reduction choices
df_dim_choices

In [None]:
# Plot AMI against the PCA "variance kept" setting, one figure per
# (clusterer, model): a styled line per dataset plus their weighted mean.
# cmap = categorical_cmap(len(models), len(CLUSTERERS))
cmap = categorical_cmap(4, len(VALIDATION_DATASETS))

i = 0
for i_clusterer, clusterer in enumerate(CLUSTERERS):
    if clusterer == "SpectralClustering":
        continue
    # No imagenet results for these clusterers as it has too many samples
    skip_imagenet = clusterer in ("AffinityPropagation", "SpectralClustering")
    for i_model, model in enumerate(models):
        i += 1
        my_data_pvar = data_pca_var[i_clusterer, i_model]
        plt.figure()

        # Individual datasets
        for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
            if not (skip_imagenet and dataset == "imagenet"):
                ls = DATASET2LS.get(dataset) + "o"
                plt.plot(
                    pca_var_values,
                    my_data_pvar[i_dataset],
                    ls,
                    markersize=5,
                    color=cmap(2 * 3 + 2),
                )

        # Weighted mean over datasets (row 0 is dropped when imagenet is skipped)
        my_weights = weights_val
        if skip_imagenet:
            my_data_pvar = my_data_pvar[1:]
            my_weights = weights_val[1:]
        mu_data_pvar = np.average(my_data_pvar, axis=0, weights=my_weights)
        plt.plot(
            pca_var_values,
            mu_data_pvar,
            "-o",
            markersize=5,
            color=cmap(2 * 3),
        )

        plt.title(f"{clusterer}, {model}")
        # plt.xscale("log")
        plt.xlabel("Variance kept")
        plt.ylabel("AMI")
        plt.ylim([-0.05, 1.05])
        plt.show()

In [None]:
models = RESNET50_MODELS + VITB16_MODELS
# Every (clusterer, model) entry starts as its own copy of that clusterer's
# defaults; the sections below then layer the chosen reduction settings on top.
BEST_PARAMS = {
    clusterer: {model: copy.deepcopy(DEFAULT_PARAMS[clusterer]) for model in models}
    for clusterer in ALL_CLUSTERERS
}

# Settings that recur in the choices below
_MAE = "timm_vit_base_patch16_224.mae"
_UMAP_50 = {"dim_reducer_man": "UMAP", "ndim_reduced_man": 50}
_UMAP_50_NO_PCA = dict(_UMAP_50, dim_reducer="None")
_PCA_500 = {
    "dim_reducer": "PCA",
    "ndim_reduced": 500,
    "zscore": True,
    "pca_variance": None,
}


def _pca_by_variance(fraction):
    """Return settings for z-scored PCA that keeps `fraction` of the variance."""
    return {
        "dim_reducer": "PCA",
        "pca_variance": fraction,
        "zscore": True,
        "ndim_reduced": None,
        "dim_reducer_man": "None",
    }


# KMeans
# UMAP for every encoder (the number of dims is unimportant; 50d is used for
# consistency), with these exceptions:
# - clip_RN50 : PCA with 500d is a little better than UMAP. UMAP beats PCA once
#   the PCA dims are reduced below 500.
# - clip_vitb16 : same behaviour as clip_RN50
# - timm_vit_base_patch16_224.mae : best is PCA with 0.85 variance explained.
#   Needs at least 200 PCA dims, and PCA beats UMAP throughout.
_kmeans_exceptions = {
    "clip_RN50": _PCA_500,
    "clip_vitb16": _PCA_500,
    _MAE: {
        "dim_reducer": "PCA",
        "pca_variance": 0.85,
        "zscore": True,
        "ndim_reduced": None,
    },
}
for model in RESNET50_MODELS + VITB16_MODELS:
    if not (model.startswith("clip") or model == _MAE):
        BEST_PARAMS["KMeans"][model].update(_UMAP_50)
for _name, _params in _kmeans_exceptions.items():
    BEST_PARAMS["KMeans"][_name].update(_params)

# AffinityPropagation
# PCA with 10 dims for every encoder, with these exceptions:
# - resnet50 (supervised) : original embeddings, no reduction (AMI=0.62).
#   Whitening makes it worse (AMI=0.55) and, although perf increases as the
#   number of dims is reduced, it doesn't quite recover. PCA perf peaks at
#   10-20 dim (AMI=0.57).
# - dino_resnet50 : does marginally better at UMAP 50 (AMI=0.52495) than
#   PCA 10 (AMI=0.5044).
#   NOTE(review): the setting applied below is PCA with 0.95 variance, not
#   UMAP 50 - confirm which of the two is intended.
# - timm_vit_base_patch16_224.mae : PCA 0.95 variance explained (AMI=0.303).
#   Definite improvement from 10 to 20 dims, but not much improvement above that.
_affinity = BEST_PARAMS["AffinityPropagation"]
for model in models:
    if model not in ("resnet50", "dino_resnet50", _MAE):
        _affinity[model].update(
            {
                "dim_reducer": "PCA",
                "ndim_reduced": 10,
                "zscore": True,
                "pca_variance": None,
                "dim_reducer_man": "None",
            }
        )
_affinity["resnet50"].update(
    {"dim_reducer": "None", "dim_reducer_man": "None", "zscore": False}
)
_affinity["dino_resnet50"].update(_pca_by_variance(0.95))
_affinity[_MAE].update(_pca_by_variance(0.95))

# AgglomerativeClustering
# UMAP for every encoder (the number of dims is unimportant; 50d is used for
# consistency), with this exception:
# - timm_vit_base_patch16_224.mae : PCA 0.98 variance explained (i.e. nearly all
#   dimensions kept), which is not noticeably better than 500 dim PCA, but there
#   is an increase compared to using less than 500d.
for model in models:
    if model != _MAE:
        BEST_PARAMS["AgglomerativeClustering"][model].update(_UMAP_50_NO_PCA)
BEST_PARAMS["AgglomerativeClustering"][_MAE].update(_pca_by_variance(0.98))

# HDBSCAN
# UMAP for every encoder, with this exception:
# - timm_vit_base_patch16_224.mae : PCA 0.95 variance explained (AMI=0.085),
#   which is not noticeably better than PCA with 50 dim
for model in models:
    if model not in (_MAE,):
        BEST_PARAMS["HDBSCAN"][model].update(_UMAP_50_NO_PCA)
BEST_PARAMS["HDBSCAN"][_MAE].update(_pca_by_variance(0.95))

# OPTICS
# UMAP for every encoder, no exceptions necessary
for model in models:
    BEST_PARAMS["OPTICS"][model].update(_UMAP_50_NO_PCA)

In [None]:
# Keep an independent snapshot of the original choices as v1.
BEST_PARAMS_v1 = copy.deepcopy(BEST_PARAMS)
# NOTE: v2 is an alias of BEST_PARAMS, not a copy, so every update below also
# changes BEST_PARAMS itself. Re-running this cell without first rebuilding
# BEST_PARAMS would snapshot the already-updated values as v1.
BEST_PARAMS_v2 = BEST_PARAMS

print("Updating dim choices for new method")
# Updated dim choices
# (changed to this when we swapped to using weighted average instead of straight
# average between Imagenet-1k, Imagenette, Imagewoof)

# Changed KMeans clip_RN50 from PCA 500 to UMAP 50, so it uses fewer dimensions
# (probably more stable than using 500-d which is what PCA needs to marginally beat UMAP)
# "dim_reducer" is reset to the string "None", the sentinel used for this key in
# DEFAULT_PARAMS and in every other entry, rather than to Python's None.
BEST_PARAMS_v2["KMeans"]["clip_RN50"].update(
    {
        "dim_reducer": "None",
        "ndim_reduced": None,
        "zscore": False,
        "pca_variance": None,
        "dim_reducer_man": "UMAP",
        "ndim_reduced_man": 50,
    }
)
# Changed KMeans MAE from PCA 85% to PCA 200
# (since we see perf above plateaus at 200-d, there is no point going above that)
BEST_PARAMS_v2["KMeans"]["timm_vit_base_patch16_224.mae"].update(
    {"dim_reducer": "PCA", "zscore": True, "ndim_reduced": 200, "pca_variance": None}
)
# Changed KMeans clip_vitb16 from PCA 500 to PCA 75%
# (gives a notably better train set AMI measurement above)
BEST_PARAMS_v2["KMeans"]["clip_vitb16"].update(
    {"dim_reducer": "PCA", "zscore": True, "ndim_reduced": None, "pca_variance": 0.75}
)

# Changed AffinityPropagation dino_resnet50 from PCA 95% to PCA 10
# (performance is basically equal, so no point using higher-dim space;
# could have done UMAP 50 instead with basically equal train AMI to PCA 10,
# but didn't for consistency with other models)
BEST_PARAMS_v2["AffinityPropagation"]["dino_resnet50"].update(
    {"dim_reducer": "PCA", "zscore": True, "ndim_reduced": 10, "pca_variance": None}
)
# Changed AffinityPropagation MAE from PCA 95% to PCA 100
BEST_PARAMS_v2["AffinityPropagation"]["timm_vit_base_patch16_224.mae"].update(
    {"dim_reducer": "PCA", "zscore": True, "ndim_reduced": 100, "pca_variance": None}
)

## Agglomerative Clustering

### AgglomerativeClustering metric and linkage

In [None]:
# Gather the AMI of AgglomerativeClustering runs for every combination of
# model, dataset, linkage method and distance metric (no distance threshold).
models = RESNET50_MODELS + VITB16_MODELS
cmap = categorical_cmap(len(models), len(VALIDATION_DATASETS))
clusterer = "AgglomerativeClustering"
methods = ["ward", "complete", "average", "single"]
metrics = ["euclidean", "l1", "chebyshev", "cosine"]  # "arccos"

# Indexed by (model, dataset, method, metric); entries stay NaN where no run
# was found. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data = np.full(
    (len(models), len(VALIDATION_DATASETS), len(methods), len(metrics)), np.nan
)
# Commands for the missing configurations, printed at the end of the cell
cmds = []
for i_model, model in enumerate(models):
    for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
        for i_method, method in enumerate(methods):
            for i_metric, metric in enumerate(metrics):
                filter = {
                    "model": model,
                    "dataset": dataset,
                    "clusterer": clusterer,
                    # "dim_reducer_man": "UMAP",
                    # "ndim_reduced_man": 50,
                    "distance_metric": metric,
                    "aggclust_linkage": method,
                    "aggclust_dist_thresh": None,
                }
                sdf = select_rows(runs_df, filter, allow_missing=False)
                # Remaining settings come from the defaults and the per-model
                # best parameters, minus anything already fixed by `filter`
                filter2 = dict(DEFAULT_PARAMS["all"], **BEST_PARAMS[clusterer][model])
                filter2 = {k: v for k, v in filter2.items() if k not in filter}
                sdf = select_rows(sdf, filter2, allow_missing=False)
                if len(sdf) < 1:
                    if method == "ward" and metric not in ["euclidean", "arccos"]:
                        # expected not to exist
                        continue
                    print("No data for", filter)
                    cmds.append(filter2command(filter, filter2))
                    continue
                if len(sdf) > 1 and (sdf["AMI"] != sdf.iloc[0]["AMI"]).any():
                    # Several runs matched and they disagree: report how they differ
                    print()
                    print("More than one result with AMIs:", list(sdf["AMI"]))
                    print(f"for search {filter}\nand {filter2}")
                    dif_cols = find_differing_columns(sdf, config_keys)
                    print(f"columns which differ: {dif_cols}")
                    if dif_cols:
                        for col in dif_cols:
                            print(f"  {col}: {list(sdf[col])}")
                # Use the median when more than one run matched
                data[i_model, i_dataset, i_method, i_metric] = np.median(sdf["AMI"])

if cmds:
    print()
for cmd in cmds:
    print(cmd)

In [None]:
# Display the shape of the AMI results array
data.shape

In [None]:
# Fraction of entries in `data` that are NaN
np.sum(np.isnan(data)) / data.size

In [None]:
# One grouped bar chart per model: for each dataset, a bar for every
# (metric, method) combination.
cmap = categorical_cmap(len(metrics), len(methods))

# Each dataset group spans one unit on the x-axis; two bar-widths are left
# over as the gap between neighbouring groups.
n_bars = len(methods) * len(metrics)
width = 1 / (n_bars + 2)
YLIM = [-0.05, 1.05]
group_origins = np.arange(len(VALIDATION_DATASETS))
for i_model, model in enumerate(models):
    plt.figure(figsize=(12, 6))
    ax = plt.axes()
    # for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
    i = -1
    for i_metric, metric in enumerate(metrics):
        for i_method, method in enumerate(methods):
            # Running index of this (metric, method) pair within a group
            i = i_metric * len(methods) + i_method
            plt.bar(
                group_origins + i * width,
                data[i_model, :, i_method, i_metric],
                width=width,
                label=f"{metric}: {method}",
                color=cmap(i),
            )
    plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    ax.set_xticks(group_origins + width * (i + 1) / 2, VALIDATION_DATASETS)
    plt.ylim(YLIM)
    plt.title(model)
    plt.ylabel("AMI")
    plt.show()

In [None]:
# For each model: plot the dataset-weighted mean AMI of every
# (metric, method) combination and record the best non-cosine combination.
agglink_choices_rows = []
# weights_val = np.ones((len(VALIDATION_DATASETS), ), dtype=int)
weights_val = np.array([2 if d == "imagenet" else 1 for d in VALIDATION_DATASETS])

cmap = categorical_cmap(len(metrics), len(methods))

# Weighted mean over datasets; indexed by (model, method, metric)
avg_ami = np.average(data, axis=1, weights=weights_val)

# Exclude cosine distance because it doesn't make sense with UMAP generally
# (the origin is ill-defined). It is masked out by name, so the selection does
# not depend on where "cosine" sits in `metrics`.
selectable_ami = avg_ami.copy()
for i_metric, metric in enumerate(metrics):
    if metric == "cosine":
        selectable_ami[:, :, i_metric] = np.nan

width = 1 / (len(methods) + 2)
YLIM = [-0.05, 1.05]
for i_model, model in enumerate(models):
    plt.figure(figsize=(12, 6))
    ax = plt.axes()
    # for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
    i = -1
    # Running range of the plotted values, used to zoom the y-axis
    min_ami = 1
    max_ami = 0
    for i_metric, metric in enumerate(metrics):
        for i_method, method in enumerate(methods):
            i += 1
            my_val = avg_ami[i_model, i_method, i_metric]
            if np.isnan(my_val):
                continue
            min_ami = np.nanmin([min_ami, my_val])
            max_ami = np.nanmax([max_ami, my_val])
            plt.bar(
                i_metric + i_method * width,
                my_val,
                width=width,
                label=f"{metric}: {method}",
                color=cmap(i),
            )
    # Best (method, metric) among the selectable (non-cosine) combinations
    best_method_idx, best_metric_idx = np.unravel_index(
        np.nanargmax(selectable_ami[i_model]),
        selectable_ami[i_model].shape,
    )
    best_ami = avg_ami[i_model, best_method_idx, best_metric_idx]
    ax.set_xticks(np.arange(len(metrics)) + width * len(methods) / 2, metrics)
    # Zoom the y-axis to the plotted range, padded by 5% on either side
    YLIM = np.array([min_ami, max_ami])
    YLIM += np.array([-1, 1]) * 0.05 * (YLIM[1] - YLIM[0])
    plt.ylim(YLIM)
    # The title also lists the four highest mean AMIs (cosine included)
    plt.title(
        f"{model} : {metrics[best_metric_idx]} {methods[best_method_idx]}"
        f"  (AMI={best_ami:.3f})"
        f"  ... {np.sort(avg_ami[i_model][~np.isnan(avg_ami[i_model])], axis=None)[-1:-5:-1]}"
    )
    plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    plt.ylabel("AMI")
    plt.show()
    row = {
        "model": model,
        "distance_metric": metrics[best_metric_idx],
        "aggclust_linkage": methods[best_method_idx],
        "AMI": best_ami,
    }
    agglink_choices_rows.append(row)

In [None]:
# Mean AMI for vicreg_resnet50 with "average" linkage, across all metrics
# (avg_ami is indexed by model, method, metric)
avg_ami[np.array(models) == "vicreg_resnet50", np.array(methods) == "average"]

In [None]:
# Mean AMI for vitb16 over every (method, metric) combination
avg_ami[np.array(models) == "vitb16"]

In [None]:
# Tabulate the selected metric/linkage (and its mean AMI) for each model
agglink_choices_df = pd.DataFrame.from_dict(agglink_choices_rows)
agglink_choices_df

In [None]:
# Print the table in markdown format
print(agglink_choices_df.to_markdown())

With equal weighting between Imagenet, Imagenette, Imagewoof

|    | model                         | distance_metric   | aggclust_linkage   |
|---:|:------------------------------|:------------------|:-------------------|
|  0 | resnet50                      | euclidean         | ward               |
|  1 | mocov3_resnet50               | euclidean         | ward               |
|  2 | vicreg_resnet50               | euclidean         | ward               |
|  3 | dino_resnet50                 | euclidean         | average            |
|  4 | clip_RN50                     | euclidean         | average            |
|  5 | vitb16                        | euclidean         | ward               |
|  6 | mocov3_vit_base               | chebyshev         | average            |
|  7 | timm_vit_base_patch16_224.mae | euclidean         | ward               |
|  8 | dino_vitb16                   | euclidean         | average            |
|  9 | clip_vitb16                   | chebyshev         | average            |

With 2:1:1 weighting

|    | model                         | distance_metric   | aggclust_linkage   |      AMI |
|---:|:------------------------------|:------------------|:-------------------|---------:|
|  0 | resnet50                      | euclidean         | ward               | 0.880867 |
|  1 | mocov3_resnet50               | euclidean         | ward               | 0.661994 |
|  2 | vicreg_resnet50               | cosine            | average            | 0.623931 |
|  3 | dino_resnet50                 | euclidean         | average            | 0.610076 |
|  4 | clip_RN50                     | euclidean         | average            | 0.587426 |
|  5 | vitb16                        | euclidean         | ward               | 0.937149 |
|  6 | mocov3_vit_base               | chebyshev         | average            | 0.735425 |
|  7 | timm_vit_base_patch16_224.mae | cosine            | average            | 0.300642 |
|  8 | dino_vitb16                   | euclidean         | average            | 0.799998 |
|  9 | clip_vitb16                   | chebyshev         | average            | 0.702895 |

Excluding cosine

|    | model                         | distance_metric   | aggclust_linkage   |      AMI |
|---:|:------------------------------|:------------------|:-------------------|---------:|
|  0 | resnet50                      | euclidean         | ward               | 0.880867 |
|  1 | mocov3_resnet50               | euclidean         | ward               | 0.661994 |
|  2 | vicreg_resnet50               | euclidean         | average            | 0.623852 |
|  3 | dino_resnet50                 | euclidean         | average            | 0.610076 |
|  4 | clip_RN50                     | euclidean         | average            | 0.587426 |
|  5 | vitb16                        | euclidean         | ward               | 0.937149 |
|  6 | mocov3_vit_base               | chebyshev         | average            | 0.735425 |
|  7 | timm_vit_base_patch16_224.mae | euclidean         | ward               | 0.290479 |
|  8 | dino_vitb16                   | euclidean         | average            | 0.799998 |
|  9 | clip_vitb16                   | chebyshev         | average            | 0.702895 |


In [None]:
# Set the v1 AgglomerativeClustering distance metric and linkage per model.
# Each entry is (distance metric, linkage, models that use this combination).
_v1_linkage_choices = [
    (
        "euclidean",
        "ward",
        [
            "resnet50",
            "mocov3_resnet50",
            "vicreg_resnet50",
            "vitb16",
            "timm_vit_base_patch16_224.mae",
        ],
    ),
    ("euclidean", "average", ["dino_resnet50", "clip_RN50", "dino_vitb16"]),
    ("chebyshev", "average", ["mocov3_vit_base", "clip_vitb16"]),
]
for _dist, _link, _group in _v1_linkage_choices:
    for model in _group:
        BEST_PARAMS_v1["AgglomerativeClustering"][model].update(
            {"distance_metric": _dist, "aggclust_linkage": _link}
        )

In [None]:
# Set the v2 AgglomerativeClustering distance metric and linkage per model.
# Each entry is (distance metric, linkage, models that use this combination).
_v2_linkage_choices = [
    (
        "euclidean",
        "ward",
        ["resnet50", "mocov3_resnet50", "vitb16", "timm_vit_base_patch16_224.mae"],
    ),
    (
        "euclidean",
        "average",
        ["vicreg_resnet50", "dino_resnet50", "clip_RN50", "dino_vitb16"],
    ),
    ("chebyshev", "average", ["mocov3_vit_base", "clip_vitb16"]),
]
for _dist, _link, _group in _v2_linkage_choices:
    for model in _group:
        BEST_PARAMS_v2["AgglomerativeClustering"][model].update(
            {"distance_metric": _dist, "aggclust_linkage": _link}
        )

In [None]:
# Copy each model's selected metric and linkage from agglink_choices_df into
# the v2 parameters. `.item()` raises unless exactly one table row matches.
for model in RESNET50_MODELS + VITB16_MODELS:
    _choice = agglink_choices_df[agglink_choices_df["model"] == model]
    _selected_metric = _choice["distance_metric"].item()
    _selected_linkage = _choice["aggclust_linkage"].item()
    BEST_PARAMS_v2["AgglomerativeClustering"][model].update(
        {
            "distance_metric": _selected_metric,
            "aggclust_linkage": _selected_linkage,
        }
    )

### AgglomerativeClustering distance threshold

In [None]:
# Gather the AMI of AgglomerativeClustering runs over a sweep of distance
# thresholds, for every model and dataset.
models = RESNET50_MODELS + VITB16_MODELS
cmap = categorical_cmap(len(models), len(VALIDATION_DATASETS))
clusterer = "AgglomerativeClustering"
# Thresholds swept, from 1e-3 to 5e3 in 1-2-5 steps
distance_thresholds = [
    0.001,
    0.002,
    0.005,
    0.01,
    0.02,
    0.05,
    0.1,
    0.2,
    0.5,
    1.0,
    2.0,
    5.0,
    10.0,
    20.0,
    50.0,
    100.0,
    200.0,
    500.0,
    1000.0,
    2000.0,
    5000.0,
]

# Run with standardization to make things more comparable across datasets
BEST_PARAMS = BEST_PARAMS_v2
override_fields = {
    "zscore2": "average",
    "ndim_correction": True,
}
# No standardization (original configuration)
# override_fields = {}

# Indexed by (model, dataset, threshold); entries stay NaN where no run was
# found. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data = np.full(
    (len(models), len(VALIDATION_DATASETS), len(distance_thresholds)), np.nan
)
# Commands for the missing configurations, printed at the end of the cell
cmds = []
for i_model, model in enumerate(models):
    for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
        for i_thr, thr in enumerate(distance_thresholds):
            filter = {
                "model": model,
                "dataset": dataset,
                "clusterer": clusterer,
                "aggclust_dist_thresh": thr,
            }
            # Overrides take precedence over the defaults / best parameters
            filter.update(override_fields)
            sdf = select_rows(runs_df, filter, allow_missing=False)
            filter2 = dict(DEFAULT_PARAMS["all"], **BEST_PARAMS[clusterer][model])
            filter2 = {k: v for k, v in filter2.items() if k not in filter}
            sdf = select_rows(sdf, filter2, allow_missing=False)
            if len(sdf) < 1:
                print(f"No data for {filter} {filter2}")
                cmds.append(filter2command(filter, filter2))
                continue
            if len(sdf) > 1 and (sdf["AMI"] != sdf.iloc[0]["AMI"]).any():
                # Several runs matched and they disagree: report how they differ
                print()
                print("More than one result with AMIs:", list(sdf["AMI"]))
                print(f"for search {filter}\nand {filter2}")
                dif_cols = find_differing_columns(sdf, config_keys)
                print(f"columns which differ: {dif_cols}")
                if dif_cols:
                    for col in dif_cols:
                        print(f"  {col}: {list(sdf[col])}")
            # Use the median when more than one run matched
            data[i_model, i_dataset, i_thr] = np.median(sdf["AMI"])

if cmds:
    print()
for cmd in cmds:
    print(cmd)

In [None]:
# Display the shape of the AMI results array
data.shape

In [None]:
# For each model: plot performance against the distance threshold for every
# dataset, pick the threshold that maximises the weighted mean, and record it.
aggthresh_choices_rows = []

# weights_val = np.ones((len(VALIDATION_DATASETS), ), dtype=int)
weights_val = np.array([2 if d == "imagenet" else 1 for d in VALIDATION_DATASETS])

# Relative performance: each (model, dataset) curve divided by its own best AMI
data_rel = data / np.nanmax(data, axis=-1, keepdims=True)
mu_ami = np.average(data, axis=1, weights=weights_val)
mu_amirel = np.average(data_rel, axis=1, weights=weights_val)

# Quantity that is plotted and used for the selection
# my_data, option = data, "AMI"
my_data = data_rel
option = "rel-AMI"

for i_model, model in enumerate(models):
    plt.figure(figsize=(10, 5))
    ax = plt.axes()
    # Individual datasets
    # plt.plot(distance_thresholds, data[i_model].T, ":", color="grey")
    for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
        # Square markers for imagenet, circles for the other datasets
        marker = "s" if dataset == "imagenet" else "o"
        ls = DATASET2LS.get(dataset) + marker
        c = "grey"  # "green" if dataset == "imagenet" else "grey"
        ms = 5  # 7 if dataset == "imagenet" else 5
        plt.plot(
            distance_thresholds,
            my_data[i_model, i_dataset],
            ls,
            markersize=ms,
            color=c,
        )
    plt.xlabel("Distance threshold")
    plt.ylabel(option)
    plt.ylim([-0.05, 1.05])
    # Log-scale the x-axis only when this model has at least one result
    if np.any(~np.isnan(data[i_model])):
        plt.xscale("log")

    # Weighted mean over datasets
    mu_data = np.average(my_data[i_model], axis=0, weights=weights_val)
    if np.isnan(mu_data).all():
        print(f"No data for {model}")
        continue

    i_thr = np.nanargmax(mu_data)
    best_thr = distance_thresholds[i_thr]
    plt.plot(best_thr, mu_data[i_thr], "x")
    plt.plot(distance_thresholds, mu_data, color="black")
    plt.title(f"{model} (thr={best_thr}, {option}={mu_data[i_thr]})")
    plt.ylim([-0.05, 1.05])
    plt.show()

    # Record both measures at the chosen threshold
    row = {
        "model": model,
        "aggclust_dist_thresh": best_thr,
        "rel-AMI": mu_amirel[i_model, i_thr],
        "AMI": mu_ami[i_model, i_thr],
    }
    aggthresh_choices_rows.append(row)

In [None]:
# Display the recorded threshold choices
aggthresh_choices_rows

In [None]:
# Tabulate the selected distance threshold (with its rel-AMI and AMI) per model
aggthresh_choices_df = pd.DataFrame.from_dict(aggthresh_choices_rows)
aggthresh_choices_df

In [None]:
# Print the table in markdown format
print(aggthresh_choices_df.to_markdown())

Original version, with original metric/linkage selection (equal weighting), and equal weighting for threshold selection too.

|    | model                         |   aggclust_dist_thresh |   rel-AMI |      AMI |
|---:|:------------------------------|-----------------------:|----------:|---------:|
|  0 | resnet50                      |                   20   |  0.945248 | 0.863248 |
|  1 | mocov3_resnet50               |                   20   |  0.97133  | 0.702498 |
|  2 | vicreg_resnet50               |                   20   |  0.989263 | 0.663419 |
|  3 | dino_resnet50                 |                    1   |  0.988309 | 0.634068 |
|  4 | clip_RN50                     |                    1   |  0.973346 | 0.605612 |
|  5 | vitb16                        |                   20   |  0.957287 | 0.912559 |
|  6 | mocov3_vit_base               |                    1   |  0.930347 | 0.721339 |
|  7 | timm_vit_base_patch16_224.mae |                  200   |  0.824798 | 0.315614 |
|  8 | dino_vitb16                   |                    2   |  0.937301 | 0.790191 |
|  9 | clip_vitb16                   |                    0.5 |  0.954692 | 0.706133 |


Use original metric/linkage, but change to 2:1:1 dataset weighting for threshold selection instead

|    | model                         |   aggclust_dist_thresh |   rel-AMI |      AMI |
|---:|:------------------------------|-----------------------:|----------:|---------:|
|  0 | resnet50                      |                   10   |  0.937077 | 0.827762 |
|  1 | mocov3_resnet50               |                   20   |  0.965668 | 0.648536 |
|  2 | vicreg_resnet50               |                   20   |  0.985394 | 0.616488 |
|  3 | dino_resnet50                 |                    1   |  0.983208 | 0.593935 |
|  4 | clip_RN50                     |                    1   |  0.974061 | 0.57489  |
|  5 | vitb16                        |                   10   |  0.942607 | 0.884239 |
|  6 | mocov3_vit_base               |                    0.5 |  0.917765 | 0.671893 |
|  7 | timm_vit_base_patch16_224.mae |                  200   |  0.868598 | 0.306396 |
|  8 | dino_vitb16                   |                    2   |  0.908903 | 0.735008 |
|  9 | clip_vitb16                   |                    0.5 |  0.954431 | 0.672023 |

Change to selecting by rel-AMI instead (original metric/linkage, 2:1:1 weighting).

|    | model                         |   aggclust_dist_thresh |   rel-AMI |      AMI |
|---:|:------------------------------|-----------------------:|----------:|---------:|
|  0 | resnet50                      |                   10   |  0.937077 | 0.827762 |
|  1 | mocov3_resnet50               |                   20   |  0.965668 | 0.648536 |
|  2 | vicreg_resnet50               |                   20   |  0.985394 | 0.616488 |
|  3 | dino_resnet50                 |                    1   |  0.983208 | 0.593935 |
|  4 | clip_RN50                     |                    1   |  0.974061 | 0.57489  |
|  5 | vitb16                        |                   10   |  0.942607 | 0.884239 |
|  6 | mocov3_vit_base               |                    0.5 |  0.917765 | 0.671893 |
|  7 | timm_vit_base_patch16_224.mae |                  100   |  0.871321 | 0.294255 |
|  8 | dino_vitb16                   |                    1   |  0.911006 | 0.725832 |
|  9 | clip_vitb16                   |                    0.5 |  0.954431 | 0.672023 |

Changing to the new linkage selection (including cosine distance), and also normalizing the reduced embeddings.
N.B. VICReg and MAE changed to cosine similarity when the linkage selection method was updated. (Work in progress.)

|    | model                         |   aggclust_dist_thresh |   rel-AMI |      AMI |
|---:|:------------------------------|-----------------------:|----------:|---------:|
|  0 | resnet50                      |                    2   |  0.948073 | 0.838784 |
|  1 | mocov3_resnet50               |                   10   |  0.966078 | 0.64681  |
|  2 | vicreg_resnet50               |                   10   |  0.971546 | 0.616755 |
|  3 | dino_resnet50                 |                    0.5 |  0.971055 | 0.590095 |
|  4 | clip_RN50                     |                    0.5 |  0.979172 | 0.577304 |
|  5 | vitb16                        |                    2   |  0.949133 | 0.889504 |
|  6 | mocov3_vit_base               |                    0.5 |  0.874281 | 0.631526 |
|  7 | timm_vit_base_patch16_224.mae |                    5   |  0.89496  | 0.29343  |
|  8 | clip_vitb16                   |                    1   |  0.939767 | 0.661225 |

Excluding cosine distance

|    | model                         |   aggclust_dist_thresh |   rel-AMI |      AMI |
|---:|:------------------------------|-----------------------:|----------:|---------:|
|  0 | resnet50                      |                    2   |  0.948073 | 0.838784 |
|  1 | mocov3_resnet50               |                   10   |  0.966078 | 0.64681  |
|  2 | vicreg_resnet50               |                    0.5 |  0.951247 | 0.586766 |
|  3 | dino_resnet50                 |                    0.5 |  0.971055 | 0.590095 |
|  4 | clip_RN50                     |                    0.5 |  0.979172 | 0.577304 |
|  5 | vitb16                        |                    2   |  0.946852 | 0.889504 |
|  6 | mocov3_vit_base               |                    1   |  0.896512 | 0.655649 |
|  7 | timm_vit_base_patch16_224.mae |                    5   |  0.89496  | 0.29343  |
|  8 | dino_vitb16                   |                    0.2 |  0.905301 | 0.722863 |
|  9 | clip_vitb16                   |                    1   |  0.939767 | 0.661225 |


In [None]:
# AgglomerativeClustering distance thresholds used when the number of
# clusters is unknown (v1 parameter set).
#
#   threshold   models
#   ---------   ------
#        20.0   resnet50, mocov3_resnet50, vicreg_resnet50, vitb16
#         1.0   dino_resnet50, clip_RN50, mocov3_vit_base
#         2.0   dino_vitb16
#         0.5   clip_vitb16
#       200.0   timm_vit_base_patch16_224.mae
_agg_v1 = BEST_PARAMS_v1["AgglomerativeClustering"]

# Models which share a threshold are handled together.
for _thresh, _group in (
    (20.0, ("resnet50", "mocov3_resnet50", "vicreg_resnet50", "vitb16")),
    (1.0, ("dino_resnet50", "clip_RN50", "mocov3_vit_base")),
):
    for model in _group:
        _agg_v1[model]["aggclust_dist_thresh"] = _thresh

# Models with a threshold of their own.
_agg_v1["dino_vitb16"].update({"aggclust_dist_thresh": 2.0})
_agg_v1["clip_vitb16"].update({"aggclust_dist_thresh": 0.5})
_agg_v1["timm_vit_base_patch16_224.mae"].update({"aggclust_dist_thresh": 200.0})

In [None]:
# v2 AgglomerativeClustering distance thresholds, one per encoder, applied in
# the order listed.
_agg_v2 = BEST_PARAMS_v2["AgglomerativeClustering"]
for _encoder, _thresh in (
    ("resnet50", 2.0),
    ("mocov3_resnet50", 10.0),
    ("vicreg_resnet50", 0.5),
    ("dino_resnet50", 0.5),
    ("clip_RN50", 0.5),
    ("vitb16", 2.0),
    ("mocov3_vit_base", 1.0),
    ("timm_vit_base_patch16_224.mae", 5.0),
    ("dino_vitb16", 0.2),
    ("clip_vitb16", 1.0),
):
    _agg_v2[_encoder]["aggclust_dist_thresh"] = _thresh

In [None]:
# Overwrite each model's v2 threshold with the value recorded for it in
# aggthresh_choices_df (``.item()`` raises unless exactly one row matches).
for model in RESNET50_MODELS + VITB16_MODELS:
    _target = BEST_PARAMS_v2["AgglomerativeClustering"][model]
    _is_model = aggthresh_choices_df["model"] == model
    _chosen = aggthresh_choices_df[_is_model]["aggclust_dist_thresh"]
    _target["aggclust_dist_thresh"] = _chosen.item()

## Affinity Prop

### Convergence threshold

In [None]:
# Gather validation AMI for AffinityPropagation at each convergence threshold
# (``affinity_conv_iter``), per model and validation dataset.
models = RESNET50_MODELS + VITB16_MODELS
cmap = categorical_cmap(len(models), len(VALIDATION_DATASETS))
clusterer = "AffinityPropagation"
convergence_thresholds = [15, 20, 30, 45, 60, 90]

# Extra config fields to pin; they take precedence over the per-run filter and
# over the defaults/best parameters applied below.
override_fields = {}

# data[i_model, i_dataset, i_thr] is the median AMI of the matching runs and
# stays NaN where none were found. (np.nan is used because the np.NaN alias
# was removed in NumPy 2.0.)
data = np.full(
    (len(models), len(VALIDATION_DATASETS), len(convergence_thresholds)),
    np.nan,
)
# Output of filter2command for each configuration that has no results.
cmds = []
for i_model, model in enumerate(models):
    for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
        for i_thr, thr in enumerate(convergence_thresholds):
            # Fields this sweep fixes explicitly.
            filter = {
                "model": model,
                "dataset": dataset,
                "clusterer": clusterer,
                "affinity_conv_iter": thr,
            }
            filter.update(override_fields)
            sdf = select_rows(runs_df, filter, allow_missing=False)
            # Remaining fields come from the defaults merged with the best
            # parameters for this clusterer/model; keys already present in
            # `filter` are dropped so they cannot contradict the sweep.
            filter2 = dict(DEFAULT_PARAMS["all"], **BEST_PARAMS[clusterer][model])
            filter2 = {k: v for k, v in filter2.items() if k not in filter}
            sdf = select_rows(sdf, filter2, allow_missing=False)
            if len(sdf) < 1:
                # Missing imagenet results are skipped without being reported.
                if dataset == "imagenet":
                    continue
                print(f"No data for {filter} {filter2}")
                cmds.append(filter2command(filter, filter2))
                continue
            if len(sdf) > 1 and (sdf["AMI"] != sdf.iloc[0]["AMI"]).any():
                # Several runs match but disagree on AMI: report which config
                # columns differ between them.
                print()
                print("More than one result with AMIs:", list(sdf["AMI"]))
                print(f"for search {filter}\nand {filter2}")
                dif_cols = find_differing_columns(sdf, config_keys)
                print(f"columns which differ: {dif_cols}")
                if dif_cols:
                    for col in dif_cols:
                        print(f"  {col}: {list(sdf[col])}")
            data[i_model, i_dataset, i_thr] = np.median(sdf["AMI"])

if cmds:
    print()
for cmd in cmds:
    print(cmd)

In [None]:
# Plot AMI against the AffinityPropagation convergence threshold for each model
# and record the threshold with the highest dataset-averaged AMI.
affthresh_choices_rows = []

for i_model, model in enumerate(models):
    plt.figure(figsize=(10, 5))
    ax = plt.axes()
    # indiv
    for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
        # Matplotlib format string: the dataset's line style plus a marker
        # (squares for imagenet, circles otherwise).
        ls = DATASET2LS.get(dataset)
        ls = ls + ("s" if dataset == "imagenet" else "o")
        plt.plot(
            convergence_thresholds,
            data[i_model, i_dataset],
            ls,
            markersize=5,
            color="grey",
        )
    # mean
    # Unweighted mean over datasets, ignoring datasets with no result.
    mu_data = np.nanmean(data[i_model], axis=0)
    if np.all(np.isnan(mu_data)):
        # NOTE(review): the figure opened above is neither shown nor closed on
        # this path.
        print(f"No data for {model}")
        continue
    # Mark the best threshold with a cross and draw the mean curve in black.
    i_thr = np.nanargmax(mu_data)
    best_thr = convergence_thresholds[i_thr]
    plt.plot(best_thr, mu_data[i_thr], "x")
    plt.plot(convergence_thresholds, mu_data, color="black")
    plt.title(f"{model} (thr={best_thr}, AMI={mu_data[i_thr]})")
    # plt.xscale("log")
    plt.xlabel("Convergence threshold")
    plt.ylabel("AMI")
    plt.ylim([-0.05, 1.05])
    # plt.xscale("log")
    plt.grid()
    plt.show()
    # Models without any data never reach this point, so they get no row.
    row = {"model": model, "affinity_conv_iter": best_thr}
    affthresh_choices_rows.append(row)

### Damping

In [None]:
# Gather validation AMI for AffinityPropagation at each damping value, with
# the convergence threshold held at ``affinity_conv_iter``.
models = RESNET50_MODELS + VITB16_MODELS
cmap = categorical_cmap(len(models), len(VALIDATION_DATASETS))
clusterer = "AffinityPropagation"
damping_values = [0.5, 0.6, 0.7, 0.8, 0.9]
affinity_conv_iter = 90

# Extra config fields to pin; they take precedence over the per-run filter and
# over the defaults/best parameters applied below.
override_fields = {}

# data[i_model, i_dataset, i_damping] is the median AMI of the matching runs
# and stays NaN where none were found. (np.nan is used because the np.NaN
# alias was removed in NumPy 2.0.)
data = np.full((len(models), len(VALIDATION_DATASETS), len(damping_values)), np.nan)
# Output of filter2command for each configuration that has no results.
cmds = []
for i_model, model in enumerate(models):
    for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
        for i_damping, damping in enumerate(damping_values):
            # Fields this sweep fixes explicitly.
            filter = {
                "model": model,
                "dataset": dataset,
                "clusterer": clusterer,
                "affinity_conv_iter": affinity_conv_iter,
                "affinity_damping": damping,
            }
            filter.update(override_fields)
            sdf = select_rows(runs_df, filter, allow_missing=False)
            # Remaining fields come from the defaults merged with the best
            # parameters for this clusterer/model; keys already present in
            # `filter` are dropped so they cannot contradict the sweep.
            filter2 = dict(DEFAULT_PARAMS["all"], **BEST_PARAMS[clusterer][model])
            filter2 = {k: v for k, v in filter2.items() if k not in filter}
            sdf = select_rows(sdf, filter2, allow_missing=False)
            if len(sdf) < 1:
                # Missing imagenet results are skipped without being reported.
                if dataset == "imagenet":
                    continue
                print(f"No data for {filter} {filter2}")
                cmds.append(filter2command(filter, filter2))
                continue
            if len(sdf) > 1 and (sdf["AMI"] != sdf.iloc[0]["AMI"]).any():
                # Several runs match but disagree on AMI: report which config
                # columns differ between them.
                print()
                print("More than one result with AMIs:", list(sdf["AMI"]))
                print(f"for search {filter}\nand {filter2}")
                dif_cols = find_differing_columns(sdf, config_keys)
                print(f"columns which differ: {dif_cols}")
                if dif_cols:
                    for col in dif_cols:
                        print(f"  {col}: {list(sdf[col])}")
            data[i_model, i_dataset, i_damping] = np.median(sdf["AMI"])

if cmds:
    print()
for cmd in cmds:
    print(cmd)

In [None]:
# Plot AMI against the AffinityPropagation damping value for each model and
# record the damping with the highest dataset-averaged AMI.
affdamping_choices_rows = []

for i_model, model in enumerate(models):
    plt.figure(figsize=(10, 5))
    ax = plt.axes()
    # indiv
    for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
        # Matplotlib format string: the dataset's line style plus a marker
        # (squares for imagenet, circles otherwise).
        ls = DATASET2LS.get(dataset)
        ls = ls + ("s" if dataset == "imagenet" else "o")
        plt.plot(
            damping_values, data[i_model, i_dataset], ls, markersize=5, color="grey"
        )
    # mean
    # Unweighted mean over datasets, ignoring datasets with no result.
    mu_data = np.nanmean(data[i_model], axis=0)
    if np.all(np.isnan(mu_data)):
        print(f"No data for {model}")
        continue
    # Mark the best damping with a cross and draw the mean curve in black.
    i_thr = np.nanargmax(mu_data)
    best_thr = damping_values[i_thr]
    plt.plot(best_thr, mu_data[i_thr], "x")
    plt.plot(damping_values, mu_data, color="black")
    plt.title(f"{model} (thr={best_thr}, AMI={mu_data[i_thr]})")
    plt.xlabel("Damping")
    plt.ylabel("AMI")
    plt.ylim([-0.05, 1.05])
    # plt.xscale("log")
    plt.grid()
    plt.show()
    # best_thr is a damping value here, so it is stored under the damping
    # config key rather than "affinity_conv_iter".
    row = {"model": model, "affinity_damping": best_thr}
    affdamping_choices_rows.append(row)

## HDBSCAN redux

Redo the HDBSCAN method and distance-metric selection with dimensionality reduction in place.

### EOM vs leaf and distance metric

In [None]:
# Gather validation AMI for HDBSCAN for every combination of cluster selection
# method and distance metric, per model and validation dataset.
models = RESNET50_MODELS + VITB16_MODELS
cmap = categorical_cmap(len(models), len(VALIDATION_DATASETS))
clusterer = "HDBSCAN"
methods = ["eom", "leaf"]
metrics = ["euclidean", "l1", "chebyshev", "arccos"]  # , "braycurtis", "canberra"]

# data[i_model, i_dataset, i_method, i_metric] is the median AMI of the
# matching runs and stays NaN where none were found. (np.nan is used because
# the np.NaN alias was removed in NumPy 2.0.)
data = np.full(
    (len(models), len(VALIDATION_DATASETS), len(methods), len(metrics)),
    np.nan,
)
# Output of filter2command for each configuration that has no results.
cmds = []
for i_model, model in enumerate(models):
    for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
        for i_method, method in enumerate(methods):
            for i_metric, metric in enumerate(metrics):
                # Guard only: "cosine" is not in `metrics` as defined above.
                if metric == "cosine":
                    continue
                # Fields this sweep fixes explicitly.
                filter = {
                    "model": model,
                    "dataset": dataset,
                    "clusterer": clusterer,
                    "distance_metric": metric,
                    "hdbscan_method": method,
                }
                sdf = select_rows(runs_df, filter, allow_missing=False)
                # Remaining fields come from the defaults merged with the best
                # parameters for this clusterer/model; keys already present in
                # `filter` are dropped so they cannot contradict the sweep.
                filter2 = dict(DEFAULT_PARAMS["all"], **BEST_PARAMS[clusterer][model])
                filter2 = {k: v for k, v in filter2.items() if k not in filter}
                sdf = select_rows(sdf, filter2, allow_missing=False)
                if len(sdf) < 1:
                    print("No data for", filter)
                    cmds.append(filter2command(filter, filter2))
                    continue
                if len(sdf) > 1 and (sdf["AMI"] != sdf.iloc[0]["AMI"]).any():
                    # Several runs match but disagree on AMI: report which
                    # config columns differ between them.
                    print()
                    print("More than one result with AMIs:", list(sdf["AMI"]))
                    print(f"for search {filter} and {filter2}")
                    dif_cols = find_differing_columns(sdf, config_keys)
                    print(f"columns which differ: {dif_cols}")
                    if dif_cols:
                        for col in dif_cols:
                            print(f"  {col}: {list(sdf[col])}")
                data[i_model, i_dataset, i_method, i_metric] = np.median(sdf["AMI"])

for cmd in cmds:
    print(cmd)

In [None]:
# Mean AMI over datasets (axis 1) and then models, leaving a (method, metric)
# grid; any NaN entry propagates into the result.
np.mean(np.mean(data, axis=1), axis=0)

In [None]:
# Same (method, metric) summary, but ignoring NaN (missing) entries.
np.nanmean(np.nanmean(data, axis=1), axis=0)

In [None]:
# Spot check: AMI per metric for the first model, dataset and method.
data[0, 0, 0, :]

In [None]:
# One figure per method: AMI per metric as crosses, one series per
# (model, dataset) pair, with the y-range scaled to the largest observed AMI.
max_data = np.nanmax(data)
YLIM = [-0.05 * max_data, 1.05 * max_data]
for i_method, method in enumerate(methods):
    plt.figure(figsize=(10, 8))
    ax = plt.axes()
    i = 0
    # Only the ResNet-50 models are drawn. Indexing `data` by position in
    # RESNET50_MODELS works because `models` starts with RESNET50_MODELS.
    for i_model, model in enumerate(RESNET50_MODELS):
        for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
            plt.plot(
                data[i_model, i_dataset, i_method, :],
                "x",
                label=f"{dataset}: {model}",
                c=cmap(i),
            )
            i += 1
    # plt.legend()
    ax.set_xticks(np.arange(len(metrics)), metrics)
    plt.ylim(YLIM)
    plt.title(method)
    plt.show()

In [None]:
# One figure per method: grouped bars of AMI per metric, with one thin bar per
# (model, dataset) pair inside each metric group.
width = 0.02
YLIM = [-0.05, 1.05]
for i_method, method in enumerate(methods):
    plt.figure(figsize=(15, 8))
    ax = plt.axes()
    # Running series index: sets both the bar offset and the colour.
    i = 0
    for i_model, model in enumerate(models):
        for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
            plt.bar(
                np.arange(len(metrics)) + i * width,
                data[i_model, i_dataset, i_method, :],
                width=width,
                label=f"{dataset}: {model}",
                color=cmap(i),
            )
            i += 1
    # plt.legend()
    # Place each tick near the middle of its group of bars.
    ax.set_xticks(np.arange(len(metrics)) + width * (i + 1) / 2, metrics)
    plt.ylim(YLIM)
    plt.title(method)
    plt.show()

In [None]:
# Axes are (model, dataset, method, metric).
data.shape

In [None]:
# One figure per model: for each validation dataset, one bar per
# (method, metric) combination.
width = 0.1
YLIM = [-0.05, 1.05]
# Rebinds the global `cmap` to a (method x metric) palette; later cells that
# call cmap() see this palette.
cmap = categorical_cmap(len(methods), len(metrics))
for i_model, model in enumerate(models):
    plt.figure(figsize=(10, 5))
    ax = plt.axes()
    # Incremented below but not used for positions or colours in this cell.
    i = 0
    for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
        for i_method, method in enumerate(methods):
            for i_metric, metric in enumerate(metrics):
                # Bars of one dataset sit side by side, ordered by method and
                # then metric; the same flat index picks the colour.
                plt.bar(
                    i_dataset + (i_metric + (i_method * len(metrics))) * width,
                    data[i_model, i_dataset, i_method, i_metric],
                    width=width,
                    label=f"{dataset}: {method}, {metric}",
                    color=cmap(i_metric + (i_method * len(metrics))),
                )
                i += 1
    # plt.legend()
    ax.set_xticks(
        np.arange(len(VALIDATION_DATASETS)) + width * (len(metrics) - 0.5),
        VALIDATION_DATASETS,
    )
    plt.ylim(YLIM)
    plt.ylabel("AMI")
    plt.title(model)
    plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    plt.show()

In [None]:
# Choose the HDBSCAN method and distance metric per model from the
# dataset-weighted mean AMI, and plot the weighted means.
# weights_val = np.ones((len(VALIDATION_DATASETS), ), dtype=int)
# imagenet counts twice as much as each of the other validation datasets.
weights_val = np.array([2 if d == "imagenet" else 1 for d in VALIDATION_DATASETS])

hdbscan_choices_rows = []
width = 0.1

# np.average does not skip NaN: a (model, method, metric) entry is NaN as soon
# as one dataset is missing.
avg_ami = np.average(data, axis=1, weights=weights_val)
for i_model, model in enumerate(models):
    plt.figure(figsize=(10, 5))
    min_ami = 1
    max_ami = 0
    ax = plt.axes()
    i = 0
    # best_str and the empty row are not read before being reassigned/unused
    # in this cell.
    best_str = "NO DATA"
    row = {}
    for i_method, method in enumerate(methods):
        my_val = avg_ami[i_model, i_method, :]
        plt.bar(
            np.arange(len(metrics)) + i * width,
            my_val,
            width=width * len(VALIDATION_DATASETS),
            label=f"{model} {method}",
            color=cmap(i),
        )
        # Track the overall AMI range for the y-limits below.
        my_val_best = np.nanmax(my_val)
        min_ami = np.nanmin([min_ami, np.nanmin(my_val)])
        max_ami = np.nanmax([max_ami, my_val_best])
        # NOTE(review): the offset advances by len(methods) while the bar
        # width and the tick positions use len(VALIDATION_DATASETS), so bars
        # of successive methods overlap — confirm whether this is intended.
        i += len(methods)

    # Exclude arccos distance because it doesn't make sense with UMAP generally
    # (the origin is ill-defined)
    # The `:3` slice relies on arccos being the fourth (last) entry of
    # `metrics`. np.nanargmax raises if every candidate is NaN.
    best_method_idx, best_metric_idx = np.unravel_index(
        np.nanargmax(avg_ami[i_model, :, :3]),
        avg_ami[i_model, :, :3].shape,
    )
    best_ami = avg_ami[i_model, best_method_idx, best_metric_idx]
    ax.set_xticks(
        np.arange(len(metrics)) + width * (i - len(VALIDATION_DATASETS)) / 2, metrics
    )
    # Pad the observed AMI range by 5% on each side.
    YLIM = np.array([min_ami, max_ami])
    YLIM += np.array([-1, 1]) * 0.05 * (YLIM[1] - YLIM[0])
    plt.ylim(YLIM)
    plt.ylabel("AMI")
    # Second- and third-highest weighted AMIs (arccos included), shown in the
    # title for comparison with the winner.
    top_k = np.sort(avg_ami[i_model][~np.isnan(avg_ami[i_model])], axis=None)[-2:-4:-1]
    plt.title(
        f"{model} : {metrics[best_metric_idx]} {methods[best_method_idx]}"
        f"  (AMI={best_ami:.3f})  ... {top_k}"
    )
    plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    plt.show()
    row = {
        "model": model,
        "distance_metric": metrics[best_metric_idx],
        "hdbscan_method": methods[best_method_idx],
        "AMI": best_ami,
    }
    hdbscan_choices_rows.append(row)

In [None]:
# Plot the dataset-weighted mean AMI per metric for the first method only
# (``methods[:1]``) and report its best metric in the title.
# weights_val = np.ones((len(VALIDATION_DATASETS), ), dtype=int)
# imagenet counts twice as much as each of the other validation datasets.
weights_val = np.array([2 if d == "imagenet" else 1 for d in VALIDATION_DATASETS])

width = 0.13

# np.average does not skip NaN: an entry is NaN as soon as one dataset is
# missing.
avg_ami = np.average(data, axis=1, weights=weights_val)
for i_model, model in enumerate(models):
    plt.figure(figsize=(10, 5))
    min_ami = 1
    max_ami = 0
    ax = plt.axes()
    i = 0
    best_str = "NO DATA"
    for i_method, method in enumerate(methods[:1]):
        my_val = avg_ami[i_model, i_method, :]
        # NOTE(review): `dataset` is not assigned in this cell, so the label
        # uses whatever value an earlier cell left behind.
        plt.bar(
            np.arange(len(metrics)) + i * width,
            my_val,
            width=width * len(VALIDATION_DATASETS),
            label=f"{dataset}: {model}",
            color=cmap(i),
        )
        my_val_best = np.nanmax(my_val)
        # Remember the best metric seen so far for the title.
        if my_val_best > max_ami:
            best_metric_idx = np.nanargmax(my_val)
            best_str = (
                f"{method} with {metrics[best_metric_idx]} (AMI={my_val_best:.5f})"
            )
        min_ami = np.nanmin([min_ami, np.nanmin(my_val)])
        max_ami = np.nanmax([max_ami, my_val_best])
        i += len(VALIDATION_DATASETS)
    # plt.legend()
    ax.set_xticks(
        np.arange(len(metrics)) + width * (i - len(VALIDATION_DATASETS)) / 2, metrics
    )
    # Pad the observed AMI range by 5% on each side.
    YLIM = np.array([min_ami, max_ami])
    YLIM += np.array([-1, 1]) * 0.05 * (YLIM[1] - YLIM[0])
    plt.ylim(YLIM)
    plt.ylabel("AMI")
    # Second- and third-highest weighted AMIs across all methods and metrics.
    top_k = np.sort(avg_ami[i_model][~np.isnan(avg_ami[i_model])], axis=None)[-2:-4:-1]
    plt.title(f"{model} : {best_str}  ... {top_k}")
    plt.show()

In [None]:
# One row per model with the selected metric/method, built from the rows
# collected in hdbscan_choices_rows; the bare name displays the table.
hdbscan_choices_df = pd.DataFrame.from_dict(hdbscan_choices_rows)
hdbscan_choices_df

In [None]:
# Print the selection as a markdown table.
print(hdbscan_choices_df.to_markdown())

In [None]:
# v1: every model uses HDBSCAN with euclidean distance and the "eom" method.
for model in RESNET50_MODELS + VITB16_MODELS:
    _hdbscan_v1 = BEST_PARAMS_v1["HDBSCAN"][model]
    _hdbscan_v1.update(distance_metric="euclidean", hdbscan_method="eom")

v2 selection

|    | model                         | distance_metric   | hdbscan_method   |      AMI |
|---:|:------------------------------|:------------------|:-----------------|---------:|
|  0 | resnet50                      | euclidean         | eom              | 0.828368 |
|  1 | mocov3_resnet50               | euclidean         | eom              | 0.531644 |
|  2 | vicreg_resnet50               | l1                | eom              | 0.472324 |
|  3 | dino_resnet50                 | l1                | eom              | 0.503147 |
|  4 | clip_RN50                     | l1                | eom              | 0.461363 |
|  5 | vitb16                        | chebyshev         | eom              | 0.906110 |
|  6 | mocov3_vit_base               | euclidean         | eom              | 0.629966 |
|  7 | timm_vit_base_patch16_224.mae | euclidean         | eom              | 0.070495 |
|  8 | dino_vitb16                   | l1                | eom              | 0.691547 |
|  9 | clip_vitb16                   | l1                | eom              | 0.592489 |

In [None]:
# v2 HDBSCAN selection, following the "v2 selection" table: euclidean + eom by
# default, l1 for five of the models, and chebyshev for vitb16.
for model in RESNET50_MODELS + VITB16_MODELS:
    BEST_PARAMS_v2["HDBSCAN"][model].update(
        {
            "distance_metric": "euclidean",
            "hdbscan_method": "eom",
        }
    )
for model in [
    "vicreg_resnet50",
    "dino_resnet50",
    "clip_RN50",
    "dino_vitb16",
    "clip_vitb16",
]:
    BEST_PARAMS_v2["HDBSCAN"][model].update(
        {
            "distance_metric": "l1",
        }
    )
# Address vitb16 explicitly rather than through the leftover loop variable
# `model`, which would hit clip_vitb16, and set a metric name rather than a
# model name.
BEST_PARAMS_v2["HDBSCAN"]["vitb16"]["distance_metric"] = "chebyshev"

In [None]:
# Copy each model's selected metric and method from hdbscan_choices_df into
# the v2 HDBSCAN parameters (``.item()`` raises unless exactly one row matches).
for model in RESNET50_MODELS + VITB16_MODELS:
    _target = BEST_PARAMS_v2["HDBSCAN"][model]
    _choice = hdbscan_choices_df[hdbscan_choices_df["model"] == model]
    _target.update(
        distance_metric=_choice["distance_metric"].item(),
        hdbscan_method=_choice["hdbscan_method"].item(),
    )

## OPTICS (WIP)

### dbscan vs xi (with xi threshold)

In [None]:
# Gather validation AMI for OPTICS with the "xi" cluster method, for every
# distance metric and xi threshold, per model and validation dataset.
models = RESNET50_MODELS + VITB16_MODELS
cmap = categorical_cmap(len(models), len(VALIDATION_DATASETS))
clusterer = "OPTICS"
metrics = ["euclidean", "l1", "chebyshev", "cosine"]
# Candidate xi values (the name is kept from the cell this was adapted from).
distance_thresholds = [
    0.01,
    0.02,
    0.03,
    0.04,
    0.05,
    0.07,
    0.1,
    0.15,
    0.2,
    0.3,
    0.4,
    0.5,
]

# Run with standardization to make things more comparable across datasets
override_fields = {
    "zscore2": "average",
    "ndim_correction": True,
}
# No standardization (original configuration)
# override_fields = {}

# Missing configurations are skipped silently while this section is a work in
# progress; set to True to print them and collect their commands.
report_missing = False

# data[i_model, i_dataset, i_metric, i_thr] is the median AMI of the matching
# runs and stays NaN where none were found. (np.nan is used because the np.NaN
# alias was removed in NumPy 2.0.)
data = np.full(
    (len(models), len(VALIDATION_DATASETS), len(metrics), len(distance_thresholds)),
    np.nan,
)
cmds = []
for i_model, model in enumerate(models):
    for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
        for i_metric, metric in enumerate(metrics):
            for i_thr, thr in enumerate(distance_thresholds):
                # The swept quantity is the xi threshold, so it is matched
                # against the OPTICS "optics_xi" field (not the
                # AgglomerativeClustering distance threshold).
                filter = {
                    "model": model,
                    "dataset": dataset,
                    "clusterer": clusterer,
                    "distance_metric": metric,
                    "optics_method": "xi",
                    "optics_xi": thr,
                }
                filter.update(override_fields)
                sdf = select_rows(runs_df, filter, allow_missing=False)
                # Remaining fields come from the defaults merged with the best
                # parameters for this clusterer/model; keys already present in
                # `filter` are dropped so they cannot contradict the sweep.
                filter2 = dict(DEFAULT_PARAMS["all"], **BEST_PARAMS[clusterer][model])
                filter2 = {k: v for k, v in filter2.items() if k not in filter}
                sdf = select_rows(sdf, filter2, allow_missing=False)
                if len(sdf) < 1:
                    if report_missing:
                        print(f"No data for {filter} {filter2}")
                        cmds.append(filter2command(filter, filter2))
                    continue
                if len(sdf) > 1 and (sdf["AMI"] != sdf.iloc[0]["AMI"]).any():
                    # Several runs match but disagree on AMI: report which
                    # config columns differ between them.
                    print()
                    print("More than one result with AMIs:", list(sdf["AMI"]))
                    print(f"for search {filter}\nand {filter2}")
                    dif_cols = find_differing_columns(sdf, config_keys)
                    print(f"columns which differ: {dif_cols}")
                    if dif_cols:
                        for col in dif_cols:
                            print(f"  {col}: {list(sdf[col])}")
                data[i_model, i_dataset, i_metric, i_thr] = np.median(sdf["AMI"])

if cmds:
    print()
for cmd in cmds:
    print(cmd)

In [None]:
# Gather validation AMI for OPTICS for every combination of cluster method
# ("dbscan" or "xi") and distance metric, per model and validation dataset.
models = RESNET50_MODELS + VITB16_MODELS
cmap = categorical_cmap(len(models), len(VALIDATION_DATASETS))
clusterer = "OPTICS"
methods = ["dbscan", "xi"]
metrics = ["euclidean", "l1", "chebyshev", "cosine"]

# data[i_model, i_dataset, i_method, i_metric] is the median AMI of the
# matching runs and stays NaN where none were found. (np.nan is used because
# the np.NaN alias was removed in NumPy 2.0.)
data = np.full(
    (len(models), len(VALIDATION_DATASETS), len(methods), len(metrics)),
    np.nan,
)
# Output of filter2command for each configuration that has no results.
cmds = []
for i_model, model in enumerate(models):
    for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
        for i_method, method in enumerate(methods):
            for i_metric, metric in enumerate(metrics):
                filter = {
                    "model": model,
                    "dataset": dataset,
                    "clusterer": clusterer,
                    "distance_metric": metric,
                    "optics_method": method,
                }
                # xi only applies to the "xi" cluster method, so the test is on
                # `method`; `metric` never takes the value "xi".
                if method == "xi":
                    filter["optics_xi"] = 0.05
                else:
                    filter["optics_xi"] = None
                sdf = select_rows(runs_df, filter, allow_missing=False)
                # Remaining fields come from the defaults merged with the best
                # parameters for this clusterer/model; keys already present in
                # `filter` are dropped so they cannot contradict the sweep.
                filter2 = dict(DEFAULT_PARAMS["all"], **BEST_PARAMS[clusterer][model])
                filter2 = {k: v for k, v in filter2.items() if k not in filter}
                sdf = select_rows(sdf, filter2, allow_missing=False)
                if len(sdf) < 1:
                    print("No data for", filter)
                    cmds.append(filter2command(filter, filter2))
                    continue
                if len(sdf) > 1 and (sdf["AMI"] != sdf.iloc[0]["AMI"]).any():
                    # Several runs match but disagree on AMI: report which
                    # config columns differ between them.
                    print()
                    print("More than one result with AMIs:", list(sdf["AMI"]))
                    print(f"for search {filter} and {filter2}")
                    dif_cols = find_differing_columns(sdf, config_keys)
                    print(f"columns which differ: {dif_cols}")
                    if dif_cols:
                        for col in dif_cols:
                            print(f"  {col}: {list(sdf[col])}")
                data[i_model, i_dataset, i_method, i_metric] = np.median(sdf["AMI"])

for cmd in cmds:
    print(cmd)

In [None]:
# Mean AMI over datasets (axis 1) and then models, leaving a (method, metric)
# grid; any NaN entry propagates into the result.
np.mean(np.mean(data, axis=1), axis=0)

In [None]:
# Same (method, metric) summary, but ignoring NaN (missing) entries.
np.nanmean(np.nanmean(data, axis=1), axis=0)

In [None]:
# Spot check: AMI per metric for the first model, dataset and method.
data[0, 0, 0, :]

In [None]:
# One figure per method: AMI per metric as crosses, one series per
# (model, dataset) pair, with the y-range scaled to the largest observed AMI.
max_data = np.nanmax(data)
YLIM = [-0.05 * max_data, 1.05 * max_data]
for i_method, method in enumerate(methods):
    plt.figure(figsize=(10, 8))
    ax = plt.axes()
    i = 0
    # Only the ResNet-50 models are drawn. Indexing `data` by position in
    # RESNET50_MODELS works because `models` starts with RESNET50_MODELS.
    for i_model, model in enumerate(RESNET50_MODELS):
        for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
            plt.plot(
                data[i_model, i_dataset, i_method, :],
                "x",
                label=f"{dataset}: {model}",
                c=cmap(i),
            )
            i += 1
    # plt.legend()
    ax.set_xticks(np.arange(len(metrics)), metrics)
    plt.ylim(YLIM)
    plt.title(method)
    plt.show()

In [None]:
# One figure per method: grouped bars of AMI per metric, with one thin bar per
# (model, dataset) pair inside each metric group.
width = 0.02
YLIM = [-0.05, 1.05]
for i_method, method in enumerate(methods):
    plt.figure(figsize=(15, 8))
    ax = plt.axes()
    # Running series index: sets both the bar offset and the colour.
    i = 0
    for i_model, model in enumerate(models):
        for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
            plt.bar(
                np.arange(len(metrics)) + i * width,
                data[i_model, i_dataset, i_method, :],
                width=width,
                label=f"{dataset}: {model}",
                color=cmap(i),
            )
            i += 1
    # plt.legend()
    # Place each tick near the middle of its group of bars.
    ax.set_xticks(np.arange(len(metrics)) + width * (i + 1) / 2, metrics)
    plt.ylim(YLIM)
    plt.title(method)
    plt.show()

In [None]:
# Axes are (model, dataset, method, metric).
data.shape

In [None]:
# One figure per model: grouped bars of AMI per metric, with one bar per
# (method, dataset) pair inside each metric group.
width = 0.12
YLIM = [-0.05, 1.05]
for i_model, model in enumerate(models):
    plt.figure(figsize=(10, 5))
    ax = plt.axes()
    # Running series index: sets both the bar offset and the colour.
    i = 0
    for i_method, method in enumerate(methods):
        for i_dataset, dataset in enumerate(VALIDATION_DATASETS):
            plt.bar(
                np.arange(len(metrics)) + i * width,
                data[i_model, i_dataset, i_method, :],
                width=width,
                label=f"{dataset}: {method}",
                color=cmap(i),
            )
            i += 1
    # plt.legend()
    # Centre each tick under its group of bars.
    ax.set_xticks(np.arange(len(metrics)) + width * (i - 1) / 2, metrics)
    plt.ylim(YLIM)
    plt.ylabel("AMI")
    plt.title(model)
    plt.show()

In [None]:
# Choose the best (method, metric) pair per model from the dataset-weighted
# mean AMI in `data`, and plot the weighted means.
# weights_val = np.ones((len(VALIDATION_DATASETS), ), dtype=int)
# imagenet counts twice as much as each of the other validation datasets.
weights_val = np.array([2 if d == "imagenet" else 1 for d in VALIDATION_DATASETS])

# NOTE(review): this resets hdbscan_choices_rows and, further down, stores the
# chosen method under "hdbscan_method", even though this section's `methods`
# are OPTICS methods — presumably carried over from the HDBSCAN cell; confirm.
hdbscan_choices_rows = []
width = 0.13

# np.average does not skip NaN: an entry is NaN as soon as one dataset is
# missing.
avg_ami = np.average(data, axis=1, weights=weights_val)
for i_model, model in enumerate(models):
    plt.figure(figsize=(10, 5))
    min_ami = 1
    max_ami = 0
    ax = plt.axes()
    i = 0
    # best_str and the empty row are not used further in this cell.
    best_str = "NO DATA"
    row = {}
    for i_method, method in enumerate(methods):
        my_val = avg_ami[i_model, i_method, :]
        plt.bar(
            np.arange(len(metrics)) + i * width,
            my_val,
            width=width * len(VALIDATION_DATASETS),
            label=f"{model} {method}",
            color=cmap(i),
        )
        # Track the overall AMI range for the y-limits below.
        my_val_best = np.nanmax(my_val)
        min_ami = np.nanmin([min_ami, np.nanmin(my_val)])
        max_ami = np.nanmax([max_ami, my_val_best])
        i += len(VALIDATION_DATASETS)

    # Best pair over all methods and metrics; np.nanargmax raises if every
    # entry for this model is NaN.
    best_method_idx, best_metric_idx = np.unravel_index(
        np.nanargmax(avg_ami[i_model]),
        avg_ami[i_model].shape,
    )
    ax.set_xticks(
        np.arange(len(metrics)) + width * (i - len(VALIDATION_DATASETS)) / 2, metrics
    )
    # Pad the observed AMI range by 5% on each side.
    YLIM = np.array([min_ami, max_ami])
    YLIM += np.array([-1, 1]) * 0.05 * (YLIM[1] - YLIM[0])
    plt.ylim(YLIM)
    plt.ylabel("AMI")
    # Second- and third-highest weighted AMIs, shown in the title for
    # comparison with the winner.
    top_k = np.sort(avg_ami[i_model][~np.isnan(avg_ami[i_model])], axis=None)[-2:-4:-1]
    plt.title(
        f"{model} : {metrics[best_metric_idx]} {methods[best_method_idx]}"
        f"  (AMI={avg_ami[i_model, best_method_idx, best_metric_idx]:.3f})"
        f"  ... {top_k}"
    )
    plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    plt.show()
    row = {
        "model": model,
        "distance_metric": metrics[best_metric_idx],
        "hdbscan_method": methods[best_method_idx],
    }
    hdbscan_choices_rows.append(row)

In [None]:
# Plot the dataset-weighted mean AMI per metric for the first method only
# (``methods[:1]``) and report its best metric in the title.
# weights_val = np.ones((len(VALIDATION_DATASETS), ), dtype=int)
# imagenet counts twice as much as each of the other validation datasets.
weights_val = np.array([2 if d == "imagenet" else 1 for d in VALIDATION_DATASETS])

width = 0.13

# np.average does not skip NaN: an entry is NaN as soon as one dataset is
# missing.
avg_ami = np.average(data, axis=1, weights=weights_val)
for i_model, model in enumerate(models):
    plt.figure(figsize=(10, 5))
    min_ami = 1
    max_ami = 0
    ax = plt.axes()
    i = 0
    best_str = "NO DATA"
    for i_method, method in enumerate(methods[:1]):
        my_val = avg_ami[i_model, i_method, :]
        # NOTE(review): `dataset` is not assigned in this cell, so the label
        # uses whatever value an earlier cell left behind.
        plt.bar(
            np.arange(len(metrics)) + i * width,
            my_val,
            width=width * len(VALIDATION_DATASETS),
            label=f"{dataset}: {model}",
            color=cmap(i),
        )
        my_val_best = np.nanmax(my_val)
        # Remember the best metric seen so far for the title.
        if my_val_best > max_ami:
            best_metric_idx = np.nanargmax(my_val)
            best_str = (
                f"{method} with {metrics[best_metric_idx]} (AMI={my_val_best:.5f})"
            )
        min_ami = np.nanmin([min_ami, np.nanmin(my_val)])
        max_ami = np.nanmax([max_ami, my_val_best])
        i += len(VALIDATION_DATASETS)
    # plt.legend()
    ax.set_xticks(
        np.arange(len(metrics)) + width * (i - len(VALIDATION_DATASETS)) / 2, metrics
    )
    # Pad the observed AMI range by 5% on each side.
    YLIM = np.array([min_ami, max_ami])
    YLIM += np.array([-1, 1]) * 0.05 * (YLIM[1] - YLIM[0])
    plt.ylim(YLIM)
    plt.ylabel("AMI")
    # Second- and third-highest weighted AMIs across all methods and metrics.
    top_k = np.sort(avg_ami[i_model][~np.isnan(avg_ami[i_model])], axis=None)[-2:-4:-1]
    plt.title(f"{model} : {best_str}  ... {top_k}")
    plt.show()

In [None]:
# Rebuild the choices table from hdbscan_choices_rows and display it.
# NOTE(review): when the cells run in order, these rows come from the OPTICS
# comparison above and replace the earlier HDBSCAN table — confirm the name.
hdbscan_choices_df = pd.DataFrame.from_dict(hdbscan_choices_rows)
hdbscan_choices_df

In [None]:
# Print the selection as a markdown table.
print(hdbscan_choices_df.to_markdown())

In [None]:
# Store the per-model OPTICS selection in BEST_PARAMS.
# When the cells run in order, hdbscan_choices_df was rebuilt from the OPTICS
# comparison in this section, so its "hdbscan_method" column holds OPTICS
# cluster methods ("dbscan"/"xi"). They therefore belong in the OPTICS entry
# under "optics_method"; writing them to BEST_PARAMS["HDBSCAN"] would replace
# the HDBSCAN selection with invalid method names.
for model in RESNET50_MODELS + VITB16_MODELS:
    # ``.item()`` raises unless exactly one row matches the model.
    choice = hdbscan_choices_df[hdbscan_choices_df["model"] == model]
    BEST_PARAMS["OPTICS"][model].update(
        {
            "distance_metric": choice["distance_metric"].item(),
            "optics_method": choice["hdbscan_method"].item(),
        }
    )

# Final results

In [None]:
import datetime

In [None]:
# Datasets reported in the final results; the result tables below emit one
# column per entry, in this order.
TEST_DATASETS = [
    "imagenet",
    "cifar10",
    "cifar100",
    "mnist",
    "fashionmnist",
    "svhn",
    "flowers102",
    "aircraft",
    "nabirds",
    "inaturalist",
]
# Short display names for datasets (used for table headers and plot legends).
DATASET2SH = {
    "aircraft": "Aircraft",
    "cifar10": "C10",
    "cifar100": "C100",
    "flowers102": "Flowers",
    "fashionmnist": "fMNIST",
    "imagenet": "IN1k",
    "imagenette": "IN10",
    "imagewoof": "INwf",
    "inaturalist": "iNat21",
    "mnist": "MNIST",
    "nabirds": "NABirds",
    "svhn": "SVHN",
}
# Encoders grouped by backbone architecture; keys double as display names.
MODEL_GROUPS = {
    "ResNet-50": RESNET50_MODELS,
    "ViT-B": VITB16_MODELS,
}
# Short display names for encoders. Names repeat across the two backbones.
MODEL2SH = {
    "resnet50": "Supervised",
    "mocov3_resnet50": "MoCo-v3",
    "vicreg_resnet50": "VICReg",
    "dino_resnet50": "DINO",
    "clip_RN50": "CLIP",
    "vitb16": "Supervised",
    "mocov3_vit_base": "MoCo-v3",
    "timm_vit_base_patch16_224.mae": "MAE",
    "dino_vitb16": "DINO",
    "clip_vitb16": "CLIP",
}
# Short display names for clusterers; lookups below use .get(), so clusterers
# that are not listed here keep their own name.
CLUSTERER2SH = {
    "KMeans": "K-Means",
    "AffinityPropagation": "Affinity Prop",
    "AgglomerativeClustering": "AC",
}

## Fetch results

In [None]:
# Project is specified by <entity/project-name>
api = wandb.Api()
runs_test = api.runs(
    "uoguelph_mlrg/zs-ssl-clustering",
    filters={"state": "Finished", "config.partition": "test"},
)
len(runs_test)

In [None]:
# Collect the raw pieces of every fetched run.
summary_list = []
config_list = []
name_list = []
for run in runs_test:
    # ._json_dict exposes the summary metrics while omitting large files.
    summary_list.append(run.summary._json_dict)
    # Hyperparameters, without the special entries whose name starts with "_".
    config_list.append(
        {k: v for k, v in run.config.items() if not k.startswith("_")}
    )
    # Human-readable run name.
    name_list.append(run.name)

# Flatten each run into one record. Later entries win on key clashes, so
# summary values override config values, which override the run name.
rows = []
config_keys = set()
summary_keys = set()
for summary, config, name in zip(summary_list, config_list, name_list):
    row = {
        "name": name,
        **{k: v for k, v in config.items() if not k.startswith("_")},
        **{k: v for k, v in summary.items() if not k.startswith("_")},
        # The only underscore-prefixed summary entry that is kept.
        "_timestamp": summary["_timestamp"],
    }
    rows.append(row)
    # Remember every config / summary key seen across all runs.
    config_keys = config_keys | set(config.keys())
    summary_keys = summary_keys | set(summary.keys())

test_runs_df = pd.DataFrame.from_records(rows)

# Handle changed default value for spectral_assigner after config arg was introduced
if "spectral_assigner" not in test_runs_df.columns:
    test_runs_df["spectral_assigner"] = None
# The setting is meaningless for every other clusterer, so blank it there.
select = test_runs_df["clusterer_name"] != "SpectralClustering"
test_runs_df.loc[select, "spectral_assigner"] = None
# SpectralClustering runs without a recorded value are filled with "kmeans".
select = (test_runs_df["clusterer_name"] == "SpectralClustering") & (
    test_runs_df["spectral_assigner"].isna()
)
test_runs_df.loc[select, "spectral_assigner"] = "kmeans"

# Boolean options that may be absent from some (or all) runs: make sure the
# column exists and treat missing values as False.
for flag_col in ("zscore2", "ndim_correction"):
    if flag_col not in test_runs_df.columns:
        test_runs_df[flag_col] = False
    test_runs_df.loc[test_runs_df[flag_col].isna(), flag_col] = False

In [None]:
# Exclude worker-count and memory settings from the config keys; config_keys
# is what the table cells below pass to find_differing_columns.
config_keys = config_keys - {
    "workers",
    "memory_avail_GB",
    "memory_total_GB",
    "memory_slurm",
}

In [None]:
# Display the assembled table of test-partition runs.
test_runs_df

In [None]:
# List the distinct dataset names present among the fetched runs.
list(test_runs_df["dataset_name"].unique())

In [None]:
# Settings for the single-clusterer results table built below.
metric_key = "AMI"  # column of test_runs_df that is reported
show_pc = True  # multiply values by 100 before formatting
show_fmt = "{:5.1f}"  # format applied to every table value
eps = 0.001  # tolerance when marking best / second-best values
# Extra column constraints merged into the secondary filter of every cell.
override_fields = {
    # "aggclust_dist_thresh": None,  # to flip between unknown/known n clusters for AC
}
BEST_PARAMS = BEST_PARAMS_v1

# KMeans  AffinityPropagation  AgglomerativeClustering  HDBSCAN
clusterer = "AgglomerativeClustering"

# Per-dataset list of every cell value, filled during the first table pass.
best_results = {k: [] for k in TEST_DATASETS}
# Build the LaTeX results table for `clusterer`: one row per encoder (grouped
# by backbone), one column per test dataset.
#
# The grid is traversed twice. The first pass (dummy=True) only records every
# cell's value in best_results; the second pass writes the table and uses the
# recorded values to mark the best (\tcf) and second-best (\tcs) entry of
# each dataset column. `cmds` collects commands for configurations that have
# no result yet; `latex_table` and `cmds` from the last pass are what the
# statements after the loop print.
for dummy in [True, False]:
    cmds = []
    # Table preamble: provenance comments, label and column specification.
    latex_table = r"% Results for " + f"{metric_key}, {clusterer}" + "\n"
    now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    latex_table += r"% Generated " + now_str + "\n"
    latex_table += r"\label{tab:" + clusterer + r"}" + "\n"
    latex_table += r"\resizebox{\textwidth}{!}{%" + "\n"
    latex_table += r"\begin{tabular}{ll" + r"r" * len(TEST_DATASETS) + r"}" + "\n"
    latex_table += r"\toprule" + "\n"
    # Header row with the short dataset names.
    latex_table += r"& " + f"{'Encoder':<11s}"
    for dataset in TEST_DATASETS:
        latex_table += r"&" + "{:^15s}".format(DATASET2SH.get(dataset, dataset))
    latex_table += r"\\" + "\n"
    latex_table += r"\toprule" + "\n"
    for i_group, model_group_name in enumerate(list(MODEL_GROUPS.keys())):
        if i_group > 0:
            latex_table += r"\midrule" + "\n"
        for i_model, model in enumerate(MODEL_GROUPS[model_group_name]):
            if i_model == 0:
                # Rotated backbone name spanning the rows of this group.
                latex_table += (
                    r"\parbox[t]{2mm}{\multirow{5}{*}{\rotatebox[origin=c]{90}{"
                    + model_group_name
                    + "}}}"
                )
                latex_table += "\n"
            latex_table += f"& {MODEL2SH.get(model, model):<10s}"
            for i_dataset, dataset in enumerate(TEST_DATASETS):
                latex_table += " &"
                # Primary selection: which experiment this cell is about.
                filter = {
                    "model": model,
                    "dataset": dataset,
                    "clusterer": clusterer,
                }
                sdf = select_rows(test_runs_df, filter, allow_missing=False)
                # Secondary selection: defaults overlaid with the best
                # hyperparameters for this clusterer/encoder, then with any
                # manual overrides.
                filter2 = dict(DEFAULT_PARAMS["all"], **BEST_PARAMS[clusterer][model])
                filter2 = {k: v for k, v in filter2.items() if k not in filter}
                filter2.update(override_fields)
                sdf = select_rows(sdf, filter2, allow_missing=False)
                if len(sdf) < 1:
                    print(f"No data for {filter} {filter2}")
                    # No re-run command is queued for AffinityPropagation on
                    # imagenet / inaturalist (presumably infeasible to run --
                    # confirm); the cell is simply left empty.
                    if clusterer == "AffinityPropagation" and dataset in [
                        "imagenet",
                        "inaturalist",
                    ]:
                        continue
                    cmds.append(filter2command(filter, filter2, partition="test"))
                    continue
                if len(sdf) > 1:
                    # Several runs match: report them if their AMIs disagree,
                    # together with the config columns in which they differ.
                    perf = sdf.iloc[0]["AMI"]
                    if sum(sdf["AMI"] != perf) > 0:
                        print()
                        print("More than one result with AMIs:", list(sdf["AMI"]))
                        print(f"for search {filter}\nand {filter2}")
                        dif_cols = find_differing_columns(sdf, config_keys)
                        print(f"columns which differ: {dif_cols}")
                        if dif_cols:
                            for col in dif_cols:
                                print(f"  {col}: {list(sdf[col])}")
                # The cell value is the median over all matching runs.
                my_val = np.median(sdf[metric_key])
                if dummy:
                    best_results[dataset].append(my_val)
                    continue
                # Compare (with tolerance eps) against the values collected
                # for this dataset during the first pass.
                is_best = my_val + eps >= np.max(best_results[dataset])
                if len(best_results[dataset]) > 1:
                    is_secd = my_val + eps >= np.sort(best_results[dataset])[-2]
                else:
                    is_secd = False
                if show_pc:
                    my_val = my_val * 100
                latex_table += " $"
                if is_best:
                    latex_table += r"\tcf{"
                elif is_secd:
                    latex_table += r"\tcs{"
                else:
                    # Padding keeps the generated source column-aligned.
                    latex_table += "     "
                latex_table += show_fmt.format(my_val)
                latex_table += r"}" if is_best or is_secd else " "
                latex_table += "$"
            latex_table += r" \\" + "\n"
    latex_table += r"\bottomrule" + "\n"
    latex_table += r"\end{tabular}" + "\n"
    latex_table += r"}" + "\n"


# First list the commands for missing results (preceded by a blank line when
# there are any), then print the finished LaTeX table.
if cmds:
    print()
    for cmd in cmds:
        print(cmd)

print("", "Done!", "", sep="\n")
print(
    f"Here is your results table for {clusterer}:",
    "",
    "",
    latex_table,
    sep="\n",
)

In [None]:
# Inspect the primary filter left behind by the last table cell processed.
filter

In [None]:
# Inspect the secondary (hyperparameter) filter left behind by the last table cell processed.
filter2

In [None]:
# Re-apply only the primary filter to all test runs and display the matches.
sdf = select_rows(test_runs_df, filter, allow_missing=False)
sdf

In [None]:
# Ad-hoc secondary filter for debugging: uncomment entries to narrow the
# selection of `sdf` further; only the active entries are applied.
ff = {
    #    'dim_reducer': 'None',
    #    'dim_reducer_man': 'UMAP',
    #    'zscore': False,
    #    'normalize': False,
    #    'zscore2': False,
    #    'ndim_correction': False,
    #    'distance_metric': 'chebyshev',
    #    'aggclust_linkage': 'average',
    #    'ndim_reduced_man': 50,
    "aggclust_dist_thresh": 2.0,
}
select_rows(sdf, ff, allow_missing=False)

In [None]:
# Show the aggclust_dist_thresh value of every row currently in sdf.
sdf["aggclust_dist_thresh"]

In [None]:
# Check whether sdf has an aggclust_dist_thresh column at all.
"aggclust_dist_thresh" in sdf.columns

## Grouping by encoder

In [None]:
# Settings for the per-backbone table with one row group per encoder.
metric_key = "AMI"  # column of test_runs_df that is reported
show_pc = True  # multiply values by 100 before formatting
show_fmt = "{:4.0f}"  # format applied to every table value
eps = 0.001  # tolerance when marking best / second-best values
# NOTE: the table loop below reassigns override_fields for every clusterer,
# so the value set here is not what ends up being applied.
override_fields = {
    # "aggclust_dist_thresh": None,  # to flip between unknown/known n clusters for AC
}
BEST_PARAMS = BEST_PARAMS_v1

# Key of MODEL_GROUPS selecting which encoders are tabulated.
backbone = "ResNet-50"

# Rebinds the global CLUSTERERS list. AgglomerativeClustering appears twice on
# purpose: the loop below treats the first occurrence as "AC  w/ C"
# (aggclust_dist_thresh forced to None) and the second as "AC w/o C".
CLUSTERERS = [
    "KMeans",
    "AgglomerativeClustering",
    "AgglomerativeClustering",
    "AffinityPropagation",
    "HDBSCAN",
]
print(MODEL2SH)

# Per-dataset list of every cell value, filled during the first table pass.
best_results = {k: [] for k in TEST_DATASETS}
# Build the LaTeX table for `backbone`: one row group per encoder, one row per
# clusterer, one column per test dataset. The grid is traversed twice: the
# first pass (dummy=True) only records every cell's value in best_results, the
# second pass writes the table and marks the best (\tcf) and second-best
# (\tcs) value of each dataset column.
for dummy in [True, False]:
    cmds = []
    # Table preamble: provenance comments, label and column specification.
    latex_table = r"% Results for " + f"{metric_key}, {backbone}" + "\n"
    now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    latex_table += r"% Generated " + now_str + "\n"
    latex_table += r"\label{tab:" + backbone + r"}" + "\n"
    latex_table += r"\resizebox{\textwidth}{!}{%" + "\n"
    latex_table += r"\begin{tabular}{ll" + r"r" * len(TEST_DATASETS) + r"}" + "\n"
    latex_table += r"\toprule" + "\n"
    # Header row with the short dataset names.
    latex_table += r"& " + f"{'Clusterer':<11s}"
    for dataset in TEST_DATASETS:
        latex_table += r"&" + "{:^15s}".format(DATASET2SH.get(dataset, dataset))
    latex_table += r"\\" + "\n"
    latex_table += r"\toprule" + "\n"
    print(MODEL_GROUPS[backbone])
    for i_group, model in enumerate(list(MODEL_GROUPS[backbone])):
        print(model)
        if i_group > 0:
            latex_table += r"\midrule" + "\n"

        # Tracks whether the first AgglomerativeClustering entry was handled.
        first_agg = True
        for i_clusters, clusterer in enumerate(CLUSTERERS):
            if i_clusters == 0:
                # Rotated encoder name spanning the rows of this group.
                latex_table += (
                    r"\parbox[t]{2mm}{\multirow{5}{*}{\rotatebox[origin=c]{90}{"
                    + MODEL2SH[model]
                    + "}}}"
                )
                latex_table += "\n"
            # Overrides are decided per clusterer here, replacing any value
            # assigned before the loop.
            override_fields = {}
            clusterername = CLUSTERER2SH.get(clusterer, clusterer)
            if first_agg and clusterer == "AgglomerativeClustering":
                # First AC entry: require aggclust_dist_thresh to be None.
                first_agg = False
                override_fields = {"aggclust_dist_thresh": None}
                clusterername = "AC  w/ C"
            elif clusterer == "AgglomerativeClustering":
                # Second AC entry: no override, BEST_PARAMS apply as they are.
                clusterername = "AC w/o C"
            latex_table += f"& {clusterername:<10s}"
            for i_dataset, dataset in enumerate(TEST_DATASETS):
                latex_table += " &"
                # Primary selection: which experiment this cell is about.
                filter = {
                    "model": model,
                    "dataset": dataset,
                    "clusterer": clusterer,
                }
                sdf = select_rows(test_runs_df, filter, allow_missing=False)
                # Secondary selection: defaults overlaid with the best
                # hyperparameters, then with the per-clusterer overrides.
                filter2 = dict(DEFAULT_PARAMS["all"], **BEST_PARAMS[clusterer][model])
                filter2 = {k: v for k, v in filter2.items() if k not in filter}
                filter2.update(override_fields)
                sdf = select_rows(sdf, filter2, allow_missing=False)
                if len(sdf) < 1:
                    # Nothing to report: queue a command for the missing
                    # configuration and leave the cell empty.
                    print(f"No data for {filter} {filter2}")
                    cmds.append(filter2command(filter, filter2, partition="test"))
                    continue
                if len(sdf) > 1:
                    # Several runs match: report them if their AMIs disagree,
                    # together with the config columns in which they differ.
                    perf = sdf.iloc[0]["AMI"]
                    if sum(sdf["AMI"] != perf) > 0:
                        print()
                        print("More than one result with AMIs:", list(sdf["AMI"]))
                        print(f"for search {filter}\nand {filter2}")
                        dif_cols = find_differing_columns(sdf, config_keys)
                        print(f"columns which differ: {dif_cols}")
                        if dif_cols:
                            for col in dif_cols:
                                print(f"  {col}: {list(sdf[col])}")
                # The cell value is the median over all matching runs.
                my_val = np.median(sdf[metric_key])
                if dummy:
                    best_results[dataset].append(my_val)
                    continue
                # Compare (with tolerance eps) against the values collected
                # for this dataset during the first pass.
                is_best = my_val + eps >= np.max(best_results[dataset])
                if len(best_results[dataset]) > 1:
                    is_secd = my_val + eps >= np.sort(best_results[dataset])[-2]
                else:
                    is_secd = False
                if show_pc:
                    my_val = my_val * 100
                latex_table += " $"
                if is_best:
                    latex_table += r"\tcf{"
                elif is_secd:
                    latex_table += r"\tcs{"
                else:
                    # Padding keeps the generated source column-aligned.
                    latex_table += "     "
                latex_table += show_fmt.format(my_val)
                latex_table += r"}" if is_best or is_secd else " "
                latex_table += "$"
            latex_table += r" \\" + "\n"
    latex_table += r"\bottomrule" + "\n"
    latex_table += r"\end{tabular}" + "\n"
    latex_table += r"}" + "\n"


# List the commands for missing results (preceded by a blank line when there
# are any), then print the finished LaTeX table.
if cmds:
    print()
for cmd in cmds:
    print(cmd)

print()
print("Done!")
print()
# The table built above covers every clusterer for one backbone, so it is
# identified by metric and backbone; `clusterer` would only hold whichever
# clusterer the loop happened to visit last.
print(f"Here is your results table for {metric_key}, {backbone}:")
print()
print()
print(latex_table)

## Grouping by clusterer

In [None]:
from collections import defaultdict

In [None]:
# ---- Settings to edit by hand -----------------------------------------------
# Options: AMI  num_cluster_pred  silhouette-euclidean_pred  silhouette-og-euclidean_pred
metric_key = "AMI"
backbone = "ViT-B"  # "ResNet-50" or "ViT-B"
eps = 0.005  # tolerance when marking best / second-best values
BEST_PARAMS = BEST_PARAMS_v1

# ---- Presentation settings that follow from the chosen metric ---------------
# (set "aggclust_dist_thresh": None here to flip between unknown/known
#  n clusters for AC)
override_fields = {}
show_fmt = "{:4.0f}"
if metric_key == "num_cluster_pred":
    # Cluster counts: plain numbers wrapped in \num{}, no highlighting, and
    # only these three clusterers are tabulated.
    CLUSTERERS = ["AgglomerativeClustering", "AffinityPropagation", "HDBSCAN"]
    show_pc, highlight_best, use_si_num = False, False, True
else:
    # AgglomerativeClustering is listed twice: the table loop reports its
    # first occurrence as "AC  w/ C" and the second as "AC w/o C".
    CLUSTERERS = [
        "KMeans",
        "AgglomerativeClustering",
        "AgglomerativeClustering",
        "AffinityPropagation",
        "HDBSCAN",
    ]
    show_pc, highlight_best, use_si_num = True, True, False
if metric_key.startswith("silhouette"):
    # Silhouette scores are shown unscaled with two decimals.
    show_pc = False
    show_fmt = "{:5.2f}"

print(MODEL2SH)

# Values collected in the first table pass: per dataset overall, and per
# dataset split by row group.
best_results = {name: [] for name in TEST_DATASETS}
best_results_grouped = {name: defaultdict(list) for name in TEST_DATASETS}

# Build the LaTeX table for `backbone`: one row group per clusterer, one row
# per encoder, one column per test dataset.
#
# The grid is traversed twice. The first pass (dummy=True) only records every
# cell's value (overall in best_results, per row group in
# best_results_grouped); the second pass writes the table and marks the best
# (\tcf) / second-best (\tcs) value per dataset column and the best value
# within each row group (\tcg). Depending on metric_key an extra leading
# section is emitted: the true number of clusters for "num_cluster_pred", or
# a "G.T." row group for the other "*_pred" metrics.
for dummy in [True, False]:
    cmds = []
    # Table preamble: provenance comments, label(s) and column specification.
    latex_table = r"% Results for " + f"{metric_key}, {backbone}" + "\n"
    now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    latex_table += r"% Generated " + now_str + "\n"
    label = backbone
    if metric_key == "AMI":
        # AMI tables additionally get the plain backbone label.
        latex_table += r"\label{tab:" + label + r"}" + "\n"
    label = metric_key.replace("_", "-") + ":" + label
    latex_table += r"\label{tab:" + label + r"}" + "\n"
    latex_table += r"\resizebox{\textwidth}{!}{%" + "\n"
    latex_table += r"\begin{tabular}{ll" + r"r" * len(TEST_DATASETS) + r"}" + "\n"
    latex_table += r"\toprule" + "\n"
    # Header row with the short dataset names.
    latex_table += r"& " + f"{'Encoder':<11s}"
    for dataset in TEST_DATASETS:
        latex_table += r"&" + "{:^15s}".format(DATASET2SH.get(dataset, dataset))
    latex_table += r"\\" + "\n"
    latex_table += r"\toprule" + "\n"
    print(MODEL_GROUPS[backbone])
    if metric_key == "num_cluster_pred":
        # Reference row: the true number of clusters of each dataset, taken
        # from the first run that recorded it.
        latex_table += r"& Num targets"
        for i_dataset, dataset in enumerate(TEST_DATASETS):
            sdf = select_rows(test_runs_df, {"dataset": dataset}, allow_missing=False)
            sdf = sdf[~pd.isna(sdf["num_cluster_true"])]
            latex_table += r"& "
            latex_table += r"\num{" if use_si_num else r"$"
            latex_table += f"{sdf.iloc[0]['num_cluster_true'].item()}"
            latex_table += r"}" if use_si_num else r"$"
        latex_table += r"\\" + "\n"
        latex_table += r"\toprule" + "\n"
    elif metric_key.endswith("_pred"):
        # Reference row group "G.T.": the same metric evaluated on the
        # ground-truth labels, stored in the matching "*_true" column.
        metric_key2 = metric_key.replace("_pred", "_true")
        clusterername = "G.T."
        latex_table += (
            r"\parbox[t]{2mm}{\multirow{5}{*}{\rotatebox[origin=c]{90}{"
            + clusterername
            + "}}}"
        )
        latex_table += "\n"
        for i_group, model in enumerate(list(MODEL_GROUPS[backbone])):
            latex_table += f"& {MODEL2SH[model]:<10s}"
            for i_dataset, dataset in enumerate(TEST_DATASETS):
                latex_table += " &"
                # Runs are not restricted to one clusterer here; only the
                # dimensionality reduction is pinned (PCA for the MAE
                # encoder, UMAP for all others).
                filter = {"model": model, "dataset": dataset}
                if model == "timm_vit_base_patch16_224.mae":
                    filter["dim_reducer"] = "PCA"
                    filter["pca_variance"] = 0.95
                else:
                    filter["dim_reducer_man"] = "UMAP"
                    filter["ndim_reduced_man"] = 50
                    filter["dim_reducer_man_metric"] = "euclidean"
                sdf = select_rows(test_runs_df, filter, allow_missing=False)
                sdf = sdf[~pd.isna(sdf[metric_key2])]
                # Summarise the ground-truth column itself. Reading the
                # "*_pred" column here would mix the predictions of all
                # matching runs into a row that is labelled as ground truth.
                my_val = np.nanmedian(sdf[metric_key2])
                if dummy:
                    best_results_grouped[dataset][clusterername].append(my_val)
                    continue
                is_best_grp = my_val + eps >= np.max(
                    best_results_grouped[dataset][clusterername]
                )
                latex_table += r"\num{" if use_si_num else r"$"
                # Padding where the other rows carry their \tcf / \tcs mark.
                latex_table += "     "
                if not highlight_best:
                    pass
                elif is_best_grp:
                    latex_table += r"\tcg{"
                else:
                    latex_table += "     "
                latex_table += show_fmt.format(my_val)
                if highlight_best:
                    latex_table += r"}" if is_best_grp else " "
                latex_table += r"}" if use_si_num else r"$"

            latex_table += r" \\" + "\n"
        latex_table += r"\toprule" + "\n"

    # Tracks whether the first AgglomerativeClustering entry was handled.
    first_agg = True
    for i_clusters, clusterer in enumerate(CLUSTERERS):
        # Overrides are decided per clusterer here, replacing any value
        # assigned before the loop.
        override_fields = {}
        clusterername = CLUSTERER2SH.get(clusterer, clusterer)
        if (
            first_agg
            and clusterer == "AgglomerativeClustering"
            and metric_key != "num_cluster_pred"
        ):
            # First AC entry: require aggclust_dist_thresh to be None.
            first_agg = False
            override_fields = {"aggclust_dist_thresh": None}
            clusterername = "AC  w/ C"
        elif clusterer == "AgglomerativeClustering":
            # Remaining AC entry: no override, BEST_PARAMS apply as they are.
            clusterername = "AC w/o C"

        if i_clusters > 0:
            latex_table += r"\midrule" + "\n"

        # Rotated clusterer name spanning the rows of this group.
        latex_table += (
            r"\parbox[t]{2mm}{\multirow{5}{*}{\rotatebox[origin=c]{90}{"
            + clusterername
            + "}}}"
        )
        latex_table += "\n"

        for i_group, model in enumerate(list(MODEL_GROUPS[backbone])):
            print(model)

            latex_table += f"& {MODEL2SH[model]:<10s}"
            for i_dataset, dataset in enumerate(TEST_DATASETS):
                latex_table += " &"
                # Primary selection: which experiment this cell is about.
                filter = {
                    "model": model,
                    "dataset": dataset,
                    "clusterer": clusterer,
                }
                sdf = select_rows(test_runs_df, filter, allow_missing=False)
                # Secondary selection: defaults overlaid with the best
                # hyperparameters, then with the per-clusterer overrides.
                filter2 = dict(DEFAULT_PARAMS["all"], **BEST_PARAMS[clusterer][model])
                filter2 = {k: v for k, v in filter2.items() if k not in filter}
                filter2.update(override_fields)
                sdf = select_rows(sdf, filter2, allow_missing=False)
                if len(sdf) < 1:
                    # Nothing to report: queue a command for the missing
                    # configuration and show a dash in the written table.
                    print(f"No data for {filter} {filter2}")
                    cmds.append(filter2command(filter, filter2, partition="test"))
                    if not dummy:
                        # latex_table += r"\multicolumn{1}{c}{--}"
                        latex_table += r"   --  "
                    continue
                if len(sdf) > 1:
                    # Several runs match: report them if their AMIs disagree,
                    # together with the config columns in which they differ.
                    perf = sdf.iloc[0]["AMI"]
                    if sum(sdf["AMI"] != perf) > 0:
                        print()
                        print("More than one result with AMIs:", list(sdf["AMI"]))
                        print(f"for search {filter}\nand {filter2}")
                        dif_cols = find_differing_columns(sdf, config_keys)
                        print(f"columns which differ: {dif_cols}")
                        if dif_cols:
                            for col in dif_cols:
                                print(f"  {col}: {list(sdf[col])}")
                # The cell value is the NaN-ignoring median over all matches.
                my_val = np.nanmedian(sdf[metric_key])
                if dummy:
                    best_results[dataset].append(my_val)
                    best_results_grouped[dataset][clusterername].append(my_val)
                    continue
                if np.isnan(my_val):
                    # Runs exist but none recorded this metric.
                    latex_table += r"   --  "
                    continue
                # Compare (with tolerance eps) against the values collected
                # during the first pass: overall and within this row group.
                is_best = my_val + eps >= np.max(best_results[dataset])
                if len(best_results[dataset]) > 1:
                    is_secd = my_val + eps >= np.sort(best_results[dataset])[-2]
                else:
                    is_secd = False
                is_best_grp = my_val + eps >= np.max(
                    best_results_grouped[dataset][clusterername]
                )
                if show_pc:
                    my_val = my_val * 100
                latex_table += r"\num{" if use_si_num else r"$"
                if not highlight_best:
                    pass
                elif is_best:
                    latex_table += r"\tcf{"
                elif is_secd:
                    latex_table += r"\tcs{"
                else:
                    # Padding keeps the generated source column-aligned.
                    latex_table += "     "
                if not highlight_best:
                    pass
                elif is_best_grp:
                    latex_table += r"\tcg{"
                else:
                    latex_table += "     "
                latex_table += show_fmt.format(my_val)
                if highlight_best:
                    latex_table += r"}" if is_best or is_secd else " "
                    latex_table += r"}" if is_best_grp else " "
                latex_table += r"}" if use_si_num else r"$"
            latex_table += r" \\" + "\n"
    latex_table += r"\bottomrule" + "\n"
    latex_table += r"\end{tabular}" + "\n"
    latex_table += r"}" + "\n"


# First list the commands for missing results (preceded by a blank line when
# there are any), then print the finished LaTeX table.
if cmds:
    print()
    for cmd in cmds:
        print(cmd)

print("", "Done!", "", sep="\n")
print(
    f"Here is your results table for {metric_key}, {backbone}:",
    "",
    "",
    latex_table,
    sep="\n",
)

## Correlation between AMI and Silhouette

In [None]:
# Inspect the per-dataset, per-row-group values collected by the table cell above.
best_results_grouped

In [None]:
# Settings for the AMI-vs-silhouette scatter plots (one panel per backbone).
metric_key1 = "AMI"  # plotted on the y-axis
metric_key2 = "silhouette-euclidean_pred"  # plotted on the x-axis
BEST_PARAMS = BEST_PARAMS_v1


# Rebinds the global CLUSTERERS list. AgglomerativeClustering appears twice:
# the loop below forces aggclust_dist_thresh to None for its first occurrence
# only.
CLUSTERERS = [
    "KMeans",
    "AffinityPropagation",
    "AgglomerativeClustering",
    "AgglomerativeClustering",
    "HDBSCAN",
]
print(MODEL2SH)

fig, ax = plt.subplots(1, 2, sharey=True, figsize=(5, 3))


# One colour per test dataset, indexed by the dataset's position in
# TEST_DATASETS.
colors = [
    "tab:blue",
    "tab:orange",
    "tab:green",
    "tab:red",
    "tab:purple",
    "tab:brown",
    "tab:pink",
    "tab:gray",
    "tab:olive",
    "tab:cyan",
]

# Per-backbone list of correlation coefficients, one entry per dataset.
correlations = {"ResNet-50": [], "ViT-B": []}
# Scatter AMI (y) against the silhouette score (x) for every
# encoder/clusterer combination, one panel per backbone and one colour per
# dataset, and record the per-dataset correlation between the two metrics.

# Commands for configurations without results are collected in a fresh list
# rather than appended to whatever an earlier cell left behind.
cmds = []
for i_backbone, backbone in enumerate(["ResNet-50", "ViT-B"]):
    # Values pooled over all datasets of this backbone.
    my_valx_overall = []
    my_valy_overall = []

    # Values split by clusterer (both AgglomerativeClustering variants share
    # one entry because the dict is keyed by clusterer name).
    my_valx_method = {clusterer: [] for clusterer in CLUSTERERS}
    my_valy_method = {clusterer: [] for clusterer in CLUSTERERS}
    best_results = {k: [] for k in TEST_DATASETS}

    for i_dataset, dataset in enumerate(TEST_DATASETS):
        my_valx = []
        my_valy = []
        # Tracks whether the first AgglomerativeClustering entry was handled.
        first_agg = True
        for i_clusters, clusterer in enumerate(CLUSTERERS):
            # Start every clusterer without overrides so that nothing leaks
            # in from a previous iteration or an earlier cell; only the
            # first AgglomerativeClustering entry pins aggclust_dist_thresh.
            override_fields = {}
            if first_agg and clusterer == "AgglomerativeClustering":
                first_agg = False
                override_fields = {"aggclust_dist_thresh": None}

            for i_group, model in enumerate(list(MODEL_GROUPS[backbone])):
                # Primary selection: which experiment this point is about.
                filter = {
                    "model": model,
                    "dataset": dataset,
                    "clusterer": clusterer,
                }
                sdf = select_rows(test_runs_df, filter, allow_missing=False)
                # Secondary selection: defaults overlaid with the best
                # hyperparameters, then with the per-clusterer overrides.
                filter2 = dict(DEFAULT_PARAMS["all"], **BEST_PARAMS[clusterer][model])
                filter2 = {k: v for k, v in filter2.items() if k not in filter}
                filter2.update(override_fields)
                sdf = select_rows(sdf, filter2, allow_missing=False)
                if len(sdf) < 1:
                    cmds.append(filter2command(filter, filter2, partition="test"))
                    continue
                # One point per configuration: NaN-ignoring medians over the
                # matching runs.
                my_valx.append(np.nanmedian(sdf[metric_key1]))
                my_valy.append(np.nanmedian(sdf[metric_key2]))

                my_valx_method[clusterer].append(np.nanmedian(sdf[metric_key1]))
                my_valy_method[clusterer].append(np.nanmedian(sdf[metric_key2]))

        # Correlation between the two metrics within this dataset.
        correlations[backbone].append(np.corrcoef(my_valx, my_valy)[0, 1])

        ax[i_backbone].scatter(
            my_valy,
            my_valx,
            color=colors[i_dataset],
            alpha=0.5,
            label=TEST_DATASETS[i_dataset],
        )
        my_valx_overall.extend(my_valx)
        my_valy_overall.extend(my_valy)

    # Axis cosmetics only need to be applied once per panel; the title shows
    # the correlation over the points of all datasets.
    ax[i_backbone].set_xlabel(r"$S$")
    if i_backbone == 0:
        ax[i_backbone].set_ylabel(metric_key1)
    ax[i_backbone].set_ylim(-0.05, 1.05)
    ax[i_backbone].set_xlim(-1.05, 1.05)
    ax[i_backbone].set_title(
        f"{backbone}\nPCC: {np.corrcoef(my_valx_overall, my_valy_overall)[0,1]:.2f}"
    )


# Build legend entries from empty plots so that each handle shows only a
# marker in the dataset's colour.
label_fn = lambda c, marker: plt.plot(  # noqa:E731
    [], [], color=c, ls="None", marker=marker, linewidth=6
)[0]
handles = [label_fn(colors[idx], "o") for idx in range(len(TEST_DATASETS))]
data_labels = [DATASET2SH.get(dataset, dataset) for dataset in TEST_DATASETS]

# Place the legend to the right of the second panel.
ax[1].legend(handles, data_labels, loc="center left", bbox_to_anchor=(1, 0.5))

# Print the per-dataset correlations of each backbone next to the dataset
# labels they belong to.
print(data_labels)
print(correlations["ResNet-50"], len(correlations["ResNet-50"]))
print(correlations["ViT-B"], len(correlations["ViT-B"]))

fig.savefig("ami_silhouette.pdf", bbox_inches="tight")

In [None]:
# Settings for the second AMI-vs-silhouette scatter plot, this time using the
# "silhouette-og-euclidean_pred" column.
metric_key1 = "AMI"  # plotted on the y-axis
metric_key2 = "silhouette-og-euclidean_pred"  # plotted on the x-axis
BEST_PARAMS = BEST_PARAMS_v1

# Rebinds the global CLUSTERERS list. AgglomerativeClustering appears twice:
# the loop below forces aggclust_dist_thresh to None for its first occurrence
# only.
CLUSTERERS = [
    "KMeans",
    "AffinityPropagation",
    "AgglomerativeClustering",
    "AgglomerativeClustering",
    "HDBSCAN",
]
print(MODEL2SH)

fig, ax = plt.subplots(1, 2, sharey=True, figsize=(5.5, 3))

# One colour per test dataset, indexed by the dataset's position in
# TEST_DATASETS.
colors = [
    "tab:red",
    "tab:blue",
    "tab:orange",
    "tab:green",
    "tab:purple",
    "tab:brown",
    "tab:pink",
    "tab:gray",
    "tab:olive",
    "tab:cyan",
]

# Per-backbone list of correlation coefficients, one entry per dataset.
correlations = {"ResNet-50": [], "ViT-B": []}
# Scatter AMI (y) against the silhouette score (x) for every
# encoder/clusterer combination, one panel per backbone and one colour per
# dataset; record the per-dataset correlation and annotate each panel with
# the correlation over all of its (non-NaN) points.

# Commands for configurations without results are collected in a fresh list
# rather than appended to whatever an earlier cell left behind.
cmds = []
for i_backbone, backbone in enumerate(["ResNet-50", "ViT-B"]):
    # Values pooled over all datasets of this backbone.
    my_valx_overall = []
    my_valy_overall = []
    best_results = {k: [] for k in TEST_DATASETS}

    for i_dataset, dataset in enumerate(TEST_DATASETS):
        my_valx = []
        my_valy = []
        # Tracks whether the first AgglomerativeClustering entry was handled.
        first_agg = True
        for i_clusters, clusterer in enumerate(CLUSTERERS):
            # Start every clusterer without overrides so that nothing leaks
            # in from a previous iteration or an earlier cell; only the
            # first AgglomerativeClustering entry pins aggclust_dist_thresh.
            override_fields = {}
            if first_agg and clusterer == "AgglomerativeClustering":
                first_agg = False
                override_fields = {"aggclust_dist_thresh": None}

            for i_group, model in enumerate(list(MODEL_GROUPS[backbone])):
                # Primary selection: which experiment this point is about.
                filter = {
                    "model": model,
                    "dataset": dataset,
                    "clusterer": clusterer,
                }
                sdf = select_rows(test_runs_df, filter, allow_missing=False)
                # Secondary selection: defaults overlaid with the best
                # hyperparameters, then with the per-clusterer overrides.
                filter2 = dict(DEFAULT_PARAMS["all"], **BEST_PARAMS[clusterer][model])
                filter2 = {k: v for k, v in filter2.items() if k not in filter}
                filter2.update(override_fields)
                sdf = select_rows(sdf, filter2, allow_missing=False)
                if len(sdf) < 1:
                    cmds.append(filter2command(filter, filter2, partition="test"))
                    continue
                # One point per configuration: NaN-ignoring medians over the
                # matching runs.
                my_valx.append(np.nanmedian(sdf[metric_key1]))
                my_valy.append(np.nanmedian(sdf[metric_key2]))

        # Correlation between the two metrics within this dataset.
        correlations[backbone].append(np.corrcoef(my_valx, my_valy)[0, 1])

        ax[i_backbone].scatter(
            my_valy,
            my_valx,
            color=colors[i_dataset],
            alpha=0.5,
            s=8,
            label=TEST_DATASETS[i_dataset],
        )
        my_valx_overall.extend(my_valx)
        my_valy_overall.extend(my_valy)

    ax[i_backbone].set_xlabel(r"$S$")
    if i_backbone == 0:
        ax[i_backbone].set_ylabel(metric_key1)
    ax[i_backbone].set_ylim(-0.05, 1.05)
    ax[i_backbone].set_xlim(-1.05, 1.05)
    ax[i_backbone].set_title(backbone)
    # Overall correlation for the panel, computed on the points where both
    # metrics are available.
    my_valx_overall = np.array(my_valx_overall)
    my_valy_overall = np.array(my_valy_overall)
    select = ~(np.isnan(my_valx_overall) | np.isnan(my_valy_overall))
    cor = np.corrcoef(my_valx_overall[select], my_valy_overall[select])
    ax[i_backbone].text(-0.85, 0.95, f"$r={cor[0,1]:.2f}$")


# Build legend entries from empty plots so that each handle shows only a
# marker in the dataset's colour.
label_fn = lambda c, marker: plt.plot(  # noqa:E731
    [], [], color=c, ls="None", marker=marker, linewidth=6
)[0]
handles = [label_fn(colors[idx], "o") for idx in range(len(TEST_DATASETS))]
data_labels = [DATASET2SH.get(dataset, dataset) for dataset in TEST_DATASETS]

# Place the legend to the right of the second panel.
ax[1].legend(handles, data_labels, loc="center left", bbox_to_anchor=(1, 0.5))

# Print the per-dataset correlations of each backbone next to the dataset
# labels they belong to.
print(data_labels)
print(correlations["ResNet-50"], len(correlations["ResNet-50"]))
print(correlations["ViT-B"], len(correlations["ViT-B"]))

# File name is derived from the two metric names, with "-euclidean" removed
# from the second one.
fig.savefig(
    f"{metric_key1}_{metric_key2.replace('-euclidean', '')}.pdf", bbox_inches="tight"
)

In [None]:
# Inspect the correlation matrix left over from the last backbone processed above.
cor

In [None]:
# Settings and figures for the per-backbone loop that follows.
metric_key1 = "AMI"  # metric stored in result_table by the loop below
metric_key2 = "silhouette-euclidean_pred"
BEST_PARAMS = BEST_PARAMS_v1

# Rebinds the global CLUSTERERS list. AgglomerativeClustering appears twice:
# the loop below forces aggclust_dist_thresh to None for its first occurrence
# only.
CLUSTERERS = [
    "KMeans",
    "AffinityPropagation",
    "AgglomerativeClustering",
    "AgglomerativeClustering",
    "HDBSCAN",
]
print(MODEL2SH)

# Two figures with two panels each.
# NOTE(review): judging by the names, one figure is for encoders and one for
# clusterers, with one panel per backbone -- confirm where they are drawn.
figenc, axenc = plt.subplots(1, 2, figsize=(6, 2))
figclus, axclus = plt.subplots(1, 2, figsize=(6, 2))

# Display names for the clusterer settings, aligned by position with CLUSTERERS.
CLUSTERERS2 = ["K-Means", "Affinity Prop", "Agg w/ C", "Agg w/o C", "HDBSCAN"]
colors = ["tab:blue", "tab:orange", "tab:red", "tab:green", "tab:olive", "tab:cyan"]
cluster_to_color = dict(zip(CLUSTERERS2, colors))

for i_backbone, backbone in enumerate(["ResNet-50", "ViT-B"]):
    models = list(MODEL_GROUPS[backbone])
    n_models = len(models)

    # result_table[encoder, clusterer, dataset] holds the median `metric_key1` of
    # the matching test runs, or -100.0 when no run matches (so it ranks last).
    result_table = np.zeros((n_models, len(CLUSTERERS), len(TEST_DATASETS)))
    # Commands built for configurations without any matching run; collected for
    # inspection only, nothing in this cell reads them.
    cmds = []

    for i_group, model in enumerate(models):
        first_agg = True
        for i_clusters, clusterer in enumerate(CLUSTERERS):
            # Reset for every clusterer so a value left over from a previous
            # iteration (or a previous cell) can never leak into the row filter.
            override_fields = {}
            if first_agg and clusterer == "AgglomerativeClustering":
                # First AgglomerativeClustering entry ("Agg w/ C"): force the
                # distance threshold to None; the second entry ("Agg w/o C")
                # keeps the tuned parameters untouched.
                first_agg = False
                override_fields = {"aggclust_dist_thresh": None}

            for i_dataset, dataset in enumerate(TEST_DATASETS):
                filter = {
                    "model": model,
                    "dataset": dataset,
                    "clusterer": clusterer,
                }
                sdf = select_rows(test_runs_df, filter, allow_missing=False)
                # Defaults, overlaid with the best parameters for this
                # clusterer/model pair, minus keys already fixed by `filter`.
                filter2 = dict(
                    DEFAULT_PARAMS["all"], **BEST_PARAMS[clusterer][model]
                )
                filter2 = {k: v for k, v in filter2.items() if k not in filter}
                filter2.update(override_fields)
                sdf = select_rows(sdf, filter2, allow_missing=False)
                if len(sdf) < 1:
                    cmds.append(filter2command(filter, filter2, partition="test"))
                    result_table[i_group, i_clusters, i_dataset] = -100.0
                    continue
                result_table[i_group, i_clusters, i_dataset] = np.median(
                    sdf[metric_key1]
                )

    print(result_table[0])

    print(backbone)
    print(MODEL_GROUPS[backbone])

    # Color per encoder family. The checks are independent on purpose: if a name
    # matches several of them, the last matching assignment wins.
    encoder_to_color = {}
    for model in models:
        if model == "resnet50" or model == "vitb16":
            encoder_to_color[model] = colors[0]
        if "mae" in model:
            encoder_to_color[model] = colors[1]
        if "vicreg" in model:
            encoder_to_color[model] = colors[2]
        if "clip" in model:
            encoder_to_color[model] = colors[3]
        if "moco" in model:
            encoder_to_color[model] = colors[4]
        if "dino" in model:
            encoder_to_color[model] = colors[5]

    print(encoder_to_color)

    # RANK PER ENCODER - go through each dataset, look at each clusterer,
    # and determine the rank of each encoder in that setting
    # (rank 1 = highest metric value).
    print(models)
    rank_tmp = np.arange(1, n_models + 1)
    ranks_encoders = np.zeros((n_models, len(CLUSTERERS), len(TEST_DATASETS)))
    for i_dataset in range(len(TEST_DATASETS)):
        for i_clusters in range(len(CLUSTERERS)):
            cluster_data = result_table[:, i_clusters, i_dataset]
            # Indices sorted by descending value; the argsort of that gives each
            # encoder's position in the descending order.
            rank = np.argsort(cluster_data)[::-1]
            ranks_encoders[:, i_clusters, i_dataset] = rank_tmp[rank.argsort()]
    mean_rank_encoders = np.mean(ranks_encoders, axis=(1, 2))
    std_rank_encoders = np.std(ranks_encoders, axis=(1, 2))
    # (name, mean rank, std of rank), best mean rank first.
    order = [
        (models[idx], mean_rank_encoders[idx], std_rank_encoders[idx])
        for idx in np.argsort(mean_rank_encoders)
    ]

    # Plot in reverse so the best-ranked encoder ends up as the top bar.
    for idx, (entry_name, entry_mean, entry_std) in enumerate(order[::-1]):
        axenc[i_backbone].barh(
            idx,
            entry_mean,
            xerr=entry_std,
            align="center",
            alpha=0.6,
            ecolor="black",
            color=encoder_to_color[entry_name],
            capsize=2,
            zorder=10,
        )

    encoder_ticks = list(range(1, n_models + 1))
    axenc[i_backbone].set_yticks([])
    axenc[i_backbone].set_yticklabels([])
    axenc[i_backbone].set_xticks(encoder_ticks)
    axenc[i_backbone].set_xticklabels(encoder_ticks)
    axenc[i_backbone].xaxis.grid(True, zorder=1, alpha=0.5)
    axenc[i_backbone].set_title(backbone)

    # RANK PER CLUSTERER - go through each dataset, look at each encoder,
    # and determine the rank of each clusterer in that setting
    # (rank 1 = highest metric value).
    print(CLUSTERERS2)
    rank_tmp = np.arange(1, len(CLUSTERERS2) + 1)
    ranks_clusterers = np.zeros((n_models, len(CLUSTERERS2), len(TEST_DATASETS)))
    for i_dataset in range(len(TEST_DATASETS)):
        for i_encoder in range(n_models):
            encoder_data = result_table[i_encoder, :, i_dataset]
            rank = np.argsort(encoder_data)[::-1]
            ranks_clusterers[i_encoder, :, i_dataset] = rank_tmp[rank.argsort()]
    mean_rank_clusters = np.mean(ranks_clusterers, axis=(0, 2))
    std_rank_clusters = np.std(ranks_clusterers, axis=(0, 2))
    # (display name, mean rank, std of rank), best mean rank first.
    order = [
        (CLUSTERERS2[idx], mean_rank_clusters[idx], std_rank_clusters[idx])
        for idx in np.argsort(mean_rank_clusters)
    ]

    for idx, (entry_name, entry_mean, entry_std) in enumerate(order[::-1]):
        axclus[i_backbone].barh(
            idx,
            entry_mean,
            xerr=entry_std,
            align="center",
            alpha=0.6,
            ecolor="black",
            color=cluster_to_color[entry_name],
            capsize=2,
            zorder=10,
        )

    clusterer_ticks = list(range(1, len(CLUSTERERS2) + 1))
    axclus[i_backbone].set_yticks([])
    axclus[i_backbone].set_yticklabels([])
    axclus[i_backbone].set_xticks(clusterer_ticks)
    axclus[i_backbone].set_xticklabels(clusterer_ticks)
    axclus[i_backbone].xaxis.grid(True, zorder=1, alpha=0.5)
    axclus[i_backbone].set_title(backbone)

    axclus[i_backbone].set_xlabel("Rank")
    axenc[i_backbone].set_xlabel("Rank")

    print(order)


# `encoder_to_color` and `backbone` still hold the values from the last iteration
# of the loop above; vicreg_resnet50 is added by hand so that it also gets an
# entry in the encoder legend.
encoder_to_color["vicreg_resnet50"] = colors[2]


def label_fn(c, ls):
    """Plot an empty series and return its line artist for use as a legend handle."""
    return plt.plot([], [], color=c, ls=ls, linewidth=3)[0]


# Encoders shown in the legend: the last backbone's models plus vicreg_resnet50.
legend_models = list(MODEL_GROUPS[backbone]) + ["vicreg_resnet50"]

handles_clus = [label_fn(cluster_to_color[name], "-") for name in CLUSTERERS2]
handles_enc = [label_fn(encoder_to_color[name], "-") for name in legend_models]
enc_labels = [MODEL2SH[name] for name in legend_models]

# Both legends are attached to the right-hand panel, anchored at its right edge.
axenc[1].legend(
    handles_enc, enc_labels, bbox_to_anchor=(1, 0.5), loc="center left"
)
axclus[1].legend(
    handles_clus, CLUSTERERS2, bbox_to_anchor=(1, 0.5), loc="center left"
)

for figure, pdf_path in ((figenc, "ranking_enc.pdf"), (figclus, "ranking_clus.pdf")):
    figure.savefig(pdf_path, bbox_inches="tight")