In [1]:
import pandas as pd
from silhouette_upper_bound import upper_bound, upper_bound_samples, upper_bound_macro_silhouette
import numpy as np
import matplotlib.pyplot as plt 
import pickle
from sklearn.metrics import silhouette_score, adjusted_rand_score, silhouette_samples, adjusted_mutual_info_score
from collections import Counter
import kmedoids
from sklearn.preprocessing import StandardScaler, RobustScaler, normalize
from sklearn.impute import SimpleImputer
from scipy.spatial.distance import squareform, pdist
import seaborn as sns
from matplotlib.ticker import MultipleLocator
from tqdm import tqdm
from pathlib import Path
from scipy.io import arff
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
import time

In [2]:
# Distance metric = L1

In [3]:
# Dataset identifier, the location of its whitespace-separated file, and the
# number of leading numeric feature columns that are sliced off later on.
N_FEATURES = 504
dataset_name = "aloi-hsb-14x6x6"
path = f"data/aloi/{dataset_name}.csv"

In [4]:
# Read the raw table: fields are separated by runs of whitespace and the file
# has no header row, so columns are labelled 0..n-1. The regex separator is
# parsed with the python engine.
read_options = {
    "sep": r"\s+",
    "header": None,
    "engine": "python",
}
df = pd.read_csv(path, **read_options)

print(df.shape)
df.head()

(110250, 506)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,496,497,498,499,500,501,502,503,504,505
0,0.00231,0.0,0.0,0.0,0.0,0.0,0.024204,7e-06,0.0,0.0,...,0.000127,0.000448,5.4e-05,0.000979,0.000834,0.003366,0.011929,0.012741,"""img1""","""1/1_i110.png"""
1,0.002697,0.0,0.0,0.0,0.0,0.0,0.024419,9e-06,0.0,0.0,...,0.000145,0.000463,9.3e-05,0.00073,0.000956,0.003872,0.012691,0.009307,"""img1""","""1/1_i120.png"""
2,0.003059,0.0,0.0,0.0,0.0,0.0,0.028411,1.8e-05,0.0,0.0,...,0.000156,0.000513,1.8e-05,0.000714,0.000906,0.003658,0.012037,0.00812,"""img1""","""1/1_i130.png"""
3,0.003852,2e-06,0.0,0.0,0.0,2e-06,0.036011,4.3e-05,1.8e-05,1.4e-05,...,0.000111,0.000961,3.2e-05,0.00099,0.000445,0.001614,0.004731,0.020402,"""img1""","""1/1_i140.png"""
4,0.00423,0.0,0.0,0.0,0.0,0.0,0.02979,5.2e-05,3.6e-05,9e-06,...,0.000301,0.000434,4.3e-05,0.000488,0.001069,0.005098,0.010948,0.003913,"""img1""","""1/1_i150.png"""


In [5]:
# The image path (e.g. "1/1_i110.png") is the column right after the object
# name, i.e. position N_FEATURES + 1 in the raw layout. Its first run of digits
# is used as the ground-truth class.
# Addressing the column by that fixed position (rather than "last column")
# keeps this cell re-runnable after "class" has been appended to `df`.
class_ids = df.iloc[:, N_FEATURES + 1].str.extract(r"(\d+)", expand=False)
has_class = class_ids.notna()

# Drop rows with a missing class; copy so that the assignment below writes to
# an independent frame instead of a slice (avoids SettingWithCopyWarning).
df = df.loc[has_class].copy()
df["class"] = class_ids[has_class].astype(int)

In [6]:
# Stratified subsample: the same number of rows from every class.
# Each class is sampled with its own freshly seeded generator, exactly as the
# previous `groupby(...).apply(lambda g: g.sample(...))` did, so the selected
# rows are unchanged. Iterating the groups explicitly avoids the deprecated
# behaviour of `DataFrameGroupBy.apply` operating on the grouping column.
SAMPLES_PER_CLASS = 40
SAMPLE_SEED = 42

df_subset = pd.concat(
    [
        class_rows.sample(n=SAMPLES_PER_CLASS, random_state=SAMPLE_SEED)
        for _, class_rows in df.groupby("class")  # groups come out sorted by class
    ]
).reset_index(drop=True)

  .apply(lambda g: g.sample(n=40, random_state=42))


In [7]:
# Inspect the per-class sample (40 rows drawn from each class above).
df_subset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,497,498,499,500,501,502,503,504,505,class
0,0.010166,0.000084,0.000045,0.000079,0.000129,0.001203,0.034017,0.000197,0.000805,0.001309,...,0.000014,0.000061,0.001341,0.003411,0.009531,0.002423,0.000007,"""img1""","""1/1_r280.png""",1
1,0.010283,0.000054,0.000068,0.000059,0.000061,0.001610,0.032213,0.000167,0.000158,0.000174,...,0.000411,0.000050,0.000251,0.001487,0.007643,0.007541,0.000925,"""img1""","""1/1_i230.png""",1
2,0.004230,0.000000,0.000000,0.000000,0.000000,0.000000,0.029790,0.000052,0.000036,0.000009,...,0.000434,0.000043,0.000488,0.001069,0.005098,0.010948,0.003913,"""img1""","""1/1_i150.png""",1
3,0.010306,0.000093,0.000070,0.000079,0.000090,0.001072,0.032052,0.000278,0.000963,0.000726,...,0.000059,0.000086,0.000554,0.002062,0.009861,0.005012,0.000016,"""img1""","""1/1_r305.png""",1
4,0.011635,0.000131,0.000149,0.000156,0.000104,0.001216,0.036904,0.000416,0.000450,0.000877,...,0.000043,0.000084,0.001225,0.005269,0.016473,0.000328,0.000045,"""img1""","""1/1_r210.png""",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,0.006664,0.000054,0.000002,0.000005,0.000000,0.000208,0.022334,0.000136,0.000025,0.000036,...,0.002369,0.000032,0.000981,0.008807,0.006856,0.004467,0.001347,"""img1000""","""1000/1000_r230.png""",1000
39996,0.008812,0.000016,0.000002,0.000014,0.000000,0.000235,0.036020,0.000294,0.000027,0.000038,...,0.004239,0.000084,0.001429,0.003187,0.003780,0.004006,0.006510,"""img1000""","""1000/1000_l5c3.png""",1000
39997,0.006298,0.000007,0.000000,0.000000,0.000000,0.000034,0.021091,0.000086,0.000047,0.000050,...,0.001711,0.000041,0.002573,0.008027,0.004173,0.001374,0.001573,"""img1000""","""1000/1000_r10.png""",1000
39998,0.003902,0.000000,0.000000,0.000000,0.000000,0.000032,0.021921,0.000018,0.000002,0.000002,...,0.001732,0.000027,0.001431,0.009958,0.011784,0.007299,0.003248,"""img1000""","""1000/1000_i150.png""",1000


In [8]:
# Ground-truth labels, and the first N_FEATURES columns as a float32 matrix.
y = df_subset.loc[:, "class"]
X = df_subset.iloc[:, :N_FEATURES].to_numpy(dtype=np.float32)
print(X.shape, X.dtype)

(40000, 504) float32


In [9]:
# Pairwise dissimilarity matrix of the sampled rows, cached on disk.
# The notebook only ever loaded this file, so a fresh checkout could not run.
# NOTE(review): the fallback assumes the cached matrix was produced as the L1
# (cityblock) distances of `X`, mirroring how `D2` is built further down and
# the "Distance metric = L1" note at the top -- confirm before relying on it.
# The fallback is memory hungry (condensed + square float64 copies of a
# len(X) x len(X) matrix exist temporarily).
D_PATH = Path(f"arrays/aloi-{dataset_name}-D.npy")
if D_PATH.exists():
    D = np.load(D_PATH)
else:
    D = squareform(pdist(X, metric="cityblock")).astype(np.float32)
    D_PATH.parent.mkdir(parents=True, exist_ok=True)
    np.save(D_PATH, D)

In [10]:
print(D.shape, D.dtype)
# D comes from disk while the labels come from `df_subset`; a stale cache of a
# different size would silently misalign every score computed below.
assert D.ndim == 2 and D.shape[0] == D.shape[1] == len(df_subset), (
    f"cached D has shape {D.shape}, expected a square matrix of size {len(df_subset)}"
)

(40000, 40000) float32


In [11]:
# Time the silhouette upper bound on the full dissimilarity matrix.
start = time.perf_counter()
ub = upper_bound(D)
elapsed = time.perf_counter() - start
print(f"RT = {elapsed}")

RT = 229.25150637499996


In [12]:
# Value returned by upper_bound(D) in the previous cell.
ub

np.float64(0.9005815737721546)

### dynmsc for K in range 2 to 15

In [13]:
# Timed DynMSC run on the full matrix.
# NOTE(review): medoids=15 is presumably the largest k considered (the heading
# says K ranges from 2 to 15) -- confirm against the kmedoids documentation.
start = time.perf_counter()
dynmsc_result = kmedoids.dynmsc(diss=D, medoids=15, random_state=42)
cluster_labels = dynmsc_result.labels + 1  # labels shifted up by one
elapsed = time.perf_counter() - start
print(f"RT = {elapsed}")

RT = 554.65869075


In [14]:
# Size of every cluster in the current labelling.
cluster_sizes = [size for size in Counter(cluster_labels).values()]
min_cluster_size = min(cluster_sizes)
print(f"Min cluster size = {min_cluster_size}\nK = {len(cluster_sizes)}")

# Per-sample silhouette widths from the precomputed dissimilarities; their
# mean is the average silhouette width (ASW).
silh_samples = silhouette_samples(X=D, labels=cluster_labels, metric="precomputed")
asw = silh_samples.mean()

# Relative gap between the upper bound and the achieved ASW.
wcre = (ub - asw) / ub
print(f"ASW = {asw}")
print(f"ub = {ub}")
print(f"WCRE = {wcre}")

Min cluster size = 38
K = 2
ASW = 0.4702458381652832
ub = 0.9005815737721546
WCRE = 0.4778420391218725


In [15]:
# Timed upper bound with the smallest observed cluster size as second argument
# (presumably a minimum-cluster-size constraint -- confirm in the package docs).
start = time.perf_counter()
uba = upper_bound(D, min_cluster_size)
elapsed = time.perf_counter() - start

relative_gap = (uba - asw) / uba
print(f"RT = {elapsed}")
print(f"uba = {uba}")
print(f"WCRE = {relative_gap}")

RT = 221.87945583400005
uba = 0.751921238653854
WCRE = 0.374607586551016


In [16]:
# Agreement between the clustering and the ground-truth classes (ARI and AMI).
ari, ami = (
    score(cluster_labels, y)
    for score in (adjusted_rand_score, adjusted_mutual_info_score)
)

print(
    f"Adjusted Rand Index vs. true labels: {ari:.3f}\n"
    f"Adjusted Mutual Info vs. true labels: {ami:.3f}"
)

Adjusted Rand Index vs. true labels: 0.000
Adjusted Mutual Info vs. true labels: 0.001


### fastmsc for K = 1000

In [17]:
# Timed FastMSC run with 1000 medoids on the full matrix.
start = time.perf_counter()
fastmsc_result = kmedoids.fastmsc(diss=D, medoids=1000, random_state=42)
cluster_labels = fastmsc_result.labels + 1  # labels shifted up by one
elapsed = time.perf_counter() - start
print(f"RT = {elapsed}")

RT = 1075.0374202079997


In [18]:
# Cluster sizes of the FastMSC labelling and the smallest of them.
size_by_label = Counter(cluster_labels)
cluster_sizes = list(size_by_label.values())
min_cluster_size = min(cluster_sizes)
print(f"Min cluster size = {min_cluster_size}")
print(f"K = {len(cluster_sizes)}")

# Silhouette width of every sample, computed on the precomputed matrix.
silh_samples = silhouette_samples(X=D, labels=cluster_labels, metric="precomputed")

# Average silhouette width and its relative distance from the upper bound.
asw = silh_samples.mean()
print(f"ASW = {asw}\nub = {ub}\nWCRE = {(ub - asw)/ub}")

Min cluster size = 2
K = 1000
ASW = 0.14187544584274292
ub = 0.9005815737721546
WCRE = 0.8424624154273035


In [19]:
# Timed upper bound, this time passing the smallest FastMSC cluster size.
start = time.perf_counter()
uba = upper_bound(D, min_cluster_size)
runtime = time.perf_counter() - start

print(f"RT = {runtime}")
print(f"uba = {uba}\nWCRE = {(uba - asw)/uba}")

RT = 257.9209444580001
uba = 0.9004619498705162
WCRE = 0.8424414869910448


In [20]:
# Compare the FastMSC labelling with the true classes.
ari = adjusted_rand_score(cluster_labels, y)
ami = adjusted_mutual_info_score(cluster_labels, y)

for title, value in (("Adjusted Rand Index", ari), ("Adjusted Mutual Info", ami)):
    print(f"{title} vs. true labels: {value:.3f}")

Adjusted Rand Index vs. true labels: 0.292
Adjusted Mutual Info vs. true labels: 0.589


### n classes = 2

In [342]:
# Draw `n_classes` distinct classes (seeded for repeatability) and keep only
# the sampled rows that belong to them.
n_classes = 2
np.random.seed(872)
classes = df_subset["class"].unique()
chosen = np.random.choice(classes, size=n_classes, replace=False)

in_chosen = df_subset["class"].isin(chosen)
df_subset_2 = df_subset.loc[in_chosen]

df_subset_2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,497,498,499,500,501,502,503,504,505,class
6160,0.018851,0.000484,0.000409,0.000439,0.000400,0.000353,0.026541,0.000961,0.001408,0.001361,...,0.0,0.000018,0.000000,0.000000,0.0,0.0,0.0,"""img155""","""155/155_r280.png""",155
6161,0.022678,0.000552,0.000477,0.000436,0.000626,0.000705,0.028555,0.000983,0.001234,0.001444,...,0.0,0.000038,0.000002,0.000000,0.0,0.0,0.0,"""img155""","""155/155_i230.png""",155
6162,0.009273,0.000072,0.000032,0.000023,0.000009,0.000075,0.039228,0.000764,0.000513,0.000477,...,0.0,0.000014,0.000000,0.000000,0.0,0.0,0.0,"""img155""","""155/155_i150.png""",155
6163,0.018765,0.000486,0.000515,0.000475,0.000382,0.000249,0.024762,0.001001,0.001456,0.001259,...,0.0,0.000005,0.000002,0.000000,0.0,0.0,0.0,"""img155""","""155/155_r305.png""",155
6164,0.020234,0.000520,0.000436,0.000545,0.000470,0.000418,0.026365,0.001198,0.001693,0.001293,...,0.0,0.000018,0.000007,0.000000,0.0,0.0,0.0,"""img155""","""155/155_r210.png""",155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15555,0.015177,0.000009,0.000000,0.000000,0.000000,0.000176,0.039445,0.000029,0.000018,0.000018,...,0.0,0.000075,0.000000,0.000000,0.0,0.0,0.0,"""img389""","""389/389_r230.png""",389
15556,0.026788,0.000018,0.000000,0.000000,0.000002,0.000402,0.043263,0.000099,0.000118,0.000063,...,0.0,0.000014,0.000002,0.000000,0.0,0.0,0.0,"""img389""","""389/389_l5c3.png""",389
15557,0.014992,0.000018,0.000002,0.000007,0.000002,0.000109,0.041502,0.000086,0.000029,0.000014,...,0.0,0.000007,0.000000,0.000000,0.0,0.0,0.0,"""img389""","""389/389_r10.png""",389
15558,0.006085,0.000000,0.000002,0.000000,0.000002,0.000093,0.047058,0.000027,0.000029,0.000018,...,0.0,0.000018,0.000016,0.000011,0.0,0.0,0.0,"""img389""","""389/389_i150.png""",389


In [343]:
# Labels and feature matrix of the selected classes; the features are also
# written to disk.
y2 = df_subset_2.loc[:, "class"]
X2 = df_subset_2.iloc[:, :N_FEATURES].to_numpy()
np.save("arrays/aloi504-2classes.npy", X2)
print(X2.shape, X2.dtype)

# L1 (cityblock) dissimilarities expanded to a square matrix.
condensed = pdist(X2, metric="cityblock")
D2 = squareform(condensed)

# NOTE: `ub` and `cluster_labels` are rebound here; from this point on they
# refer to the subset, not to the full matrix `D` used in earlier cells.
ub = upper_bound(D2)
pamsil_result = kmedoids.pamsil(diss=D2, medoids=n_classes, random_state=42)
cluster_labels = pamsil_result.labels + 1

(80, 504) float64


In [344]:
# How many points ended up in each cluster of the subset clustering.
cluster_sizes = [count for count in Counter(cluster_labels).values()]
min_cluster_size = min(cluster_sizes)
print(f"Min cluster size = {min_cluster_size}")
print(f"K = {len(cluster_sizes)}")

# Silhouette widths on the subset matrix D2, and their mean (ASW).
silh_samples = silhouette_samples(X=D2, labels=cluster_labels, metric="precomputed")
asw = silh_samples.mean()

# Relative gap between the bound and the ASW.
wcre = (ub - asw) / ub
print(f"ASW = {asw}\nub = {ub}\nWCRE = {wcre}")

Min cluster size = 37
K = 2
ASW = 0.6520424278115022
ub = 0.8445522740015754
WCRE = 0.227943079565628


In [345]:
# Upper bound on D2 with the smallest cluster size passed as second argument,
# followed by the relative gap to the achieved ASW.
uba = upper_bound(D2, min_cluster_size)
print(f"uba = {uba}")
print(f"wcre = {(uba - asw)/uba}")

uba = 0.7025223236250574
wcre = 0.07185521956523157


In [310]:
# ARI and AMI of the subset clustering against the true classes `y2`.
ari, ami = (
    metric(cluster_labels, y2)
    for metric in (adjusted_rand_score, adjusted_mutual_info_score)
)

print(f"Adjusted Rand Index vs. true labels: {ari:.3f}")
print(f"Adjusted Mutual Info vs. true labels: {ami:.3f}")

Adjusted Rand Index vs. true labels: 0.854
Adjusted Mutual Info vs. true labels: 0.804


### n classes = 5

In [346]:
# Draw `n_classes` distinct classes with a fixed seed.
n_classes = 5
np.random.seed(25)
classes = df_subset["class"].unique()
chosen = np.random.choice(classes, size=n_classes, replace=False)

# Keep only the sampled rows that belong to the chosen classes.
keep_rows = df_subset["class"].isin(chosen)
df_subset_2 = df_subset.loc[keep_rows]

df_subset_2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,497,498,499,500,501,502,503,504,505,class
1960,0.012526,0.000000,0.000000,0.000000,0.000000,0.000000,0.025232,0.000000,0.000000,0.000000,...,0.000000,0.000005,0.000000,0.000000,0.000000,0.000000,0.000000,"""img50""","""50/50_r280.png""",50
1961,0.013319,0.000000,0.000000,0.000000,0.000000,0.000000,0.026980,0.000000,0.000000,0.000000,...,0.000000,0.000002,0.000000,0.000000,0.000000,0.000000,0.000000,"""img50""","""50/50_i230.png""",50
1962,0.005561,0.000000,0.000000,0.000000,0.000000,0.000000,0.028139,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,"""img50""","""50/50_i150.png""",50
1963,0.013048,0.000000,0.000000,0.000000,0.000000,0.000000,0.026399,0.000005,0.000000,0.000000,...,0.000000,0.000007,0.000000,0.000000,0.000000,0.000000,0.000000,"""img50""","""50/50_r305.png""",50
1964,0.012858,0.000000,0.000000,0.000000,0.000000,0.000000,0.024920,0.000005,0.000000,0.000000,...,0.000000,0.000034,0.000000,0.000000,0.000000,0.000000,0.000000,"""img50""","""50/50_r210.png""",50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31035,0.009336,0.000009,0.000000,0.000000,0.000014,0.003696,0.021346,0.000120,0.000086,0.000061,...,0.000326,0.000088,0.000348,0.002430,0.009002,0.007801,0.001185,"""img776""","""776/776_r230.png""",776
31036,0.012408,0.000038,0.000029,0.000020,0.000032,0.011061,0.037557,0.000194,0.000149,0.000079,...,0.003228,0.000068,0.000298,0.001313,0.004012,0.010503,0.020374,"""img776""","""776/776_l5c3.png""",776
31037,0.010197,0.000038,0.000052,0.000090,0.000086,0.003899,0.022174,0.000095,0.000104,0.000115,...,0.000882,0.000054,0.001754,0.008466,0.012182,0.007623,0.000439,"""img776""","""776/776_r10.png""",776
31038,0.004914,0.000023,0.000020,0.000009,0.000018,0.000215,0.022357,0.000120,0.000068,0.000066,...,0.000703,0.000072,0.000771,0.004406,0.007724,0.011113,0.010234,"""img776""","""776/776_i150.png""",776


In [347]:
# Labels and feature matrix of the selected classes; the features are also
# written to disk.
y2 = df_subset_2["class"]
X2 = df_subset_2.iloc[:, 0:N_FEATURES]
X2 = X2.to_numpy()
np.save("arrays/aloi504-5classes.npy", X2)
print(X2.shape, X2.dtype)

# L1 (cityblock) dissimilarities expanded to a square matrix.
D2 = squareform(pdist(X2, metric="cityblock"))

# NOTE: `ub` and `cluster_labels` are rebound here and now refer to this subset.
ub = upper_bound(D2)

# Use `n_classes` from the selection cell instead of a second hard-coded 5, so
# the number of medoids cannot drift from the number of classes drawn (this
# matches the two-class section).
cluster_labels = (kmedoids.pamsil(diss=D2, medoids=n_classes, random_state=42).labels + 1)

(200, 504) float64


In [348]:
# Cluster sizes of the subset clustering and the smallest one.
label_counts = Counter(cluster_labels)
cluster_sizes = list(label_counts.values())
min_cluster_size = min(cluster_sizes)
print(f"Min cluster size = {min_cluster_size}\nK = {len(cluster_sizes)}")

# Per-sample silhouette widths on D2; the ASW is their mean.
silh_samples = silhouette_samples(X=D2, labels=cluster_labels, metric="precomputed")
asw = silh_samples.mean()

print(f"ASW = {asw}")
print(f"ub = {ub}")
print(f"WCRE = {(ub - asw)/ub}")
print(cluster_sizes)

Min cluster size = 9
K = 5
ASW = 0.47023710372217864
ub = 0.8460500338049767
WCRE = 0.4441970510805828
[74, 9, 47, 36, 34]


In [349]:
# Upper bound on D2 given the smallest cluster size, and the relative gap
# between that bound and the ASW.
uba = upper_bound(D2, min_cluster_size)
relative_gap = (uba - asw) / uba
print(f"uba = {uba}")
print(f"wcre = {relative_gap}")

uba = 0.7478384591964691
wcre = 0.3712049735614896


In [322]:
# AMI and ARI 
ari = adjusted_rand_score(cluster_labels, y2)
ami = adjusted_mutual_info_score(cluster_labels, y2)

print(f"Adjusted Rand Index vs. true labels: {ari:.3f}")
print(f"Adjusted Mutual Info vs. true labels: {ami:.3f}")

Adjusted Rand Index vs. true labels: 0.652
Adjusted Mutual Info vs. true labels: 0.718


In [328]:
def save_results(path, diss_matrix, k_range = range(2, 16), random_state = 42):
    """Cluster `diss_matrix` with PAMSIL for each k and pickle a per-k summary.

    For every k in `k_range` one row is recorded with: k, the cluster labels
    (shifted up by one), the cluster sizes, the smallest cluster size, the
    per-sample silhouette widths, their mean (ASW), the value of
    `upper_bound(diss_matrix)` and the value of
    `upper_bound(diss_matrix, m=<smallest cluster size>)`. The rows are written
    to `path` as a pickled DataFrame.

    Parameters
    ----------
    path : str or path-like
        Destination pickle file. If it already exists, a message is printed
        and nothing is computed or overwritten.
    diss_matrix : array-like
        Precomputed pairwise dissimilarities; passed unchanged to
        `kmedoids.pamsil`, `silhouette_samples(metric='precomputed')` and
        `upper_bound`.
    k_range : iterable of int
        Numbers of medoids to try.
    random_state : int
        Seed forwarded to `kmedoids.pamsil`.

    Returns
    -------
    None
    """
    target = Path(path)
    if target.exists():
        print("Path exists. Aborting.")
        return None

    # Create the output directory up front so that a missing folder cannot make
    # the final write fail after all clusterings have been computed.
    target.parent.mkdir(parents=True, exist_ok=True)

    # This bound does not involve k, so it is evaluated (and printed) once
    # instead of being recomputed in every iteration.
    ub_asw = upper_bound(diss_matrix)
    print(ub_asw)

    results = {
        "n_clusters": [],
        "cluster_labels": [],
        "cluster_sizes": [],
        "min_cluster_size": [],
        "silh_samples": [],
        "asw": [],
        "ub_asw": [],
        "ub_asw_min_cluster_size": [],
    }

    for k in tqdm(k_range):

        # cluster labels
        cluster_labels = (kmedoids.pamsil(diss=diss_matrix, medoids=k, random_state=random_state).labels + 1)

        # cluster sizes
        cluster_sizes = list(Counter(cluster_labels).values())
        min_cluster_size = min(cluster_sizes)

        # silhouette samples and their mean (ASW)
        silh_samples = silhouette_samples(X=diss_matrix, labels=cluster_labels, metric='precomputed')
        asw = np.mean(silh_samples)

        # bound that takes the smallest cluster size of this clustering into account
        ub_asw_min_cluster_size = upper_bound(diss_matrix, m=min_cluster_size)

        results["n_clusters"].append(k)
        results["cluster_labels"].append(cluster_labels)
        results["cluster_sizes"].append(cluster_sizes)
        results["min_cluster_size"].append(min_cluster_size)
        results["silh_samples"].append(silh_samples)
        results["asw"].append(asw)
        results["ub_asw"].append(ub_asw)
        results["ub_asw_min_cluster_size"].append(ub_asw_min_cluster_size)

    # Save to pickle
    pd.DataFrame.from_dict(results).to_pickle(target)


In [332]:
# save results
# Rebuild the two-class dissimilarities from the features saved in the
# two-class section instead of using whatever `D2` currently holds: in
# top-to-bottom order `D2` was last bound to the five-class subset, which
# would otherwise be stored under the "2classes" file name.
two_class_features = np.load("arrays/aloi504-2classes.npy")
two_class_diss = squareform(pdist(two_class_features, metric="cityblock"))
save_results(path="results/aloi504-2classes.pkl", diss_matrix=two_class_diss, k_range=range(2, 3))

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 74.93it/s]

0.8445522740015754



