# Lloyd's Heuristic

#### Libraries

In [1]:
from importlib import import_module

from src import init, metrics
from src.algorithms import lloyd

#### Experiment configuration

In [2]:
# Name of the loader module, resolved later as src.datasets.<DATASET_MODULE>.
DATASET_MODULE = 'adult'
# YAML configuration file handed to the loader's load(yamlpath=...).
DATASET_CONFIG = 'data/datasets/Adult_race/adult.yaml'
# Number of clusters; used for both centroid initialisation and lloyd.run.
N_CLUSTERS = 5
# Value passed as random_state to the centroid initialisation routine.
RANDOM_STATE = 0
# Name of the callable looked up in src.init to produce the initial centroids.
INIT_METHOD = 'kmeans_plusplus'

#### Load dataset

In [3]:
# Resolve the loader module from its configured name, then read the dataset
# described by the YAML configuration file.
dataset_module = import_module(f"src.datasets.{DATASET_MODULE}")
dataset = dataset_module.load(yamlpath=DATASET_CONFIG)

# Pull out the two entries used by the rest of the notebook.
# NOTE(review): 'X' is presumably the feature matrix and 's' the sensitive
# attribute per sample — confirm against the loader.
X, s = dataset['X'], dataset['s']

Loading dataset configuration from 'data/datasets/Adult_race/adult.yaml'
Loading processed Adult dataset (Adult_race) from 'data/datasets/Adult_race/adult.csv'
  shape: (46033, 28)
╭────────────┬───────────┬──────────────────┬───────────────────────┬────────────────────╮
│    dataset │   samples │   dimensionality │   sensitive attribute │   sensitive groups │
├────────────┼───────────┼──────────────────┼───────────────────────┼────────────────────┤
│ Adult_race │     46033 │               26 │                  race │                  5 │
╰────────────┴───────────┴──────────────────┴───────────────────────┴────────────────────╯


#### Initialise cluster centroids

In [4]:
# Look up the initialisation routine by name on the `init` module that the
# imports cell already binds; a second dynamic import of "src.init" is redundant.
init_method = getattr(init, INIT_METHOD)
# Produce the starting centroids for the configured number of clusters.
init_centroids = init_method(X=X, n_clusters=N_CLUSTERS, random_state=RANDOM_STATE)
print(f"shape: {init_centroids.shape}")

shape: (5, 26)


#### Run Lloyd's heuristic

In [5]:
# Cluster X with Lloyd's heuristic, starting from the initial centroids.
# NOTE(review): `c` is presumably the per-sample cluster assignment — confirm
# against lloyd.run.
c, centroids = lloyd.run(
    X=X,
    n_clusters=N_CLUSTERS,
    init_centroids=init_centroids,
)

Configuration:
╭─────────────┬──────────────┬────────────┬────────╮
│   algorithm │   n_clusters │   max_iter │    tol │
├─────────────┼──────────────┼────────────┼────────┤
│       Lloyd │            5 │        200 │ 0.0001 │
╰─────────────┴──────────────┴────────────┴────────╯
Initialising centroids
Running algorithm
╭──────┬───────────────┬───────────────┬────────────────╮
│ iter │     objective │ reassignments │ centroid shift │
├──────┼───────────────┼───────────────┼────────────────┤
│    1 │  133509.77497 │         46033 │              - │
│    2 │   81471.81227 │          1749 │    3.56439e+00 │
│    3 │   80988.55106 │           436 │    2.87519e-01 │
│    4 │   80942.15572 │           163 │    6.74017e-02 │
│    5 │   80934.72284 │            77 │    2.49407e-02 │
│    6 │   80933.27193 │            15 │    1.20275e-02 │
│    7 │   80933.20252 │             7 │    2.55795e-03 │
│    8 │   80933.19491 │             1 │    7.01596e-04 │
│    9 │   80933.19353 │             1 │ 

#### Evaluate

In [6]:
# Evaluate the clustering; the call prints its own results table, so the
# return value is deliberately discarded by binding it to `_`.
_ = metrics.evaluate(
    X=X,
    s=s,
    c=c,
    centroids=centroids,
)

Evaluating against all metrics
╭───────────────────────────┬───────────╮
│ average cluster disparity │ 0.0169522 │
│ k-means objective         │ 1.75816   │
│ fair centroid objective   │ 2.82944   │
│ fair k-means objective    │ 2.17318   │
╰───────────────────────────┴───────────╯
