In [None]:
from itertools import combinations
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import homogeneity_score
from data import get_data_w_labels, DATASETS_2D, DATASETS_2D_Ks, get_data_wo_labels
from spectral import Spectral
from utils import plot_clusters, save_plots

# Module-level accumulator: the plotting helpers below append each figure they
# create here, and the list is later passed to save_plots.
plots = []

In [None]:
def cost(x, ids):
    """Sum of pairwise Euclidean distances between points sharing a cluster id.

    Args:
        x: Iterable of points; each point must support subtraction so that
            ``np.linalg.norm(a - b)`` is defined (e.g. rows of a NumPy array).
        ids: Indexable collection of cluster ids, where ``ids[i]`` is the id
            assigned to the i-th point of ``x``.

    Returns:
        The total, over all distinct ids, of the distances between every
        unordered pair of points carrying that id. Clusters with fewer than
        two points contribute 0, and so does an empty ``ids``.
    """
    per_cluster_totals = []
    for label in set(ids):
        # Collect the members of this cluster in their original order.
        members = [point for pos, point in enumerate(x) if ids[pos] == label]
        pair_distances = (np.linalg.norm(a - b) for a, b in combinations(members, 2))
        per_cluster_totals.append(sum(pair_distances))
    return sum(per_cluster_totals)

In [None]:
def _cluster_grid_shape(n_plots):
    """Return ``(rows, cols)`` of the smallest near-square grid with room for ``n_plots`` panels."""
    # At least two columns are needed because the metrics row under the grid
    # is split into a left and a right axes.
    cols = max(2, int(np.ceil(np.sqrt(n_plots))))
    rows = -(-n_plots // cols)  # ceiling division
    return rows, cols


def handle_2D_dataset(x, y, max_k=10):
    """Cluster a labelled dataset for k = 2..max_k and draw a summary figure.

    The figure consists of:
      * a grid with one panel per k showing the clustering returned by
        ``Spectral().cluster(x, k, 1)``,
      * a row with two line plots (cost vs. k and homogeneity score vs. k),
        each with a red vertical line at the number of distinct labels in ``y``,
      * a final full-width panel showing the points coloured by ``y``.

    The finished figure is appended to the module-level ``plots`` list.

    Args:
        x: Data points, passed unchanged to ``Spectral.cluster``, ``cost`` and
            ``plot_clusters`` (presumably an (n_samples, 2) array — confirm
            against the data loader).
        y: Ground-truth label for every point of ``x``.
        max_k: Largest number of clusters to try; must be at least 2.

    Raises:
        ValueError: If ``max_k`` is smaller than 2 (there would be nothing to plot).
    """
    if max_k < 2:
        raise ValueError(f'max_k must be at least 2, got {max_k}')

    ks = range(2, max_k + 1)
    # Size the grid from the number of panels actually drawn (max_k - 1), not
    # from sqrt(max_k); otherwise panels can spill into the metrics rows.
    rows, cols = _cluster_grid_shape(len(ks))

    fig = plt.figure(figsize=(30, 30 * ((rows + 2) / cols)))
    widths = cols * [10]
    heights = rows * [10] + [10] + [15]
    gs = fig.add_gridspec(ncols=cols, nrows=rows + 2, width_ratios=widths, height_ratios=heights)

    costs, homo = [], []
    for idx, k in enumerate(ks):
        ids = Spectral().cluster(x, k, 1)
        costs.append(cost(x, ids))
        homo.append(homogeneity_score(y, ids))

        # Fill the cluster grid row by row.
        ax = fig.add_subplot(gs[idx // cols, idx % cols], aspect='equal')
        plot_clusters(ax, x, ids)
        ax.set_title(f'{k} clusters', fontsize=30)
        ax.label_outer()

    # Second-to-last row: cost on the left half, homogeneity on the right half.
    mid = (cols + 1) // 2
    ax_err, ax_homo = fig.add_subplot(gs[-2, :mid]), fig.add_subplot(gs[-2, mid:])
    n_true = len(set(y))

    ax_err.set_title('Number of clusters vs cost', fontsize=30)
    ax_err.plot(ks, costs, 'bx-', ms=10, mec='k')
    ax_err.axvline(n_true, c='red')

    ax_homo.set_title('Number of clusters vs homogenity score', fontsize=30)
    ax_homo.plot(ks, homo, 'bx-', ms=10, mec='k')
    ax_homo.axvline(n_true, c='red')

    # Last row: the reference labelling across the full width.
    ax_true = fig.add_subplot(gs[-1, :], aspect='equal')
    ax_true.set_title('True clusters', fontsize=30)
    plot_clusters(ax_true, x, y)

    fig.tight_layout(pad=3.0)
    plots.append(fig)


# Build and display one summary figure per labelled 2-D dataset, pairing each
# dataset with its own upper bound on the number of clusters.
for dataset, max_k in zip(DATASETS_2D, DATASETS_2D_Ks):
    x, y = get_data_w_labels(dataset)
    # Cast labels to integers, flatten them to 1-D and shift them down by one.
    y = y.astype(int).ravel() - 1
    handle_2D_dataset(x, y, max_k=max_k)
    plt.show()



In [None]:
def handle_rp_dataset(x, y):
    """Cluster ``x`` into two groups and print how the result compares with ``y``.

    Prints three lines: the cost of the spectral clustering, the cost of the
    supplied labelling ``y``, and the homogeneity score of the clustering
    measured against ``y``.

    Args:
        x: Data points, passed unchanged to ``Spectral.cluster`` and ``cost``.
        y: Reference label for every point of ``x``.
    """
    # The third argument (1.) is forwarded as-is; its meaning is defined by Spectral.cluster.
    predicted = Spectral().cluster(x, 2, 1.)

    spectral_cost = cost(x, predicted)
    print(f'Cost after spectral: {spectral_cost}')

    reference_cost = cost(x, y)
    print(f'Cost from data:      {reference_cost}')

    score = homogeneity_score(y, predicted)
    print(f'Homogenity score:    {score}')


# Load the labelled rp dataset and flatten its labels into a 1-D integer array.
x_rp, y_rp = get_data_w_labels('data/rp.data')
y_rp = np.array(y_rp, dtype=int).ravel()
# Recode the labels: 2 becomes 0, 4 becomes 1, anything else is left as it is.
y_rp = np.where(y_rp == 2, 0, np.where(y_rp == 4, 1, y_rp))
handle_rp_dataset(x_rp, y_rp)

In [None]:
def handle_9D_dataset(x, max_k=20):
    """Plot the clustering cost of ``x`` for every k from 2 to ``max_k``.

    For each k the data is clustered with ``Spectral().cluster(x, k, 1.)`` and
    scored with ``cost``. The resulting curve is drawn on a new 10x10 figure,
    which is also appended to the module-level ``plots`` list.

    Args:
        x: Data points, passed unchanged to ``Spectral.cluster`` and ``cost``.
        max_k: Largest number of clusters to try.
    """
    ks = range(2, max_k + 1)
    costs = [cost(x, Spectral().cluster(x, k, 1.)) for k in ks]

    fig, ax_err = plt.subplots(figsize=(10, 10))
    plots.append(fig)
    ax_err.set_title('Number of clusters vs cost')
    ax_err.plot(ks, costs, 'bx-', ms=10, mec='k')


# Load the unlabelled dataset from data/dane_9D.txt and plot its clustering
# cost against the number of clusters (default range of handle_9D_dataset).
x_9d = get_data_wo_labels('data/dane_9D.txt')
handle_9D_dataset(x_9d)

In [None]:
# Hand every figure collected in `plots` to save_plots with the target name
# 'spectral.pdf' (presumably written as one PDF — confirm in utils.save_plots).
save_plots(plots, 'spectral.pdf')
