In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.metrics.cluster import adjusted_rand_score, rand_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel
from time import time
import networkx as nx

# import warnings filter
from warnings import simplefilter
# ignore all future warnings

In [2]:
def get_data():
    data = np.loadtxt('./semeion.data', dtype=np.int8)
    return data[:, :256], data[:, 256:]


def one_hot_decode(y: np.array):
    return np.argmax(y, axis=1)


def get_data_transformed():
    data = np.loadtxt('./semeion.data', dtype=np.int8)
    return data[:, :256], one_hot_decode(data[:, 256:])


In [3]:
class NoDR:
    def __init__(self, n_components):
        pass
    def fit_transform(self, X):
        return X.copy()


In [4]:
simplefilter(action='ignore', category=FutureWarning)
ks = range(5,16)
ds = (2,64,128, 256)

In [5]:
def plotimg(row: np.array):
    img = row.reshape((16,16))
    plt.imshow(img)
    plt.show()

In [6]:
def plot2D(X, y, title):
    'Plot first 2 columns and color according to labels'
    plt.scatter(X[:,0], X[:,1], c=y)
    plt.title(title)
    plt.show()
    plt.clf()

In [7]:
def print_rand(y_true, y_pred):
    'Print and return the Rand Indexes'
    ari = adjusted_rand_score(y_true, y_pred)
    ri = rand_score(y_true, y_pred)
    print('adjusted_rand_score', round(ari * 100, 2), '%')
    print('rand_score', round(ri * 100, 2), '%', end='\n\n')
    return ari, ri

In [8]:
class N_Cut():
    def __init__(self, n_clusters):
        self.n_clusters = n_clusters


    def __str__(self):
        return 'NCUT'


    def fit(self, X, verbose=False):
        self.n_samples, self.n_features = X.shape

        A = rbf_kernel(X)

        np.fill_diagonal(A, 0.0)

        G = nx.from_numpy_matrix(A)
        L = nx.normalized_laplacian_matrix(G)
        w = np.linalg.eigvals(L.A)
        if verbose:
         print("Largest eigenvalue:", max(w))
         print("Smallest eigenvalue:", min(w))
         print(f"first k={self.n_clusters} nonzero eigenvalues:", w[1:self.n_clusters+1])

        w, v = np.linalg.eig(L.A)

        if not abs(w[0]) < 1e-10:
            raise ValueError(f'First eigenvalue is {w[0]}.\nMust be close to zero.')

        U = v[:, 1:self.n_clusters+1]

        kmeans = KMeans(n_clusters=self.n_clusters).fit(U)

        self.labels_ = kmeans.labels_

In [9]:
def NCut_Result(p=False):

    columns = ('clustering_model', 'dim_red_model', 'd', 'k', 'time', 'ARI', 'RI')
    rows = []

    X, y = get_data_transformed()
    for k in ks:
        for d in ds:
            if d != 256:
                embedding_model = PCA
            else:
                embedding_model = NoDR
            start_time = time()
            embedding = embedding_model(n_components=d)  # DR model
            X_transformed = embedding.fit_transform(X)
            model = N_Cut(k)  # clustering model
            model.fit(X_transformed)
            elapsed = time() - start_time
            title = f'Results for {model} on {d}-{embedding_model.__name__} - k={k} ({elapsed:.02f}s)'
            print(title)
            ari, ri = print_rand(y, model.labels_)
            if p: plot2D(X_transformed, model.labels_, title)
            result = (str(model), embedding_model.__name__, d, k, elapsed, ari, ri)
            rows.append(result)
    df = pd.DataFrame(data=rows, columns=columns)
    print(df)
    df.to_csv('NCut.csv', index=False)

In [10]:
def results_analysis():
    df = pd.read_csv('NCut.csv')

    dfg_k = df.groupby('k').mean()[['time','ARI','RI']]
    print('averages by number of cluster:')
    print(dfg_k.sort_values('ARI', ascending=False))
    print()

    dfg_dr = df.groupby('dim_red_model').mean()[['time','ARI','RI']]
    print('averages by DR model:')
    print(dfg_dr.sort_values('ARI', ascending=False))
    print()


    print('best ARI achieved by:')
    print(df.sort_values('ARI', ascending=False).head(3))


In [11]:
def main():

    NCut_Result()
    results_analysis()



In [12]:
if __name__ == '__main__':
    main()

Results for NCUT on 2-PCA - k=5 (13.86s)
adjusted_rand_score 19.9 %
rand_score 76.78 %

Results for NCUT on 64-PCA - k=5 (12.65s)
adjusted_rand_score 24.25 %
rand_score 79.8 %

Results for NCUT on 128-PCA - k=5 (12.15s)
adjusted_rand_score 23.75 %
rand_score 79.68 %

Results for NCUT on 256-NoDR - k=5 (12.02s)
adjusted_rand_score 23.66 %
rand_score 79.67 %

Results for NCUT on 2-PCA - k=6 (12.14s)
adjusted_rand_score 21.37 %
rand_score 79.51 %

Results for NCUT on 64-PCA - k=6 (12.31s)
adjusted_rand_score 28.71 %
rand_score 82.64 %

Results for NCUT on 128-PCA - k=6 (12.62s)
adjusted_rand_score 30.27 %
rand_score 83.33 %

Results for NCUT on 256-NoDR - k=6 (11.93s)
adjusted_rand_score 29.63 %
rand_score 83.23 %

Results for NCUT on 2-PCA - k=7 (12.06s)
adjusted_rand_score 19.64 %
rand_score 78.99 %

Results for NCUT on 64-PCA - k=7 (12.05s)
adjusted_rand_score 33.28 %
rand_score 85.33 %

Results for NCUT on 128-PCA - k=7 (11.90s)
adjusted_rand_score 33.31 %
rand_score 85.36 %

Results 