<hr style="border:2px solid RosyBrown">
<hr style="border:1px solid Wheat">

# Constrained Markov Clustering

<hr style="border:1px solid Wheat">
<hr style="border:2px solid RosyBrown">

Load all packages and modules

In [20]:
%load_ext autoreload
%autoreload 2

#------------------------------------------------------------------------------#
# Import modules
from sklearn import datasets, decomposition
from sklearn.metrics.cluster import normalized_mutual_info_score
import sys
import os
import numpy as np
import pandas as pd
#------------------------------------------------------------------------------#

#------------------------------------------------------------------------------#
# Import custom classes and functions
# Make the `../src` directory (resolved against the current working directory)
# importable so the project-local `models` and `utils` imports below resolve.
# The path is only prepended when it is not already present, so re-running the
# cell does not add duplicates.
module_path = os.path.abspath('../src')
if sys.path.count(module_path) == 0:
    sys.path.insert(0, module_path)
    
from models.data_gen import dataGen
from models.CoMaC import CoMaC

from utils.callback import save_results
from utils.helperFunc import partition_to_labels, generate_int_labels
from utils.plotting import show_clustering, show_transition_prob
#------------------------------------------------------------------------------#

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [49]:
#------------------------------------------------------------------------------#
# Select from different datasets
#
# Set `dataset_str` to one of the names handled below. Any other value falls
# through to the synthetic data returned by dataGen.generateCircles().
# After this cell: X holds the samples (rows) and features (columns),
# labels_true holds the output of generate_int_labels, and M is the number of
# distinct values in labels_true.
#------------------------------------------------------------------------------#

dataset_str = 'USER'

# Datasets stored as a space-delimited feature file plus a separate reference
# labelling file: name -> (feature file, label file, last feature column).
paired_file_datasets = {
    'VERTEBRAL': ("../data/data-sets/vertebral.data",
                  "../data/reference-labelling/vertebral.ref", 5),
    'SEGMENTATION': ("../data/data-sets/segmentation.data",
                     "../data/reference-labelling/segmentation.ref", 4),
    'USER': ("../data/data-sets/user.data",
             "../data/reference-labelling/user.ref", 4),
}

if dataset_str == 'IRIS':
    bunch = datasets.load_iris()
    X, labels_true = bunch.data[:, :], bunch.target

elif dataset_str == 'WINE':
    bunch = datasets.load_wine()
    X, labels_true = bunch.data[:, :], bunch.target

elif dataset_str == 'WINE_SCALED':
    # Columns 0..12 are used as features, column 13 as the label.
    table = pd.read_csv("../data/rand_samples_links/wine-scaled.in",
                        header=None, delimiter=",")
    X = table.loc[:, 0:12].to_numpy()
    labels_true = table.loc[:, 13].to_numpy()

elif dataset_str == 'GLASS':
    # Columns 0..8 are used as features, column 9 as the label.
    table = pd.read_csv("../data/data-sets/glass.csv", header=None)
    X = table.loc[:, 0:8].to_numpy()
    labels_true = table.loc[:, 9].to_numpy()

elif dataset_str == 'ECOLI':
    # Columns 0..6 are projected onto 5 principal components (PCA is fitted on
    # every row), then only the first 327 rows and labels are kept.
    table = pd.read_csv("../data/data-sets/ecoli.csv", header=None)
    raw_features = table.loc[:, 0:6].to_numpy()
    pca = decomposition.PCA(n_components=5)
    pca.fit(raw_features)
    X = pca.transform(raw_features)[:327, :]
    labels_true = table.loc[:, 7].to_numpy()[:327]

elif dataset_str in paired_file_datasets:
    data_file, ref_file, last_feature_col = paired_file_datasets[dataset_str]
    # Both files are space-delimited, have no header row, and their first line
    # is skipped.
    read_options = dict(skiprows=[0], header=None, delimiter=" ")
    features_df = pd.read_csv(data_file, **read_options)
    X = features_df.loc[:, 0:last_feature_col].to_numpy()
    labels_df = pd.read_csv(ref_file, **read_options)
    labels_true = labels_df.to_numpy()

else:
    # Fallback for every unrecognised name: synthetic data.
    dataGenerator = dataGen()
    P, V_true, X = dataGenerator.generateCircles()
    labels_true = partition_to_labels(V_true)

labels_true = generate_int_labels(labels_true)
M = np.unique(labels_true).size

print(f'{dataset_str} dataset with {X.shape[1]} features, {X.shape[0]} samples and {M} classes.')


USER dataset with 5 features, 403 samples and 4 classes.


In [50]:
#------------------------------------------------------------------------------#
# Parameters

# Hyper-parameters shared by the cells below.
knns=20         # forwarded to CoMaC(knns=...)
restarts=5      # forwarded to CoMaC(restarts=...)
percentage=0.2  # forwarded to comac.constraints(percentage=...)

# Previously 0.0, a value that was never used: the sequential-algorithm cell
# reassigns final_beta to 0.5 before its first use, and every recorded output
# in this notebook reports beta=0.5. Declaring the effective value here keeps
# the parameter cell truthful.
final_beta=0.5

# Seed NumPy's global random generator so a "Restart & Run All" starts from the
# same random state every time.
# NOTE(review): whether CoMaC / constraint sampling draw from NumPy's global
# generator is not visible from this notebook -- confirm, and seed any other
# generator they use.
random_seed=0
np.random.seed(random_seed)

#------------------------------------------------------------------------------#

In [51]:
#------------------------------------------------------------------------------#
# Initialize and generate constraints

# Build the model from the class count M and the parameters chosen above.
comac = CoMaC(M=M, knns=knns, restarts=restarts)

# Options for constraint generation; wrong_percentage and ClassLabels are fixed
# here rather than taken from the parameter cell.
constraint_options = dict(percentage=percentage, wrong_percentage=0, ClassLabels=2)
comac.constraints(X, labels_true, **constraint_options)
#------------------------------------------------------------------------------#

In [52]:
#------------------------------------------------------------------------------#
# Sequential algorithm
# final_beta is (re)assigned here, so this is the value every later cell sees.
final_beta = 0.5
banner = '~'*80
print(banner + f'\n Sequential Algorithm: beta = {final_beta}')

# Run the sequential algorithm at the single beta value chosen above.
sequential_result = comac.cluster_seq(X, beta=final_beta)
cost_seq, V_seq = sequential_result

#------------------------------------------------------------------------------#

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Sequential Algorithm: beta = 0.5
*************************Clustering Finished!*************************


In [53]:
#------------------------------------------------------------------------------#
# Convert the sequential result V_seq into one label per sample, then score it
# against the reference labels with normalized mutual information.
labels_seq = partition_to_labels(V_seq)
NMI_seq = normalized_mutual_info_score(labels_true=labels_true,
                                       labels_pred=labels_seq)

print(f'Sequential NMI = {NMI_seq} (beta={final_beta})')
#------------------------------------------------------------------------------#

Sequential NMI = 0.7484558685537503 (beta=0.5)


In [18]:
#------------------------------------------------------------------------------#
# Annealing algorithm

# Annealing run: final_beta comes from the earlier cells, the step size is
# fixed here.
annealing_options = dict(final_beta=final_beta, step_size=-0.5)

print('~'*80 + f'\n Annealing Algorithm: final beta = {final_beta}')
annealing_result = comac.cluster_ann(X, **annealing_options)
cost_ann, V_ann, beta_vec = annealing_result

#------------------------------------------------------------------------------#

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Annealing Algorithm: final beta = 0.5
Starting with beta = 1
*************************Clustering Finished!*************************
Starting with beta = 0.5
*************************Clustering Finished!*************************


In [19]:
#------------------------------------------------------------------------------#

# Score the annealing result at every beta value: labels_ann collects one label
# vector per entry of beta_vec, NMI_ann the matching NMI against labels_true.
labels_ann = []

# Allocate the scores with an explicit float dtype. np.zeros_like(beta_vec)
# would inherit beta_vec's dtype, so an integer-valued beta schedule would
# silently truncate every NMI score when it is stored.
NMI_ann = np.zeros(len(beta_vec), dtype=float)

for idx, beta in enumerate(beta_vec):
    # V_ann is indexed by annealing step along its first axis.
    labels_ann.append(partition_to_labels(V_ann[idx, :, :]))
    NMI_ann[idx] = normalized_mutual_info_score(labels_true, labels_ann[-1])
    print(f'NMI = {NMI_ann[idx]} (beta={beta}) \n')
    
#------------------------------------------------------------------------------#

NMI = 0.9305506621576433 (beta=1.0) 

NMI = 0.94878832043511 (beta=0.5) 



<hr style="border:2px solid DarkKhaki">
