In [33]:
from pathlib import Path
import os,sys
import pickle
import pandas as pd

from tqdm.notebook import tqdm

sys.path.insert(0, str(Path().resolve().parents[1]))
import fusemix.pipeline as pipeline 
from fusemix.imputation import MultipleImputer
from fusemix.clustering import external_metrics, compute_MICA, compute_kpod, compute_fusemix,internal_metrics
from gower import gower_matrix
from sklearn.impute import KNNImputer
import warnings

import importlib
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans, SpectralClustering
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

import seaborn as sns 

warnings.filterwarnings("ignore")

In [2]:
def read_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)
    
def write_pickle(var, path):
    with open(path, 'wb') as f:
       pickle.dump(var, f)

In [3]:
test_data_complete = read_pickle("../../test_data/fetched/dataset_15.pkl")
test_data_missing = read_pickle("../../test_data/missing_data/15/0.75_0.5_0.0/data_pipeline_0.pkl")
test_data = read_pickle("../../test_data/imputed_data/15/0.75_0.5_0.0/data_imputed_0.pkl")

In [4]:
test_data_complete['X_complete'].head(2)

Unnamed: 0,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses
0,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0
1,5.0,4.0,4.0,5.0,7.0,10.0,3.0,2.0,1.0


In [5]:
test_data[0].head(2)

Unnamed: 0,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses
0,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0
1,5.0,4.0,5.756944,5.0,7.0,10.0,5.985938,2.0,1.889583


In [6]:
test_data_complete['num_classes']

2

In [7]:
# this needs to be set for each experiment, but not for each sampling
seed = 42
rng = np.random.default_rng(seed)
# ================================= #

cat_mask = test_data_complete['cat_mask']
view = test_data[0]
n_features = view.shape[1]
p_min = 0.75*n_features
p_max = 0.85*n_features

num_projections = 3

In [8]:
def generate_projection(data,cat_mask):
    # create projection of the view
    view = data.copy()
    alfa = rng.random()
    selected_features = rng.choice(a=range(n_features), size=round(p_min+alfa*(p_max-p_min)), replace=False)  
    cat_mask_projected = cat_mask[selected_features]
    projected_view = view.iloc[:,selected_features]
    return (projected_view,cat_mask_projected)

all_data  = []
for view in test_data:
    for i in range(num_projections):
        all_data.append(generate_projection(data=view, cat_mask=cat_mask))

In [9]:
def compute_sparse_similarity(data, cat_mask, k):    
    gower_dist = gower_matrix(data, cat_features=cat_mask)
    neighbors_idx = np.argsort(gower_dist, axis=1)[:, 1:k+1]

    # build sparse matrix 
    rows = np.repeat(np.arange(gower_dist.shape[0]), k)
    cols = neighbors_idx.flatten()
    A = csr_matrix((gower_dist[rows, cols], (rows, cols)), shape=gower_dist.shape)

    # simmetrize matrix  
    A = 0.5 * (A + A.T)
    # distance -> similarity
    A.data = 1-A.data

    # normalize where each row sums to 1
    row_sums = np.array(A.sum(axis=1)).flatten()
    row_sums[row_sums == 0] = 1  # avoid division by zero
    A = A.multiply(1 / row_sums[:, None])
    return A

all_projections = [compute_sparse_similarity(prj_view,prj_mask,10) for (prj_view, prj_mask) in all_data]

In [10]:
def compute_spectral(sparse_affinity_mat):
    sc = SpectralClustering(n_clusters=test_data_complete['num_classes'],
                            random_state=seed,
                            affinity="precomputed"
                            #assign_labels="cluster_qr"
                            )

    return sc.fit(sparse_affinity_mat).labels_

all_labels = [compute_spectral(aff_mat) for aff_mat in all_projections]

In [11]:
CO = (np.array(all_labels)[:, :, None] == np.array(all_labels)[:, None, :]).mean(axis=0)

In [12]:
def consensus_clustering(CO, threshold=0.5):
    """
    labels: (M, N) array of cluster assignments
    threshold: co-association cutoff (e.g., 0.5)
    """
    # threshold
    adj = (CO >= threshold).astype(int)
    graph = csr_matrix(adj)

    sc = SpectralClustering(n_clusters=test_data_complete['num_classes'],
                            random_state=seed,
                            affinity="precomputed"
                            #assign_labels="cluster_qr"
                            )
    return sc.fit(graph).labels_

In [15]:
predicted_labels = consensus_clustering(CO)
true_labels = test_data_complete['y_complete'].to_numpy().ravel()

{'ari': 0.9021072608540192,
 'ami': 0.8256714277997358,
 'vm': 0.8258681724158916,
 'cs': 0.8220024982245699}

In [None]:
len(cat_mask)

9

In [None]:
num_clusters = test_data_complete['num_classes']
predicted_mica = compute_MICA(test_data,num_clusters,seed)
predicted_kpod =  compute_kpod(test_data_missing.amputer.incomplete_dataset,num_clusters)
predicted_fusemix = compute_fusemix(test_data,cat_mask,
                    num_clusters,
                    nn_snf=10,
                    seed=seed)

In [None]:
predicted_kpod[]

AttributeError: 'tuple' object has no attribute 'shape'

In [32]:
print(external_metrics(true_labels,predicted_labels))
print(external_metrics(true_labels,predicted_mica))
print(external_metrics(true_labels,predicted_kpod[0]))


{'ari': 0.9021072608540192, 'ami': 0.8256714277997358, 'vm': 0.8258681724158916, 'cs': 0.8220024982245699}
{'ari': 0.8740924498588075, 'ami': 0.7822526874931329, 'vm': 0.7824999602689191, 'cs': 0.7836105678038735}
{'ari': 0.7762605817439933, 'ami': 0.6768644211199069, 'vm': 0.6772393802332708, 'cs': 0.6932098705358005}


In [46]:
num_clusters

2

In [47]:
test_data_complete['X_complete']

Unnamed: 0,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses
0,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0
1,5.0,4.0,4.0,5.0,7.0,10.0,3.0,2.0,1.0
2,3.0,1.0,1.0,1.0,2.0,2.0,3.0,1.0,1.0
3,6.0,8.0,8.0,1.0,3.0,4.0,3.0,7.0,1.0
4,4.0,1.0,1.0,3.0,2.0,1.0,3.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...
694,3.0,1.0,1.0,1.0,3.0,2.0,1.0,1.0,1.0
695,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
696,5.0,10.0,10.0,3.0,7.0,3.0,8.0,10.0,2.0
697,4.0,8.0,6.0,4.0,3.0,4.0,10.0,6.0,1.0


In [48]:
sparse_mat = compute_sparse_similarity(test_data_complete['X_complete'], cat_mask, 10)
cca_spectral = SpectralClustering(n_clusters=num_clusters, random_state=seed, affinity="precomputed").fit(sparse_mat).labels_

In [49]:
print(external_metrics(cca_spectral,predicted_labels))
print(external_metrics(cca_spectral,predicted_mica))
print(external_metrics(cca_spectral,predicted_kpod[0]))

{'ari': 0.9533413030013026, 'ami': 0.9035578701309003, 'vm': 0.9036663413245779, 'cs': 0.9024901520325989}
{'ari': 0.9475212988640727, 'ami': 0.8996134776455079, 'vm': 0.8997270840673357, 'cs': 0.9040818421466608}
{'ari': 0.8136402062688123, 'ami': 0.7503644839144535, 'vm': 0.7506531370811965, 'cs': 0.7710361684306213}


In [42]:
cca_kmeans = KMeans(n_clusters=num_clusters).fit(test_data_complete['X_complete']).labels_

In [43]:
print(external_metrics(cca_kmeans,predicted_labels))
print(external_metrics(cca_kmeans,predicted_mica))
print(external_metrics(cca_kmeans,predicted_kpod[0]))

{'ari': 0.9076090768258467, 'ami': 0.8543718396534232, 'vm': 0.8545372912352716, 'cs': 0.84495049853657}
{'ari': 0.9589662318964268, 'ami': 0.9227555595967385, 'vm': 0.9228438676542212, 'cs': 0.918045850153877}
{'ari': 0.8899860026536389, 'ami': 0.8306632044290962, 'vm': 0.8308610493495857, 'cs': 0.844712123188088}


In [34]:
gower_dist_complete = gower_matrix(test_data_complete['X_complete'],cat_features=cat_mask)

print(internal_metrics(predicted_labels, gower_dist_complete, test_data_complete['X_complete']))
print(internal_metrics(predicted_mica, gower_dist_complete, test_data_complete['X_complete']))
print(internal_metrics(predicted_kpod[0], gower_dist_complete, test_data_complete['X_complete']))

{'sh': 0.6365898251533508, 'ch': 977.2736859700723, 'db': 0.7706203861140039}
{'sh': 0.6469982862472534, 'ch': 1012.7287007507688, 'db': 0.7612523191724858}
{'sh': 0.6496018767356873, 'ch': 978.7019520839181, 'db': 0.7671818686097436}
