In [1]:
from pathlib import Path
import os,sys
import pickle
import pandas as pd
import seaborn as sns 
import numpy as np
import importlib
import yaml
from tqdm.notebook import tqdm
from itertools import product
import time
from datetime import datetime

sys.path.insert(0, str(Path().resolve().parents[1]))

from gower import gower_matrix

import fusemix.clustering as clust_utils 
import fusemix.mige as migeClust
from fusemix.mica import compute_MICA
from fusemix.mixture_missing import run_mghm, run_mcnm
from fusemix.evaluation_metrics import *

# Re-execute the two fusemix modules that are referenced through their module
# objects, so edits made on disk are picked up without restarting the kernel.
# Names bound earlier with `from ... import ...` are NOT refreshed by this.
importlib.reload(migeClust)
importlib.reload(clust_utils)


import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')


Error importing in API mode: ImportError('On Windows, cffi mode "ANY" is only "ABI".')
Trying to import in ABI mode.


In [2]:
def read_pickle(path):
    """Load and return the Python object stored in the pickle file at ``path``.

    The file is opened in binary mode and closed again before returning.
    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(path, mode='rb') as handle:
        loaded = pickle.load(handle)
    return loaded
    
def write_pickle(var, path):
    """Pickle ``var`` into the file at ``path``.

    The file is opened in binary write mode, so an existing file at ``path``
    is overwritten. Nothing is returned.
    """
    with open(path, mode='wb') as handle:
        pickle.dump(var, handle)

In [3]:
# Load the simulation settings. The loop below reads `dataset_ids`,
# `md_param_grid` and `n_runs` from this mapping.
# The encoding is pinned to UTF-8 so the file is decoded identically on every
# platform; without it, open() falls back to the locale encoding.
with open("../../test_data/simulation_config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

In [4]:
# Accumulator for the simulation output: one entry per configuration tuple,
# filled in by the simulation loop.
simulations_results = dict()

In [5]:
# Wall-clock time (seconds, from time.perf_counter) spent in each clustering
# method; every entry starts at 0 and is overwritten by the simulation loop.
time_computation = dict.fromkeys(
    (
        'mige_no_proj',
        'mige_proj',
        'mige_no_proj_mutual',
        'mige_proj_mutual',
        'mica',
        'kpod',
        'mcnm',
        'mghm',
        'sc_knn',
        'sc_mi',
        'km_knn',
        'km_si',
        'cca_spectral',
        'cca_kmeans',
    ),
    0,
)

# Simulation run

In [6]:
# Full factorial design: every dataset x every missing-data setting
# (props, mf_proportions, mnar_proportions) x every run index.
# Each element is a 5-tuple in exactly that order.
simulation_axes = (
    cfg['dataset_ids'],
    cfg['md_param_grid']['props'],
    cfg['md_param_grid']['mf_proportions'],
    cfg['md_param_grid']['mnar_proportions'],
    range(cfg['n_runs']),
)
configs = [*product(*simulation_axes)]


# Run every clustering method on every simulation configuration and collect
# internal/external clustering metrics plus per-method wall-clock timings in
# `simulations_results[conf]`.
for conf in tqdm(configs):

    # conf = (dataset_id, props, mf_proportion, mnar_proportion, run index).
    # The run index is used both as the random seed and to pick the
    # pre-generated missing/imputed data files for this run.
    dataset_id, md_prop, mf_prop, mnar_prop, seed = conf
    md_config = f"{md_prop}_{mf_prop}_{mnar_prop}"

    # ------------------------------------------------------------------
    # Load data for simulation
    # ------------------------------------------------------------------
    test_data_complete = read_pickle(
        f"../../test_data/fetched/dataset_{dataset_id}.pkl"
    )
    test_data_missing = read_pickle(
        f"../../test_data/missing_data/{dataset_id}/{md_config}/data_pipeline_{seed}.pkl"
    )
    test_data = read_pickle(
        f"../../test_data/imputed_data/{dataset_id}/{md_config}/data_imputed_{seed}.pkl"
    )

    # ------------------------------------------------------------------
    # Prepare data input
    # ------------------------------------------------------------------
    incomplete_data = test_data_missing.amputer.incomplete_dataset
    complete_data = test_data_complete['X_complete']
    true_labels = test_data_complete['y_complete'].values.flatten()
    cat_mask = test_data_complete['cat_mask']
    num_classes = test_data_complete['num_classes']
    multiple_imputed_data = test_data

    # ------------------------------------------------------------------
    # MIGE
    # ------------------------------------------------------------------
    num_projections = 5
    num_imputations = len(multiple_imputed_data)

    # Co-association threshold scales with the number of imputed datasets.
    co_threshold = 1 / np.sqrt(num_imputations)

    start = time.perf_counter()
    mige_labels_no_proj = migeClust.mige(
        multiple_imputed_data,
        n_clusters=num_classes,
        cat_mask=cat_mask,
        seed=seed,
        p_min=1,
        p_max=1,
        num_projections=1,
        k_nn=15,
        co_threshold=co_threshold,
        mutual=False,
    )
    time_computation['mige_no_proj'] = time.perf_counter() - start

    start = time.perf_counter()
    mige_labels_proj = migeClust.mige(
        multiple_imputed_data,
        n_clusters=num_classes,
        cat_mask=cat_mask,
        seed=seed,
        p_min=0.8,
        p_max=0.95,
        num_projections=num_projections,
        k_nn=15,
        co_threshold=co_threshold,
        mutual=False,
    )
    time_computation['mige_proj'] = time.perf_counter() - start

    # ------------------------------------------------------------------
    # MIGE - Mutual (mutual=True, leiden=True variants)
    # ------------------------------------------------------------------
    start = time.perf_counter()
    mige_labels_no_proj_mutual = migeClust.mige(
        multiple_imputed_data,
        n_clusters=num_classes,
        cat_mask=cat_mask,
        seed=seed,
        p_min=1,
        p_max=1,
        leiden=True,
        num_projections=1,
        k_nn=15,
        co_threshold=co_threshold,
        mutual=True,
    )
    time_computation['mige_no_proj_mutual'] = time.perf_counter() - start

    start = time.perf_counter()
    mige_labels_proj_mutual = migeClust.mige(
        multiple_imputed_data,
        n_clusters=num_classes,
        cat_mask=cat_mask,
        seed=seed,
        p_min=0.8,
        p_max=0.95,
        leiden=True,
        num_projections=num_projections,
        k_nn=15,
        co_threshold=co_threshold,
        mutual=True,
    )
    time_computation['mige_proj_mutual'] = time.perf_counter() - start

    # ------------------------------------------------------------------
    # MICA
    # ------------------------------------------------------------------
    start = time.perf_counter()
    mica_labels = compute_MICA(
        multiple_imputed_data,
        num_clusters=num_classes,
        seed=seed,
    )
    time_computation['mica'] = time.perf_counter() - start

    # ------------------------------------------------------------------
    # Kpod
    # ------------------------------------------------------------------
    start = time.perf_counter()
    kpod_labels = clust_utils.compute_kpod(
        incomplete_data,
        num_clusters=num_classes,
        seed=seed,
    )
    time_computation['kpod'] = time.perf_counter() - start

    # ------------------------------------------------------------------
    # Mixture Missing
    # A RuntimeError from either model is tolerated: the labels become None
    # and the elapsed time (including the failed attempt) is still recorded.
    # ------------------------------------------------------------------
    start = time.perf_counter()
    try:
        mghm_labels = run_mghm(
            incomplete_data,
            G=num_classes,
            seed=seed,
        )
    except RuntimeError:
        mghm_labels = None
    time_computation['mghm'] = time.perf_counter() - start

    start = time.perf_counter()
    try:
        mcnm_labels = run_mcnm(
            incomplete_data,
            G=num_classes,
            seed=seed,
        )
    except RuntimeError:
        mcnm_labels = None
    time_computation['mcnm'] = time.perf_counter() - start

    # ------------------------------------------------------------------
    # Naive methods
    # ------------------------------------------------------------------
    start = time.perf_counter()
    sc_si_knn_labels = clust_utils.compute_spectral_si_knn(
        incomplete_data,
        seed=seed,
        num_clusters=num_classes,
        cat_mask=cat_mask,
    )
    time_computation['sc_knn'] = time.perf_counter() - start

    start = time.perf_counter()
    sc_si_mi_labels = clust_utils.compute_spectral_si_mi(
        multiple_imputed_data,
        seed=seed,
        num_clusters=num_classes,
        cat_mask=cat_mask,
    )
    time_computation['sc_mi'] = time.perf_counter() - start

    start = time.perf_counter()
    km_si_knn_labels = clust_utils.compute_kmeans_si_knn(
        incomplete_data,
        num_clusters=num_classes,
        seed=seed,
    )
    time_computation['km_knn'] = time.perf_counter() - start

    start = time.perf_counter()
    km_si_mi_labels = clust_utils.compute_kmeans_si_mi(
        multiple_imputed_data,
        num_clusters=num_classes,
        seed=seed,
    )
    # NOTE: the key is 'km_si' (not 'km_mi'); kept as-is because it must match
    # the keys of `time_computation` and `predicted_labels`.
    time_computation['km_si'] = time.perf_counter() - start

    # ------------------------------------------------------------------
    # CCA analyses (run on `complete_data`)
    # ------------------------------------------------------------------
    start = time.perf_counter()
    cca_spectral_labels = clust_utils.compute_spectral_complete(
        complete_data,
        cat_mask=cat_mask,
        num_clusters=num_classes,
        seed=seed,
    )
    time_computation['cca_spectral'] = time.perf_counter() - start

    start = time.perf_counter()
    cca_kmeans_labels = clust_utils.compute_kmeans_complete(
        complete_data,
        num_clusters=num_classes,
        seed=seed,
    )
    time_computation['cca_kmeans'] = time.perf_counter() - start

    # ------------------------------------------------------------------
    # Evaluate every method's labels
    # ------------------------------------------------------------------
    predicted_labels = {
        'mige_no_proj': mige_labels_no_proj,
        'mige_proj': mige_labels_proj,
        'mige_no_proj_mutual': mige_labels_no_proj_mutual,
        'mige_proj_mutual': mige_labels_proj_mutual,
        'mica': mica_labels,
        'kpod': kpod_labels,
        'mcnm': mcnm_labels,
        'mghm': mghm_labels,
        'sc_knn': sc_si_knn_labels,
        'sc_mi': sc_si_mi_labels,
        'km_knn': km_si_knn_labels,
        'km_si': km_si_mi_labels,
        'cca_spectral': cca_spectral_labels,
        'cca_kmeans': cca_kmeans_labels,
    }

    int_metrics = dict.fromkeys(predicted_labels.keys())
    ext_metrics = dict.fromkeys(predicted_labels.keys())

    for method, comp in predicted_labels.items():
        try:
            int_metrics[method] = internal_metrics(comp, complete_data, cat_mask)
            ext_metrics[method] = external_metrics(true_labels, comp)
        except Exception:
            # Best effort: if either metric fails for a method, both metrics
            # are recorded as NaN for it (presumably this is also how the
            # `None` labels of a failed mixture model end up as NaN).
            # `Exception` rather than a bare `except`, so KeyboardInterrupt
            # still stops the run.
            int_metrics[method] = np.nan
            ext_metrics[method] = np.nan

    simulations_results[conf] = {}
    simulations_results[conf]['internal_metrics'] = int_metrics
    simulations_results[conf]['external_metrics'] = ext_metrics
    # Snapshot the timings of this configuration; `time_computation` itself is
    # overwritten on the next iteration, so without the copy only the last
    # run's timings would survive.
    simulations_results[conf]['time_computation'] = dict(time_computation)

    # NOTE(review): purpose of this pause is not evident from the notebook;
    # it adds 3 s per configuration — confirm it is still needed.
    time.sleep(3)

  0%|          | 0/720 [00:00<?, ?it/s]

R callback write-console: Error in chol.default(x, pivot = FALSE) : 
  the leading minor of order 4 is not positive
  
R callback write-console: Error in chol.default(x, pivot = FALSE) : 
  the leading minor of order 24 is not positive
  
R callback write-console: Error in chol.default(x, pivot = FALSE) : 
  the leading minor of order 26 is not positive
  
R callback write-console: Error in chol.default(x, pivot = FALSE) : 
  the leading minor of order 4 is not positive
  
R callback write-console: Error in chol.default(x, pivot = FALSE) : 
  the leading minor of order 26 is not positive
  
R callback write-console: Error in chol.default(x, pivot = FALSE) : 
  the leading minor of order 26 is not positive
  
R callback write-console: Error in chol.default(x, pivot = FALSE) : 
  the leading minor of order 6 is not positive
  
R callback write-console: Error in chol.default(x, pivot = FALSE) : 
  the leading minor of order 26 is not positive
  
R callback write-console: Error in chol.def

KeyboardInterrupt: 

## Dump results

In [None]:
# Persist the collected results under a file name tagged with today's date
# (DD_MM_YYYY). The date is appended directly after "simulation_results" with
# no separator, and a second dump on the same day overwrites the first.
timestamp = datetime.now().strftime("%d_%m_%Y")
results_path = f"../../test_output/simulation_results{timestamp}.pkl"

write_pickle(simulations_results, results_path)