In [1]:
# Standard library first. `os` is needed immediately so the thread caps below
# can be exported before any numerical library is imported.
import os
import sys

# Cap OpenMP / MKL thread pools at one thread. These variables are set BEFORE
# numpy, pandas and seaborn are imported: threading runtimes typically read
# them only once, when they are first loaded, so exporting them after the
# imports (as this cell previously did) may have no effect.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"

import importlib
import pickle
import time
from datetime import datetime
from itertools import product
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from tqdm.notebook import tqdm

# Put the directory two levels above the current working directory at the
# front of the import path (presumably the repository root that contains the
# `fusemix` package imported further down -- confirm against the repo layout).
sys.path.insert(0, str(Path().resolve().parents[1]))

from gower import gower_matrix

import fusemix.clustering as clust_utils 
import fusemix.mige as migeClust
from fusemix.mica import compute_MICA
from fusemix.mixture_missing import run_mghm, run_mcnm
from fusemix.evaluation_metrics import *

# Re-execute the two project modules that are referenced through a module
# alias, so that edits to their source are picked up without restarting the
# kernel. Only these two modules are reloaded; names bound earlier with
# `from fusemix.<module> import ...` keep pointing at the objects that were
# imported at that time.
importlib.reload(migeClust)
importlib.reload(clust_utils)


import warnings
# Install a blanket "ignore" filter: every warning category is silenced from
# this point on. NOTE(review): this also hides deprecation and numerical
# warnings raised during the simulations -- confirm that this is intended.
warnings.filterwarnings('ignore')

Error importing in API mode: ImportError('On Windows, cffi mode "ANY" is only "ABI".')
Trying to import in ABI mode.


In [2]:
def read_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and closed again before returning.
    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(path, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload
    
def write_pickle(var, path):
    """Pickle ``var`` into the file at ``path``.

    The file is opened in binary write mode, so an existing file at ``path``
    is overwritten. Nothing is returned.
    """
    with open(path, mode='wb') as handle:
        pickle.dump(var, handle)

In [3]:
# Load the simulation configuration into `cfg`.
# The encoding is given explicitly so the file is decoded the same way on every
# platform; without it, `open` falls back to the locale's default encoding.
with open("../../test_data/simulation_config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

In [4]:
# Full factorial grid of simulation scenarios. Each entry of `configs` is a
# tuple drawn, in this order, from: dataset ids, `props`, `mf_proportions`,
# `mnar_proportions` (all read from the configuration) and the run index
# 0 .. n_runs - 1.
grid_axes = (
    cfg['dataset_ids'],
    cfg['md_param_grid']['props'],
    cfg['md_param_grid']['mf_proportions'],
    cfg['md_param_grid']['mnar_proportions'],
    range(cfg['n_runs']),
)
configs = list(product(*grid_axes))

# Dataset ids used to filter `configs` before the sweep is run.
TEST_DATASETS = [15, 17, 33, 45]

In [5]:
# Sensitivity sweep: for every retained simulation configuration, run the MIGE
# clustering over a grid of (k_nn, co_threshold) values and score the labels.
# Results are keyed by the configuration tuple extended with (knn, co_thresh).
simulations_results = {}

# Keep only the configurations whose dataset id is listed in TEST_DATASETS.
filtered_configs = [cf for cf in configs if cf[0] in TEST_DATASETS]

# Hyperparameter grid explored for each configuration. It does not depend on
# the configuration, so it is defined once instead of on every iteration.
# (The variable name keeps its original spelling in case other cells use it.)
consensuns_thresholds = [0.2, 0.3, 0.5, 0.7]
knn_numbers = [5, 10, 15, 20]

# MIGE settings read from the configuration file (also loop-invariant).
num_projections = cfg['mige_param']['num_projections']
p_min = cfg['mige_param']['p_min']
p_max = cfg['mige_param']['p_max']

for conf in tqdm(filtered_configs):

    # `conf` is (dataset id, prop, mf proportion, mnar proportion, run index);
    # the run index doubles as the seed.
    dataset_id, md_prop, mf_prop, mnar_prop, seed = conf
    md_config = f"{md_prop}_{mf_prop}_{mnar_prop}"

    # --- Load data for simulation ---
    test_data_complete = read_pickle(f"../../test_data/fetched/dataset_{dataset_id}.pkl")
    test_data_missing = read_pickle(
        f"../../test_data/missing_data/{dataset_id}/{md_config}/data_pipeline_{seed}.pkl"
    )
    test_data = read_pickle(
        f"../../test_data/imputed_data/{dataset_id}/{md_config}/data_imputed_{seed}.pkl"
    )

    # --- Prepare data input ---
    # `incomplete_data` and `num_imputations` are not used below; they are
    # still assigned because later cells may read them from the namespace.
    incomplete_data = test_data_missing.amputer.incomplete_dataset
    complete_data = test_data_complete['X_complete']
    true_labels = test_data_complete['y_complete'].values.flatten()
    cat_mask = test_data_complete['cat_mask']
    num_classes = test_data_complete['num_classes']
    multiple_imputed_data = test_data
    num_imputations = len(multiple_imputed_data)

    for knn, co_thresh in product(knn_numbers, consensuns_thresholds):

        result_key = conf + (knn, co_thresh)

        # Compute cluster labels for this hyperparameter pair.
        mige_labels = migeClust.mige(
            multiple_imputed_data,
            n_clusters=num_classes,
            cat_mask=cat_mask,
            seed=seed,
            p_min=p_min,
            p_max=p_max,
            num_projections=num_projections,
            k_nn=knn,
            co_threshold=co_thresh,
        )

        # Evaluate performance metrics. Scoring stays best-effort (a failure
        # records NaN for both metric sets), but only ordinary exceptions are
        # caught so that interrupting the kernel still stops the sweep, and the
        # reason for the failure is reported instead of silently discarded.
        try:
            int_metrics = internal_metrics(mige_labels, complete_data, cat_mask)
            ext_metrics = external_metrics(true_labels, mige_labels)
        except Exception as err:
            print(f"Metric evaluation failed for {result_key}: {err!r}")
            int_metrics = np.nan
            ext_metrics = np.nan

        simulations_results[result_key] = {
            'internal_metrics': int_metrics,
            'external_metrics': ext_metrics,
        }

    # One-second pause after each configuration, kept from the original cell.
    # NOTE(review): the purpose is not visible here (possibly to let the
    # progress widget refresh) -- confirm whether it is still needed.
    time.sleep(1)


  0%|          | 0/480 [00:00<?, ?it/s]

In [6]:
# Persist the collected results. The file name ends with the current date
# formatted as day_month_year, appended directly after "sensitivity_results".
timestamp = datetime.now().strftime("%d_%m_%Y")
results_path = f"../../test_output/sensitivity_results{timestamp}.pkl"
write_pickle(simulations_results, results_path)