Why doesn't projection work as expected?

In [1]:
from pathlib import Path
import os,sys
import pickle
import pandas as pd
import seaborn as sns 
import numpy as np
import importlib
import yaml
from tqdm.notebook import tqdm
from itertools import product
import time
from datetime import datetime

sys.path.insert(0, str(Path().resolve().parents[1]))

from gower import gower_matrix

import fusemix.clustering as clust_utils 
import fusemix.mige as migeClust
from fusemix.mica import compute_MICA
from fusemix.mixture_missing import run_mghm, run_mcnm
from fusemix.evaluation_metrics import *

importlib.reload(migeClust)
importlib.reload(clust_utils)


import warnings
warnings.filterwarnings('ignore')

Error importing in API mode: ImportError('On Windows, cffi mode "ANY" is only "ABI".')
Trying to import in ABI mode.


In [2]:
def read_pickle(path):
    """
    Load and return the object stored in the pickle file at `path`.

    Note: unpickling can execute arbitrary code, so only use this on
    files produced by a trusted source.
    """
    with open(path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
    
def write_pickle(var, path):
    """
    Serialize `var` with pickle into the file at `path`.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(path, 'wb') as handle:
        pickle.dump(var, handle)

In [3]:
# Parse the simulation settings from the YAML file into `cfg`.
with open("../../test_data/simulation_config.yaml", "r") as config_file:
    cfg = yaml.safe_load(config_file)

In [15]:
# Full simulation grid: every dataset id crossed with every missing-data
# setting (props x mf_proportions x mnar_proportions) and every run index.
configs = list(product(
    cfg['dataset_ids'],
    cfg['md_param_grid']['props'],
    cfg['md_param_grid']['mf_proportions'],
    cfg['md_param_grid']['mnar_proportions'],
    range(cfg['n_runs'])
))

# Maps each tested configuration tuple to its evaluation results.
simulations_results = {}

# Only this slice of the grid is evaluated in this notebook.
TESTED_CONF = configs[40:45]

# Projections drawn per imputed dataset in the "projection" variant
# (loop-invariant, so it is set once before the loop).
num_projections = 3

for conf in tqdm(TESTED_CONF):

    dataset_id = conf[0]
    md_config = f"{conf[1]}_{conf[2]}_{conf[3]}"
    seed = conf[4]  # the run index doubles as the random seed

    # --- Load data for simulation ---
    test_data_complete = read_pickle(f"../../test_data/fetched/dataset_{dataset_id}.pkl")
    test_data_missing = read_pickle(
        f"../../test_data/missing_data/{dataset_id}/{md_config}/data_pipeline_{seed}.pkl"
    )
    test_data = read_pickle(
        f"../../test_data/imputed_data/{dataset_id}/{md_config}/data_imputed_{seed}.pkl"
    )

    # --- Prepare data input ---
    incomplete_data = test_data_missing.amputer.incomplete_dataset
    complete_data = test_data_complete['X_complete']
    true_labels = test_data_complete['y_complete'].values.flatten()
    cat_mask = test_data_complete['cat_mask']
    num_classes = test_data_complete['num_classes']
    multiple_imputed_data = test_data

    # --- MIGE ---
    num_imputations = len(multiple_imputed_data)

    # Both variants currently use the same co-association threshold.
    co_threshold_no_proj = 1/np.sqrt(num_imputations)
    co_threshold_proj = 1/np.sqrt(num_imputations)

    # Arguments shared by the two MIGE runs compared below.
    shared_mige_args = dict(
        n_clusters=num_classes,
        cat_mask=cat_mask,
        seed=seed,
        k_nn=20,
        mutual=True,
    )

    # Baseline: p_min = p_max = 1 with a single projection
    # (presumably every feature is kept — confirm against fusemix.mige).
    mige_labels_no_proj = migeClust.mige(
        multiple_imputed_data,
        p_min=1,
        p_max=1,
        num_projections=1,
        co_threshold=co_threshold_no_proj,
        **shared_mige_args,
    )

    # Projection variant: several projections with p_min = 0.8 and p_max = 0.9.
    mige_labels_proj = migeClust.mige(
        multiple_imputed_data,
        p_min=0.8,
        p_max=0.9,
        num_projections=num_projections,
        co_threshold=co_threshold_proj,
        **shared_mige_args,
    )

    predicted_labels = {
        'mige_no_proj': mige_labels_no_proj,
        'mige_proj': mige_labels_proj,
    }

    # --- Evaluate against the true labels ---
    ext_metrics = {}
    for method, labels in predicted_labels.items():
        try:
            ext_metrics[method] = external_metrics(true_labels, labels)
        except Exception as err:
            # Best effort: keep the run going, but report the failure instead of
            # hiding it. `except Exception` lets KeyboardInterrupt stop the loop.
            print(f"external_metrics failed for {method} on {conf}: {err!r}")
            ext_metrics[method] = np.nan

    simulations_results[conf] = {'external_metrics': ext_metrics}

    # NOTE(review): the purpose of this pause is not evident from the notebook —
    # confirm whether it is still needed.
    time.sleep(1)

  0%|          | 0/5 [00:00<?, ?it/s]

In [19]:
# Collect the 'ami' score of each variant over all tested configurations.
# NOTE: the lists keep their `ari_` prefix for compatibility, but the metric
# read here is 'ami'.
# The simulation loop stores a bare NaN float when a metric computation fails;
# indexing that with ['ami'] would raise a TypeError, so those runs are skipped.
ari_results = [
    res['external_metrics']['mige_no_proj']['ami']
    for res in simulations_results.values()
    if not isinstance(res['external_metrics']['mige_no_proj'], float)
]
ari_results_proj = [
    res['external_metrics']['mige_proj']['ami']
    for res in simulations_results.values()
    if not isinstance(res['external_metrics']['mige_proj'], float)
]

# Reported as "median + standard deviation" of the collected scores.
print(f"No projection {np.median(ari_results)} + {np.std(ari_results)}")
print(f"Projection {np.median(ari_results_proj)} + {np.std(ari_results_proj)}")

No projection 0.761799097560578 + 0.06672544689008217
Projection 0.7931047708838787 + 0.03220503695382459


In [110]:
# NOTE(review): A_sim and A_sim_mut_mask are not defined in any cell visible in
# this notebook — this presumably relies on leftover kernel state; confirm the
# definitions or remove the cell.
# Multiplies the two arrays (presumably masking the similarity matrix — TODO confirm).
A_sim * A_sim_mut_mask

array([[1.        , 0.        , 0.60125864, 0.        , 0.7222599 ],
       [0.        , 1.        , 0.5631492 , 0.72715664, 0.5451331 ],
       [0.60125864, 0.5631492 , 1.        , 0.        , 0.8364636 ],
       [0.        , 0.72715664, 0.        , 1.        , 0.        ],
       [0.7222599 , 0.5451331 , 0.8364636 , 0.        , 1.        ]],
      dtype=float32)

In [15]:
def mige_project(
    multiple_imputed_data,
    n_clusters,
    cat_mask = None,
    seed = None,
    p_min = 0.75,
    p_max = 0.85,
    num_projections = 5,
    leiden = False,
    k_nn = 10,
    co_threshold = 0.5
    ):
    """
    Generate the random feature-subspace projections of every imputed dataset.

    Only the projection step is carried out here: ``n_clusters``, ``leiden``,
    ``k_nn`` and ``co_threshold`` are accepted but not used, and no clustering
    labels are produced.

    Parameters
    ----------
    multiple_imputed_data : sequence of pandas.DataFrame
        Imputed datasets (the helper slices them with ``.iloc``). The feature
        count is read from the first one.
    cat_mask : indexable, optional
        Per-feature mask that is subset alongside the selected columns
        (required for mixed-type data with Gower distance). ``None`` is
        passed through as ``None``.
    seed : int, optional
        Seed for ``numpy.random.default_rng``.
    p_min, p_max : float
        Lower/upper bound of the projection size, as fractions of the
        feature count.
    num_projections : int
        Number of projections drawn per imputed dataset.

    Returns
    -------
    list of tuple
        ``(projected_view, cat_mask_projected)`` pairs, ``num_projections``
        consecutive entries per imputed dataset. When no projection is
        requested (``num_projections <= 0`` or ``p_min >= 1``) one pair per
        dataset is returned, holding a copy of the full dataset and the
        unchanged ``cat_mask``. An empty input yields an empty list.
    """
    if len(multiple_imputed_data) == 0:
        return []

    # random seed generator
    rng = np.random.default_rng(seed)

    n_features = multiple_imputed_data[0].shape[1]
    p_min_ = p_min*n_features
    p_max_ = p_max*n_features

    # Generate all projections for all data (datasets in order, then projections,
    # so the random stream is consumed in the same order as a nested loop).
    if num_projections > 0 and p_min < 1:
        return [
            __generate_projection(
                data=view,
                cat_mask=cat_mask,
                n_features=n_features,
                rng=rng,
                p_min_=p_min_,
                p_max_=p_max_
            )
            for view in multiple_imputed_data
            for _ in range(num_projections)
        ]

    # No projection requested: previously this path hit an unbound local.
    # Return each dataset with all of its features instead.
    return [(view.copy(), cat_mask) for view in multiple_imputed_data]

def __generate_projection(data, cat_mask, n_features, rng, p_min_, p_max_):
    """
    Draw one random column subset (a subspace) of `data`.

    The number of selected columns is drawn uniformly between `p_min_` and
    `p_max_` (both expressed as feature counts), rounded, and clamped to
    ``[1, n_features]`` so a projection is never empty or oversized.

    Returns a tuple ``(projected_view, cat_mask_projected)``; the mask is
    ``None`` when `cat_mask` is ``None``.
    """
    alfa = rng.random()
    n_selected = int(round(p_min_+alfa*(p_max_-p_min_)))
    # Keep at least one feature and never ask for more than exist
    # (sampling without replacement would raise otherwise).
    n_selected = min(n_features, max(1, n_selected))
    selected_features = rng.choice(a=range(n_features), size=n_selected, replace=False)

    if cat_mask is None:
        cat_mask_projected = None
    else:
        # Plain lists/tuples cannot be indexed with an index array.
        if isinstance(cat_mask, (list, tuple)):
            cat_mask = np.asarray(cat_mask)
        cat_mask_projected = cat_mask[selected_features]

    # Selecting columns with an index array already yields new data, so the
    # full dataset does not need to be copied first.
    projected_view = data.iloc[:,selected_features]
    return (projected_view,cat_mask_projected)

In [16]:
# Build the projections for inspection.
# NOTE: the inputs used here (multiple_imputed_data, num_classes, cat_mask, seed,
# num_projections, co_threshold_proj) are whatever the simulation loop last
# assigned. Despite its name, `mige_labels_proj` receives the list returned by
# mige_project, not cluster labels, and replaces the value set inside the loop.
mige_labels_proj = mige_project(
    multiple_imputed_data,
    n_clusters=num_classes,
    cat_mask=cat_mask,
    seed=seed,
    p_min=0.75,
    p_max=0.95,
    num_projections=num_projections,
    k_nn=10,
    co_threshold=co_threshold_proj,
)

In [30]:
# Number of entries returned by mige_project: one per (imputed dataset, projection) pair.
len(mige_labels_proj)

50