In [1]:
from pathlib import Path
import os,sys
import pickle
import pandas as pd

from tqdm.notebook import tqdm
import importlib

sys.path.insert(0, str(Path().resolve().parents[1]))
import fusemix.pipeline as pipeline 


In [2]:
def load_dataset(id):
    with open("../../test_data/fetched/dataset_"+str(id)+".pkl", "rb") as input:
        file = pickle.load(input)
    return file

In [3]:
ids = pd.read_csv("../../test_output/datasets_analysis.csv")['Id']

loaded_data = {}
for id in ids: loaded_data[id] = load_dataset(id = id)

In [4]:
n_runs = 10

md_param_grid = {
    'props': [0.75,1.],
    'mf_proportions': [0.5,0.75],
    'mnar_proportions': [0.,0.25,0.5]
}


In [5]:
for id in ids:
    # create directory for dataset id in test_data/missing_data
    directory = "../../test_data/missing_data/"+str(id)
    if not os.path.exists(directory):
        os.makedirs(directory)
    # create directory for dataset id in test_data/missing_data/parameters
    for prop in md_param_grid['props']:
        for mf_proportion in md_param_grid['mf_proportions']:
            for mnar_proportion in md_param_grid['mnar_proportions']:
                directory = "../../test_data/missing_data/"+str(id)+"/"+str(prop)+"_"+str(mf_proportion)+"_"+str(mnar_proportion)
                if not os.path.exists(directory):
                    os.makedirs(directory)

### Generate amputations 

In [6]:
for id in tqdm(ids):

    data_ = loaded_data[id]
    print(data_['X_complete'].shape)
    print(id)
    for prop in tqdm(md_param_grid['props']):
        for mf_proportion in tqdm(md_param_grid['mf_proportions']):
            for mnar_proportion in tqdm(md_param_grid['mnar_proportions']):
                
                directory = "../../test_data/missing_data/"+str(id)+"/"+str(prop)+"_"+str(mf_proportion)+"_"+str(mnar_proportion)
                
                for seed in tqdm(range(n_runs)):
                    print(seed)
                    cfg = pipeline.DataPipelineConfig(seed=seed,
                                                      prop=prop,
                                                      mnar_freq=mnar_proportion,
                                                      mf_proportion=mf_proportion,
                                                      complete_data=data_['X_complete'],
                                                      num_classes=data_['num_classes'],
                                                      verbose=False,
                                                      target=data_['y_complete'])
                    pipeline_ = pipeline.PipelineDataGeneration(cfg)
                    pipeline_.run()
                    
                    directory = "../../test_data/missing_data/"+str(id)+"/"+str(prop)+"_"+str(mf_proportion)+"_"+str(mnar_proportion)
                    with open(directory+"/data_pipeline_"+str(seed)+".pkl", 'wb') as f:  
                        pickle.dump(pipeline_, f)


  0%|          | 0/6 [00:00<?, ?it/s]

(195, 20)
174


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]



0
1
2
3
4




5
6
7
8
9


  0%|          | 0/10 [00:00<?, ?it/s]



0




1
2
3
4
5




6




7
8
9


  0%|          | 0/10 [00:00<?, ?it/s]



0
1




2
3




4
5
6




7
8




9


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]



0




1
2




3




4




5
6




7
8




9




  0%|          | 0/10 [00:00<?, ?it/s]



0




1




2
3




4
5




6




7




8
9




  0%|          | 0/10 [00:00<?, ?it/s]



0
1




2




3




4
5
6




7




8




9




  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]



0
1




2




3




4




5
6




7




8




9


  0%|          | 0/10 [00:00<?, ?it/s]



0
1




2




3




4




5
6
7




8




9


  0%|          | 0/10 [00:00<?, ?it/s]



0
1
2
3




4
5




6
7
8
9


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]



0
1




2
3




4
5
6




7
8




9


  0%|          | 0/10 [00:00<?, ?it/s]



0
1




2
3




4




5
6
7




8
9




  0%|          | 0/10 [00:00<?, ?it/s]



0
1
2




3
4
5




6
7
8




9
(569, 30)
17


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]



0




1




2




3




4




5




6




7




8




9




  0%|          | 0/10 [00:00<?, ?it/s]



0




1




2
3




4




5




6
7




8




9


  0%|          | 0/10 [00:00<?, ?it/s]



0




1




2




3
4
5




6
7




8
9


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]



0
1




2




3
4




5




6
7




8




9


  0%|          | 0/10 [00:00<?, ?it/s]



0




1
2




3




4




5




6




7
8




9




  0%|          | 0/10 [00:00<?, ?it/s]



0
1
2




3
4
5
6




7
8
9


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]



0




1
2




3




4
5




6




7
8




9




  0%|          | 0/10 [00:00<?, ?it/s]



0
1




2




3
4




5




6
7




8




9


  0%|          | 0/10 [00:00<?, ?it/s]



0




1




2




3
4




5




6




7
8
9




  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]



0
1




2




3
4




5




6
7




8




9


  0%|          | 0/10 [00:00<?, ?it/s]



0




1




2




3




4




5




6




7




8




9


  0%|          | 0/10 [00:00<?, ?it/s]



0




1




2




3




4




5




6




7




8




9
(358, 34)
33


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]



0


  data_group = stats.zscore(data_group)


1




2




3




4




5




6




7




8




9




  0%|          | 0/10 [00:00<?, ?it/s]



0


  data_group = stats.zscore(data_group)


1




2




3
4




5




6
7




8




9


  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0




1




2




3
4




5




6




7
8




9




  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]



0




1




2




3




4




5




6




7




8


  data_group = stats.zscore(data_group)


9




  0%|          | 0/10 [00:00<?, ?it/s]



0




1




2




3




4




5




6




7




8


  data_group = stats.zscore(data_group)


9




  0%|          | 0/10 [00:00<?, ?it/s]



0




1




2




3




4




5




6
7




8


  data_group = stats.zscore(data_group)


9


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0




1




2




3




4




5




6




7




8




9




  0%|          | 0/10 [00:00<?, ?it/s]



0


  data_group = stats.zscore(data_group)


1




2




3
4




5




6




7
8




9




  0%|          | 0/10 [00:00<?, ?it/s]



0


  data_group = stats.zscore(data_group)


1
2




3




4




5
6




7




8




9


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

0




1




2




3




4




5




6




7




8


  data_group = stats.zscore(data_group)


9




  0%|          | 0/10 [00:00<?, ?it/s]



0




1




2




3




4




5




6




7




8


  data_group = stats.zscore(data_group)


9




  0%|          | 0/10 [00:00<?, ?it/s]



0




1




2




3
4




5




6
7
8




9


  data_group = stats.zscore(data_group)


(297, 13)
45


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0
1
2


  data_group = stats.zscore(data_group)


3
4


  data_group = stats.zscore(data_group)


5
6
7
8


  data_group = stats.zscore(data_group)


9


  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


0
1
2
3


  data_group = stats.zscore(data_group)


4
5




6
7
8
9


  data_group = stats.zscore(data_group)


  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0
1


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


2
3
4




5
6
7


  data_group = stats.zscore(data_group)


8
9


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]



0
1
2


  data_group = stats.zscore(data_group)


3
4




5
6
7


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


8
9


  0%|          | 0/10 [00:00<?, ?it/s]



0




1
2


  data_group = stats.zscore(data_group)


3
4




5
6


  data_group = stats.zscore(data_group)


7
8


  data_group = stats.zscore(data_group)


9


  0%|          | 0/10 [00:00<?, ?it/s]



0
1


  data_group = stats.zscore(data_group)


2
3
4




5




6


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


7
8
9


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0
1


  data_group = stats.zscore(data_group)


2
3
4


  data_group = stats.zscore(data_group)


5
6


  data_group = stats.zscore(data_group)


7
8
9




  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0
1


  data_group = stats.zscore(data_group)


2
3
4


  data_group = stats.zscore(data_group)


5
6




7
8
9


  data_group = stats.zscore(data_group)


  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0
1


  data_group = stats.zscore(data_group)


2
3
4


  data_group = stats.zscore(data_group)


5
6


  data_group = stats.zscore(data_group)


7
8
9


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]



0
1


  data_group = stats.zscore(data_group)


2
3
4




5
6


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


7
8
9


  0%|          | 0/10 [00:00<?, ?it/s]



0
1


  data_group = stats.zscore(data_group)


2
3
4




5
6


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


7
8
9




  0%|          | 0/10 [00:00<?, ?it/s]



0
1


  data_group = stats.zscore(data_group)


2
3
4




5
6


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


7
8
9
(683, 9)
15


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0
1


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


2
3
4




5


  data_group = stats.zscore(data_group)


6
7


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


8
9


  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


1
2




3
4


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


5
6




7


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


8
9


  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0
1


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


2




3


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


4
5


  data_group = stats.zscore(data_group)


6
7
8


  data_group = stats.zscore(data_group)


9


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0
1


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


2
3


  data_group = stats.zscore(data_group)


4




5
6




7
8


  data_group = stats.zscore(data_group)


9


  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


0
1


  data_group = stats.zscore(data_group)


2
3


  data_group = stats.zscore(data_group)


4




5
6




7
8


  data_group = stats.zscore(data_group)


9


  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


0
1
2


  data_group = stats.zscore(data_group)


3


  data_group = stats.zscore(data_group)


4
5




6
7
8


  data_group = stats.zscore(data_group)


9


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


1
2
3


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


4




5


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


6
7
8
9


  0%|          | 0/10 [00:00<?, ?it/s]



0


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


1
2
3
4


  data_group = stats.zscore(data_group)


5
6


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


7
8
9


  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


0
1


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


2
3
4
5


  data_group = stats.zscore(data_group)


6
7


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


8
9


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]



0


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


1
2


  data_group = stats.zscore(data_group)


3


  data_group = stats.zscore(data_group)


4




5
6
7




8


  data_group = stats.zscore(data_group)


9


  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


0
1


  data_group = stats.zscore(data_group)


2




3


  data_group = stats.zscore(data_group)


4
5




6




7
8


  data_group = stats.zscore(data_group)


9


  0%|          | 0/10 [00:00<?, ?it/s]



0
1


  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)
  data_group = stats.zscore(data_group)


2
3




4


  data_group = stats.zscore(data_group)


5




6
7




8
9


  data_group = stats.zscore(data_group)


(2111, 16)
544


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0




1




2




3




4
5




6


  data_group = stats.zscore(data_group)


7
8




9




  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0
1




2
3




4
5
6


  data_group = stats.zscore(data_group)


7
8
9




  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0
1
2




3
4


  data_group = stats.zscore(data_group)


5
6
7




8
9


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0


  data_group = stats.zscore(data_group)


1




2




3




4




5




6


  data_group = stats.zscore(data_group)


7




8


  data_group = stats.zscore(data_group)


9




  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0


  data_group = stats.zscore(data_group)


1




2




3




4




5




6


  data_group = stats.zscore(data_group)


7




8


  data_group = stats.zscore(data_group)


9




  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0


  data_group = stats.zscore(data_group)


1




2




3




4




5




6


  data_group = stats.zscore(data_group)


7
8


  data_group = stats.zscore(data_group)


9




  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0




1




2




3




4




5




6


  data_group = stats.zscore(data_group)


7




8




9




  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0




1




2




3




4




5




6


  data_group = stats.zscore(data_group)


7




8




9




  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0




1




2




3




4




5




6


  data_group = stats.zscore(data_group)


7




8




9




  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0


  data_group = stats.zscore(data_group)


1




2




3




4




5




6


  data_group = stats.zscore(data_group)


7




8


  data_group = stats.zscore(data_group)


9




  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0


  data_group = stats.zscore(data_group)


1




2




3




4




5




6


  data_group = stats.zscore(data_group)


7




8


  data_group = stats.zscore(data_group)


9




  0%|          | 0/10 [00:00<?, ?it/s]

  data_group = stats.zscore(data_group)


0


  data_group = stats.zscore(data_group)


1




2
3




4
5


  data_group = stats.zscore(data_group)


6
7


  data_group = stats.zscore(data_group)


8
9


