# Create missing mask for train, val, test tabular data

In [1]:
'''
* Licensed under the Apache License, Version 2.
* By Siyi Du, 2024
'''
import numpy as np
import pandas as pd
from os.path import join, dirname
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

In [None]:
def create_missing_mask(data_tabular_path, mask_path, random_seed, missing_strategy, missing_rate):
    '''
    Create a random boolean missing mask for a tabular CSV file and save it with np.save.

    data_tabular_path: path of a header-less CSV file; only its shape (M rows, N columns) is used
    mask_path: file path handed to np.save for the generated mask
    random_seed: seed passed to np.random.seed before sampling (this reseeds NumPy's global RNG)
    missing_strategy: value (random value missingness) or feature (random feature missingness)
    missing_rate: 0.0-1.0
    return: bool array of shape (M, N); True marks a missing entry
    raise: ValueError for an unsupported missing_strategy or a missing_rate outside [0, 1]
    '''
    # Validate the arguments first so a typo fails before any file is read or written.
    if missing_strategy not in ('value', 'feature'):
        raise ValueError('Only support value and feature missing strategy')
    if not 0.0 <= missing_rate <= 1.0:
        raise ValueError(f'missing_rate must be in [0, 1], got {missing_rate}')

    data_tabular = np.array(pd.read_csv(data_tabular_path, header=None))
    print(f'data tabular shape: {data_tabular.shape}')
    np.random.seed(random_seed)
    M, N = data_tabular.shape[0], data_tabular.shape[1]
    if missing_strategy == 'value':
        # Mask individual cells: sample flat positions without replacement, then reshape.
        missing_mask_data = np.zeros((M*N), dtype=bool)
        # int() truncates, so the realised rate can be slightly below missing_rate.
        mask_pos = np.random.choice(M*N, size=int(M*N*missing_rate), replace=False)
        missing_mask_data[mask_pos] = True
        missing_mask_data = missing_mask_data.reshape((M,N))
    else:
        # 'feature': mask whole columns, i.e. the same features are missing in every row.
        missing_mask_data = np.zeros((M,N), dtype=bool)
        mask_pos = np.random.choice(N, size=int(N*missing_rate), replace=False)
        missing_mask_data[:,mask_pos] = True
    np.save(mask_path, missing_mask_data)
    print(f'Real missing rate: {missing_mask_data.sum()/missing_mask_data.size}')
    print(f'Save missing mask to {mask_path}')
    return missing_mask_data

def create_certain_missing_mask(data_tabular_path, mask_path, mask_pos_order, missing_strategy, missing_rate):
    '''
    Create mask according to a mask order list (for MI and LI feature missingness)

    data_tabular_path: path of a header-less CSV file; only its shape (M rows, N columns) is used
    mask_path: file path handed to np.save for the generated mask
    mask_pos_order: column indices, one per column; the first int(N*missing_rate) of them are masked
    missing_strategy: accepted but not used by this function
    missing_rate: 0.0-1.0
    return: bool array of shape (M, N); True marks a missing entry (whole columns are masked)
    raise: ValueError if missing_rate is outside [0, 1] or len(mask_pos_order) != N
    '''
    # A negative rate would turn the slice below into "all but the last k" and
    # silently produce a wrong mask, so reject out-of-range rates explicitly.
    if not 0.0 <= missing_rate <= 1.0:
        raise ValueError(f'missing_rate must be in [0, 1], got {missing_rate}')
    data_tabular = np.array(pd.read_csv(data_tabular_path, header=None))
    print(f'data tabular shape: {data_tabular.shape}')
    M, N = data_tabular.shape[0], data_tabular.shape[1]
    # Explicit check instead of assert, which is stripped when Python runs with -O.
    if N != len(mask_pos_order):
        raise ValueError(f'mask_pos_order has {len(mask_pos_order)} entries but the data has {N} columns')
    # int() truncates, so the realised rate can be slightly below missing_rate.
    mask_pos = mask_pos_order[:int(N*missing_rate)]
    missing_mask_data = np.zeros((M,N), dtype=bool)
    missing_mask_data[:,mask_pos] = True
    np.save(mask_path, missing_mask_data)
    print(f'Real missing rate: {missing_mask_data.sum()/missing_mask_data.size}')
    print(f'Save missing mask to {mask_path}')
    return missing_mask_data

## Cardiac

In [2]:
# TODO: change to your own path
# Directory holding the cardiac feature CSV files and label tensors read by the cells below.
FEATURES = '/bigdata/siyi/data/UKBB/cardiac_segmentations/projects/SelfSuperBio/18545/final'
# Missing masks are saved to and loaded from the 'missing_mask' sub-directory of FEATURES.
MASK_PATH = join(FEATURES, 'missing_mask')

#### Random mask

In [4]:
# Random value-level masks for the cardiac data; each split is sampled with its own seed.
missing_strategy = 'value'
missing_rate = 0.0

# The split files are the same for both targets; the target only appears in the mask file name.
train_name = 'cardiac_features_train_imputed_noOH_tabular_imaging_reordered.csv'
val_name = 'cardiac_features_val_imputed_noOH_tabular_imaging_reordered.csv'
test_name = 'cardiac_features_test_imputed_noOH_tabular_imaging_reordered.csv'
split_files = [(train_name, 2021), (val_name, 2022), (test_name, 2023)]

for target in ('CAD', 'Infarction'):
    mask_suffix = f'{target}_{missing_strategy}_{missing_rate}.npy'
    for csv_name, seed in split_files:
        create_missing_mask(
            join(FEATURES, csv_name),
            join(MASK_PATH, f'{csv_name[:-4]}_{mask_suffix}'),
            seed, missing_strategy, missing_rate,
        )

    # The balanced training file is target specific and reuses the train seed (2021).
    balanced_train_name = f'cardiac_features_train_imputed_noOH_tabular_imaging_{target}_balanced_reordered.csv'
    create_missing_mask(
        join(FEATURES, balanced_train_name),
        join(MASK_PATH, f'{balanced_train_name[:-4]}_{mask_suffix}'),
        2021, missing_strategy, missing_rate,
    )


data tabular shape: (26040, 75)
Real missing rate: 0.0
Save missing mask to /bigdata/siyi/data/UKBB/cardiac_segmentations/projects/SelfSuperBio/18545/final/missing_mask/cardiac_features_train_imputed_noOH_tabular_imaging_reordered_CAD_value_0.0.npy
data tabular shape: (6510, 75)
Real missing rate: 0.0
Save missing mask to /bigdata/siyi/data/UKBB/cardiac_segmentations/projects/SelfSuperBio/18545/final/missing_mask/cardiac_features_val_imputed_noOH_tabular_imaging_reordered_CAD_value_0.0.npy
data tabular shape: (3617, 75)
Real missing rate: 0.0
Save missing mask to /bigdata/siyi/data/UKBB/cardiac_segmentations/projects/SelfSuperBio/18545/final/missing_mask/cardiac_features_test_imputed_noOH_tabular_imaging_reordered_CAD_value_0.0.npy
data tabular shape: (3482, 75)
Real missing rate: 0.0
Save missing mask to /bigdata/siyi/data/UKBB/cardiac_segmentations/projects/SelfSuperBio/18545/final/missing_mask/cardiac_features_train_imputed_noOH_tabular_imaging_CAD_balanced_reordered_CAD_value_0.0.n

In [5]:
# Random feature-level masks for the cardiac data. missing_rate is not set in this
# cell, so the value left behind by a previously executed cell is used.
missing_strategy = 'feature'

train_name = 'cardiac_features_train_imputed_noOH_tabular_imaging_reordered.csv'
val_name = 'cardiac_features_val_imputed_noOH_tabular_imaging_reordered.csv'
test_name = 'cardiac_features_test_imputed_noOH_tabular_imaging_reordered.csv'
# One seed for every file, unlike the value strategy which uses a seed per split.
feature_seed = 2022

for target in ('CAD', 'Infarction'):
    balanced_train_name = f'cardiac_features_train_imputed_noOH_tabular_imaging_{target}_balanced_reordered.csv'
    mask_suffix = f'{target}_{missing_strategy}_{missing_rate}.npy'
    # Same order as before: train, val, test, then the target-specific balanced train file.
    for csv_name in (train_name, val_name, test_name, balanced_train_name):
        create_missing_mask(
            join(FEATURES, csv_name),
            join(MASK_PATH, f'{csv_name[:-4]}_{mask_suffix}'),
            feature_seed, missing_strategy, missing_rate,
        )


data tabular shape: (26040, 75)
Real missing rate: 0.0
Save missing mask to /bigdata/siyi/data/UKBB/cardiac_segmentations/projects/SelfSuperBio/18545/final/missing_mask/cardiac_features_train_imputed_noOH_tabular_imaging_reordered_CAD_feature_0.0.npy
data tabular shape: (6510, 75)
Real missing rate: 0.0
Save missing mask to /bigdata/siyi/data/UKBB/cardiac_segmentations/projects/SelfSuperBio/18545/final/missing_mask/cardiac_features_val_imputed_noOH_tabular_imaging_reordered_CAD_feature_0.0.npy
data tabular shape: (3617, 75)
Real missing rate: 0.0
Save missing mask to /bigdata/siyi/data/UKBB/cardiac_segmentations/projects/SelfSuperBio/18545/final/missing_mask/cardiac_features_test_imputed_noOH_tabular_imaging_reordered_CAD_feature_0.0.npy
data tabular shape: (3482, 75)
Real missing rate: 0.0
Save missing mask to /bigdata/siyi/data/UKBB/cardiac_segmentations/projects/SelfSuperBio/18545/final/missing_mask/cardiac_features_train_imputed_noOH_tabular_imaging_CAD_balanced_reordered_CAD_featu

In [6]:
# Check: load the saved CAD feature masks (rate 0.3) of the three splits and
# print the first row of each for a visual comparison.
check_suffix = '_CAD_feature_0.3.npy'
train_np = np.load(join(MASK_PATH, train_name[:-4] + check_suffix))
val_np = np.load(join(MASK_PATH, val_name[:-4] + check_suffix))
test_np = np.load(join(MASK_PATH, test_name[:-4] + check_suffix))
for split_mask in (train_np, val_np, test_np):
    print(split_mask[0])

[False  True False  True  True False  True  True False  True False False
 False False False False False False False False False False  True False
 False  True False False False False False False False False  True  True
  True False False False False False  True False False False  True  True
 False False False False False False False False  True False False False
 False False False False False  True False False  True False  True  True
  True  True  True]
[False  True False  True  True False  True  True False  True False False
 False False False False False False False False False False  True False
 False  True False False False False False False False False  True  True
  True False False False False False  True False False False  True  True
 False False False False False False False False  True False False False
 False False False False False  True False False  True False  True  True
  True  True  True]
[False  True False  True  True False  True  True False  True False False
 False Fals

#### Mask based on importance

In [15]:
target = 'CAD'
rf = RandomForestClassifier(random_state=2022)

# The forest is fitted on the balanced training set. For the imbalanced variant use
# 'cardiac_features_train_imputed_noOH_tabular_imaging_reordered.csv' together with
# f'cardiac_labels_{target}_train.pt' instead.
train_csv = join(FEATURES, f'cardiac_features_train_imputed_noOH_tabular_imaging_{target}_balanced_reordered.csv')
test_csv = join(FEATURES, 'cardiac_features_test_imputed_noOH_tabular_imaging_reordered.csv')
train_labels = join(FEATURES, f'cardiac_labels_{target}_train_balanced.pt')
test_labels = join(FEATURES, f'cardiac_labels_{target}_test.pt')

X_train = pd.read_csv(train_csv, header=None)
X_test = pd.read_csv(test_csv, header=None)
y_train = torch.load(train_labels)
y_test = torch.load(test_labels)
rf.fit(X_train, y_train)

# AUC is computed from column 1 of the predicted class probabilities on the test set.
y_pred_proba = rf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)

print(f"AUC on test dataset: {auc}")


AUC on test dataset: 0.8547195217018755


In [16]:
# Only the column names of this table are used, hence the small nrows.
data_df = pd.read_csv(join(dirname(FEATURES), 'cardiac_feature_18545_vector_labeled_noOH_dropNI_imputed.csv'), nrows=5)
field_lengths_tabular = torch.load(join(dirname(FEATURES), 'tabular_lengths.pt'))

# Fields of length 1 go to the continuous list, all others to the categorical list.
categorical_ids, continuous_ids = [], []
for idx, length in enumerate(field_lengths_tabular):
    (continuous_ids if length == 1 else categorical_ids).append(idx)

# Skip the first column, then order the names categorical-first, continuous-last.
column_name = data_df.columns[1:][categorical_ids + continuous_ids]
# print(column_name)

# Rank the features by the fitted forest's importances.
importances = rf.feature_importances_
LI_indices = np.argsort(importances)   # ascending: least important first
MI_indices = LI_indices[::-1]          # descending: most important first
MI_feature_name = column_name[MI_indices]
print(MI_feature_name)

Index(['Cholesterol lowering medication regularly taken',
       'Blood pressure medication regularly taken',
       'Angina diagnosed by doctor', 'LVM (g)', 'Waist circumference-2.0',
       'Diastolic blood pressure-2.mean', 'Body mass index (BMI)-2.0',
       'Pulse rate-2.mean', 'Systolic blood pressure-2.mean', 'LVEF (%)',
       'Body fat percentage-2.0', 'RVSV (mL)',
       'Long-standing illness, disability or infirmity-2.0', 'Weight-2.0',
       'Cardiac index-2.0', 'P duration-2.0', 'LVESV (mL)',
       'Pulse wave Arterial Stiffness index-2.0', 'LVEDV (mL)',
       'Basal metabolic rate-2.0', 'QRS duration-2.0', 'RVEDV (mL)',
       'RVEF (%)', 'Central augmentation pressure during PWA-2.0',
       'Augmentation index for PWA-2.0', 'RVESV (mL)',
       'Heart rate during PWA-2.0', 'LVSV (mL)', 'LVCO (L/min)',
       'Whole body fat mass-2.0', 'End systolic pressure index during PWA-2.0',
       'Cardiac output during PWA-2.0', 'Cardiac output-2.0',
       'Average heart rate

In [21]:
missing_rate = 0.9

train_name = 'cardiac_features_train_imputed_noOH_tabular_imaging_reordered.csv'
val_name = 'cardiac_features_val_imputed_noOH_tabular_imaging_reordered.csv'
test_name = 'cardiac_features_test_imputed_noOH_tabular_imaging_reordered.csv'
balanced_train_name = f'cardiac_features_train_imputed_noOH_tabular_imaging_{target}_balanced_reordered.csv'

# MI masks columns in order of decreasing importance, LI in order of increasing importance.
for missing_strategy, ordered_indices in (('MI', MI_indices), ('LI', LI_indices)):
    # Per strategy: train, val, test, then the balanced training file.
    for csv_name in (train_name, val_name, test_name, balanced_train_name):
        mask_file = f'{csv_name[:-4]}_{target}_{missing_strategy}_{missing_rate}.npy'
        # Binding the return value keeps the notebook from echoing the array.
        result = create_certain_missing_mask(
            join(FEATURES, csv_name),
            join(MASK_PATH, mask_file),
            ordered_indices, missing_strategy, missing_rate,
        )

data tabular shape: (26040, 75)
Real missing rate: 0.8933333333333333
Save missing mask to /bigdata/siyi/data/UKBB/cardiac_segmentations/projects/SelfSuperBio/18545/final/missing_mask/cardiac_features_train_imputed_noOH_tabular_imaging_reordered_CAD_MI_0.9.npy
data tabular shape: (6510, 75)
Real missing rate: 0.8933333333333333
Save missing mask to /bigdata/siyi/data/UKBB/cardiac_segmentations/projects/SelfSuperBio/18545/final/missing_mask/cardiac_features_val_imputed_noOH_tabular_imaging_reordered_CAD_MI_0.9.npy
data tabular shape: (3617, 75)
Real missing rate: 0.8933333333333333
Save missing mask to /bigdata/siyi/data/UKBB/cardiac_segmentations/projects/SelfSuperBio/18545/final/missing_mask/cardiac_features_test_imputed_noOH_tabular_imaging_reordered_CAD_MI_0.9.npy
data tabular shape: (3482, 75)
Real missing rate: 0.8933333333333333
Save missing mask to /bigdata/siyi/data/UKBB/cardiac_segmentations/projects/SelfSuperBio/18545/final/missing_mask/cardiac_features_train_imputed_noOH_tab

In [3]:
# Check train, val, test to miss the same columns
train_np = np.load(join(MASK_PATH, 'cardiac_features_train_imputed_noOH_tabular_imaging_CAD_balanced_reordered_CAD_MI_0.9.npy'))
val_np = np.load(join(MASK_PATH, 'cardiac_features_val_imputed_noOH_tabular_imaging_reordered_CAD_MI_0.9.npy'))
test_np = np.load(join(MASK_PATH, 'cardiac_features_test_imputed_noOH_tabular_imaging_reordered_CAD_MI_0.9.npy'))
print(np.where(train_np[0]))
print(np.where(val_np[0]))
print(np.where(test_np[0]))

(array([ 0,  1,  3,  5,  6,  7, 11, 13, 14, 15, 17, 18, 19, 20, 21, 22, 24,
       25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
       42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
       59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74]),)
(array([ 0,  1,  3,  5,  6,  7, 11, 13, 14, 15, 17, 18, 19, 20, 21, 22, 24,
       25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
       42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
       59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74]),)
(array([ 0,  1,  3,  5,  6,  7, 11, 13, 14, 15, 17, 18, 19, 20, 21, 22, 24,
       25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
       42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
       59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74]),)


## DVM

In [4]:
# TODO: change to your own path
# Directory holding the DVM feature CSV files and label tensors read by the cells below.
# Note: this rebinds FEATURES and MASK_PATH, replacing the cardiac paths set earlier.
FEATURES = '/bigdata/siyi/data/DVM/features'
# Missing masks are saved to and loaded from the 'missing_mask' sub-directory of FEATURES.
MASK_PATH = join(FEATURES, 'missing_mask')

#### Random mask

In [5]:
# Random value-level masks for the DVM data; each split is sampled with its own seed.
missing_strategy = 'value'
missing_rate = 0.0
target = 'dvm'

train_name = 'dvm_features_train_noOH_all_views_physical_jittered_50_reordered.csv'
val_name = 'dvm_features_val_noOH_all_views_physical_jittered_50_reordered.csv'
test_name = 'dvm_features_test_noOH_all_views_physical_jittered_50_reordered.csv'

mask_suffix = f'{target}_{missing_strategy}_{missing_rate}.npy'
for csv_name, seed in ((train_name, 2021), (val_name, 2022), (test_name, 2023)):
    create_missing_mask(
        join(FEATURES, csv_name),
        join(MASK_PATH, f'{csv_name[:-4]}_{mask_suffix}'),
        seed, missing_strategy, missing_rate,
    )

data tabular shape: (70565, 17)
Real missing rate: 0.0
Save missing mask to /bigdata/siyi/data/DVM/features/missing_mask/dvm_features_train_noOH_all_views_physical_jittered_50_reordered_dvm_value_0.0.npy
data tabular shape: (17642, 17)
Real missing rate: 0.0
Save missing mask to /bigdata/siyi/data/DVM/features/missing_mask/dvm_features_val_noOH_all_views_physical_jittered_50_reordered_dvm_value_0.0.npy
data tabular shape: (88207, 17)
Real missing rate: 0.0
Save missing mask to /bigdata/siyi/data/DVM/features/missing_mask/dvm_features_test_noOH_all_views_physical_jittered_50_reordered_dvm_value_0.0.npy


In [6]:
# Random feature-level masks for the DVM data. missing_rate and target are not set
# in this cell, so the values left behind by previously executed cells are used.
missing_strategy = 'feature'

train_name = 'dvm_features_train_noOH_all_views_physical_jittered_50_reordered.csv'
val_name = 'dvm_features_val_noOH_all_views_physical_jittered_50_reordered.csv'
test_name = 'dvm_features_test_noOH_all_views_physical_jittered_50_reordered.csv'
# One seed for every split, unlike the value strategy which uses a seed per split.
feature_seed = 2022

mask_suffix = f'{target}_{missing_strategy}_{missing_rate}.npy'
for csv_name in (train_name, val_name, test_name):
    create_missing_mask(
        join(FEATURES, csv_name),
        join(MASK_PATH, f'{csv_name[:-4]}_{mask_suffix}'),
        feature_seed, missing_strategy, missing_rate,
    )

data tabular shape: (70565, 17)
Real missing rate: 0.0
Save missing mask to /bigdata/siyi/data/DVM/features/missing_mask/dvm_features_train_noOH_all_views_physical_jittered_50_reordered_dvm_feature_0.0.npy
data tabular shape: (17642, 17)
Real missing rate: 0.0
Save missing mask to /bigdata/siyi/data/DVM/features/missing_mask/dvm_features_val_noOH_all_views_physical_jittered_50_reordered_dvm_feature_0.0.npy
data tabular shape: (88207, 17)
Real missing rate: 0.0
Save missing mask to /bigdata/siyi/data/DVM/features/missing_mask/dvm_features_test_noOH_all_views_physical_jittered_50_reordered_dvm_feature_0.0.npy


In [57]:
# Check train, val, test to miss the same columns:
# load the saved feature masks (rate 0.3) and print the first row of each.
check_suffix = '_dvm_feature_0.3.npy'
train_np = np.load(join(MASK_PATH, train_name[:-4] + check_suffix))
val_np = np.load(join(MASK_PATH, val_name[:-4] + check_suffix))
test_np = np.load(join(MASK_PATH, test_name[:-4] + check_suffix))
for split_mask in (train_np, val_np, test_np):
    print(split_mask[0])

[False False False  True  True False  True False False  True False  True
 False False False False False]
[False False False  True  True False  True False False  True False  True
 False False False False False]
[False False False  True  True False  True False False  True False  True
 False False False False False]


#### Mask based on importance

In [58]:
from sklearn.metrics import accuracy_score

# Fit a random forest on the DVM training split; its importances are used to rank the features.
rf = RandomForestClassifier(random_state=2022)
train_csv = join(FEATURES, 'dvm_features_train_noOH_all_views_physical_jittered_50_reordered.csv')
test_csv = join(FEATURES, 'dvm_features_test_noOH_all_views_physical_jittered_50_reordered.csv')
X_train = pd.read_csv(train_csv, header=None)
X_test = pd.read_csv(test_csv, header=None)
y_train = torch.load(join(FEATURES, 'labels_model_all_train_all_views.pt'))
y_test = torch.load(join(FEATURES, 'labels_model_all_test_all_views.pt'))
rf.fit(X_train, y_train)

# Report the accuracy of the predicted classes on the test split.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy on test dataset: {accuracy}")

Accuracy on test dataset: 0.985273277631027


In [59]:
# Feature names used to label the importance ranking printed below.
reordered_column_name = [
    'Color', 'Bodytype', 'Gearbox', 'Fuel_type',
    'Wheelbase', 'Height', 'Width', 'Length', 'Adv_year', 'Adv_month',
    'Reg_year', 'Runned_Miles', 'Price', 'Seat_num', 'Door_num',
    'Entry_price', 'Engine_size',
]

# Rank the features by the fitted forest's importances.
importances = rf.feature_importances_
LI_indices = np.argsort(importances)   # ascending: least important first
MI_indices = LI_indices[::-1]          # descending: most important first
MI_feature_name = [reordered_column_name[pos] for pos in MI_indices]
print(MI_feature_name)

['Entry_price', 'Length', 'Width', 'Wheelbase', 'Engine_size', 'Height', 'Reg_year', 'Price', 'Bodytype', 'Runned_Miles', 'Seat_num', 'Door_num', 'Adv_month', 'Fuel_type', 'Color', 'Gearbox', 'Adv_year']


In [64]:
missing_rate = 0.1

train_name = 'dvm_features_train_noOH_all_views_physical_jittered_50_reordered.csv'
val_name = 'dvm_features_val_noOH_all_views_physical_jittered_50_reordered.csv'
test_name = 'dvm_features_test_noOH_all_views_physical_jittered_50_reordered.csv'

# MI masks columns in order of decreasing importance, LI in order of increasing importance.
for missing_strategy, ordered_indices in (('MI', MI_indices), ('LI', LI_indices)):
    for csv_name in (train_name, val_name, test_name):
        mask_file = f'{csv_name[:-4]}_{target}_{missing_strategy}_{missing_rate}.npy'
        create_certain_missing_mask(
            join(FEATURES, csv_name),
            join(MASK_PATH, mask_file),
            ordered_indices, missing_strategy, missing_rate,
        )

data tabular shape: (70565, 17)
Real missing rate: 0.058823529411764705
Save missing mask to /bigdata/siyi/data/DVM/features/missing_mask/dvm_features_train_noOH_all_views_physical_jittered_50_reordered_dvm_MI_0.1.npy
data tabular shape: (17642, 17)
Real missing rate: 0.058823529411764705
Save missing mask to /bigdata/siyi/data/DVM/features/missing_mask/dvm_features_val_noOH_all_views_physical_jittered_50_reordered_dvm_MI_0.1.npy
data tabular shape: (88207, 17)
Real missing rate: 0.058823529411764705
Save missing mask to /bigdata/siyi/data/DVM/features/missing_mask/dvm_features_test_noOH_all_views_physical_jittered_50_reordered_dvm_MI_0.1.npy
data tabular shape: (70565, 17)
Real missing rate: 0.058823529411764705
Save missing mask to /bigdata/siyi/data/DVM/features/missing_mask/dvm_features_train_noOH_all_views_physical_jittered_50_reordered_dvm_LI_0.1.npy
data tabular shape: (17642, 17)
Real missing rate: 0.058823529411764705
Save missing mask to /bigdata/siyi/data/DVM/features/missin

In [65]:
# Load the saved MI masks (rate 0.3) of the three splits and print the first row of each.
check_suffix = '_dvm_MI_0.3.npy'
train_np = np.load(join(MASK_PATH, train_name[:-4] + check_suffix))
val_np = np.load(join(MASK_PATH, val_name[:-4] + check_suffix))
test_np = np.load(join(MASK_PATH, test_name[:-4] + check_suffix))
for split_mask in (train_np, val_np, test_np):
    print(split_mask[0])

[False False False False  True False  True  True False False False False
 False False False  True  True]
[False False False False  True False  True  True False False False False
 False False False  True  True]
[False False False False  True False  True  True False False False False
 False False False  True  True]
