In [1]:
import os
import numpy as np
import pandas as pd

In [None]:
import countdiff

In [3]:
# Work from the directory two levels above the installed `countdiff` package so the
# relative "data/..." paths used throughout this notebook resolve.
# Presumably that directory is the repository root — TODO confirm for your install layout.
project_root = os.path.join(countdiff.__path__[0], '..', '..')
os.chdir(project_root)

In [4]:
%load_ext autoreload
%autoreload 2

from countdiff.data.process_scrna import SingleCellDataset
from countdiff.data.generate_masks import MNAR_mask_high, MNAR_mask_low, MAR_mask_10, MAR_mask_25

In [9]:
np.random.seed(42)
dataset_name = "zero_shot"

# Supported datasets: name -> (HDF5 file, list passed as SingleCellDataset's third argument).
# The list entries look like per-cell metadata column names — TODO confirm against SingleCellDataset.
DATASET_SPECS = {
    "heart": ("data/dnadiff/filtered_heart_data.hdf5", ["batch", "cell_type", "gender", "age"]),
    "fetus": ("data/dnadiff/filtered_hca_data.hdf5", ["cell_type", "disease", "sex", "development_day", "batch"]),
    "zero_shot": ("data/dnadiff/zero_shot_heart_data.hdf5", ["batch", "cell_type"]),
}

# Fail fast on an unsupported name. Without this check `test_set` would be left
# undefined (or stale from an earlier run) and every mask below would be generated
# for the wrong dataset while still being saved under `dataset_name`.
if dataset_name not in DATASET_SPECS:
    raise ValueError(
        f"Unknown dataset_name {dataset_name!r}; expected one of {sorted(DATASET_SPECS)}"
    )

hdf5_path, label_columns = DATASET_SPECS[dataset_name]
test_set = SingleCellDataset(hdf5_path, "test", label_columns)



--- Initializing Dataset for test split ---
Loading from HDF5 file: 'data/dnadiff/zero_shot_heart_data.hdf5'...
Dataset initialized.
Number of cells: 70430
Number of genes: 1000


In [14]:
# Dropout rates used for the MCAR masks.
dropout_rates = [0.1, 0.25, 0.5, 0.75]

# Root folder for all generated masks, with one sub-folder per mask family.
masks_dir = "data/dnadiff/random_masks"
MCAR_dir, MAR_dir, MNAR_high_dir, MNAR_low_dir = (
    os.path.join(masks_dir, subfolder)
    for subfolder in ("MCAR_masks", "MAR_masks", "MNAR_high_masks", "MNAR_low_masks")
)

In [15]:
# Re-seed NumPy's global (legacy) RNG before mask generation.
# NOTE(review): the MCAR cell that follows seeds again at its own top, so this
# standalone cell looks redundant — confirm before removing.
np.random.seed(42)

Generating MCAR masks

In [16]:
# Generate one MCAR mask per dropout rate: each entry is masked independently with
# probability `dpr`. The seed is set here so the cell gives the same masks on re-run.
np.random.seed(42)

# Create the output folder once, up front (loop-invariant; exist_ok avoids the
# separate exists() check).
os.makedirs(MCAR_dir, exist_ok=True)

for dpr in dropout_rates:
    mask = np.random.rand(*test_set.counts.shape) < dpr
    print(mask[:, :10].mean()) # Should be the same every time if seed is fixed
    mask_path = os.path.join(MCAR_dir, f"{dataset_name}_dropout_{dpr}.npy")
    np.save(mask_path, mask)

0.10020379130588879
0.24980260793801595
0.5000977115319297
0.7499761258977401


In [9]:
# NOTE(review): this cell repeats the MCAR generation cell directly above it;
# consider deleting one of the two.
# The file name is built from `dataset_name` rather than a hard-coded "zero_shot",
# so that running this cell with another dataset loaded cannot overwrite the
# zero-shot masks with masks of the wrong shape. For dataset_name == "zero_shot"
# the files written are the same as before.
np.random.seed(42)
os.makedirs(MCAR_dir, exist_ok=True)

for dpr in dropout_rates:
    mask = np.random.rand(*test_set.counts.shape) < dpr
    print(mask[:, :10].mean()) # Should be the same every time if seed is fixed
    mask_path = os.path.join(MCAR_dir, f"{dataset_name}_dropout_{dpr}.npy")
    np.save(mask_path, mask)

0.09996308391310521
0.2504231151497941
0.5005296038619906
0.7506985659520091


Generating MNAR/MAR masks

In [13]:
from countdiff.data.generate_masks import MNAR_mask_low, MNAR_mask_high, MAR_mask_10, MAR_mask_25

In [14]:
# Shape of the test-split count matrix; passed to the MAR mask generators.
# Presumably (n_cells, n_genes) — TODO confirm.
test_set_shape = test_set.counts.shape

In [15]:
# Generate and save the two MAR masks (the 0.1 and 0.25 variants).
#
# Seed here, as the MCAR and MNAR cells do, so the result does not depend on how many
# random draws earlier cells happened to consume. This only matters if MAR_mask_10 /
# MAR_mask_25 draw from NumPy's global RNG — TODO confirm in countdiff.data.generate_masks.
np.random.seed(42)

os.makedirs(MAR_dir, exist_ok=True)

# Keep this call order: if the generators share RNG state, swapping them changes both masks.
mask = MAR_mask_10(test_set_shape)
mask2 = MAR_mask_25(test_set_shape)

mask_path = os.path.join(MAR_dir, f"{dataset_name}_dropout_0.1.npy")
mask_path2 = os.path.join(MAR_dir, f"{dataset_name}_dropout_0.25.npy")
np.save(mask_path, mask)
np.save(mask_path2, mask2)

0.020002839698991907
0.06999929007525202
0.09999787022575607


In [13]:
# Generate and save the MNAR "high" and "low" masks for two dropout rates.
np.random.seed(42)

os.makedirs(MNAR_high_dir, exist_ok=True)
os.makedirs(MNAR_low_dir, exist_ok=True)

# Loop-invariant denominator: number of True entries in the inverted missingness mask.
n_candidates = (~(test_set.missingness_mask).numpy()).astype(int).sum()

# The loop variable is deliberately NOT named `dropout_rates`: that name holds the list
# of MCAR rates from the config cell, and rebinding it to a float here would make the
# MCAR cells fail ("'float' object is not iterable") when re-run afterwards.
for mnar_rate in [0.1, 0.25]:
    # A fresh inverted mask is passed on every call, as in the original, in case the
    # helper modifies its argument. The meaning of the literal 5000 is defined by
    # MNAR_mask_high / MNAR_mask_low — see countdiff.data.generate_masks.
    high_mask = MNAR_mask_high(test_set.counts.numpy(), mnar_rate, 5000, ~test_set.missingness_mask.numpy())
    print(high_mask.sum()/n_candidates)  # fraction of candidate entries that were masked
    print(test_set.counts[high_mask].mean())  # mean count over the masked entries
    high_mask_path = os.path.join(MNAR_high_dir, f"{dataset_name}_dropout_{mnar_rate}.npy")
    np.save(high_mask_path, high_mask)

    low_mask = MNAR_mask_low(test_set.counts.numpy(), mnar_rate, 5000, ~test_set.missingness_mask.numpy())
    print(low_mask.sum()/n_candidates)
    print(test_set.counts[low_mask].mean())
    low_mask_path = os.path.join(MNAR_low_dir, f"{dataset_name}_dropout_{mnar_rate}.npy")
    np.save(low_mask_path, low_mask)


0.0999666913343996
tensor(3.7680)
0.0999666913343996
tensor(0.9212)
0.24997122858469206
tensor(2.9880)
0.24997122858469206
tensor(0.9668)
