In [1]:
import sys
import os
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

import numpy as np
import pandas as pd
from GENIE3.GENIE3 import *
from sklearn.metrics import roc_auc_score
from utils import gt_benchmark, precision_at_k
import SERGIO.sergio as sergio
import re
import os
import matplotlib.pyplot as plt
from scipy import stats
from tqdm import tqdm
import hashlib

import tensorflow as tf
# Print the GPU devices TensorFlow can see; an empty list means no GPU was detected.
print("gpu available: ", tf.config.list_physical_devices('GPU'))
# !pwd

2024-09-15 12:35:26.314144: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-15 12:35:26.316194: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-15 12:35:26.358569: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-15 12:35:26.359471: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


gpu available:  []


2024-09-15 12:35:28.221373: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [2]:
def parse_dataset_name(folder_name):
    """Extract the dataset parameters encoded in a data-set folder name.

    Two naming layouts are tried, in this order:

    * ``De-noised_<genes>G_<bins>T_<cells>cPerT_dynamics_<dyn>_DS<id>``
    * ``De-noised_<genes>G_<bins>T_<cells>cPerT_<dyn>_DS<id>``

    Matching is anchored at the start of the name only (``re.match``).

    Args:
        folder_name: Name of a folder (not a full path).

    Returns:
        A dict with the integer fields ``number_genes``, ``number_bins``,
        ``number_sc``, ``dynamics`` and ``dataset_id`` plus ``pattern``, a
        ``str.format`` template that rebuilds the folder name from those
        fields. ``None`` when neither layout matches.
    """
    layouts = (
        (
            r'De-noised_(\d+)G_(\d+)T_(\d+)cPerT_dynamics_(\d+)_DS(\d+)',
            "De-noised_{number_genes}G_{number_bins}T_{number_sc}cPerT_dynamics_{dynamics}_DS{dataset_id}",
        ),
        (
            r'De-noised_(\d+)G_(\d+)T_(\d+)cPerT_(\d+)_DS(\d+)',
            "De-noised_{number_genes}G_{number_bins}T_{number_sc}cPerT_{dynamics}_DS{dataset_id}",
        ),
    )
    # Order matters: it mirrors the capture-group order of both regexes.
    field_names = ('number_genes', 'number_bins', 'number_sc', 'dynamics', 'dataset_id')

    for regex, template in layouts:
        found = re.match(regex, folder_name)
        if not found:
            continue
        info = {name: int(text) for name, text in zip(field_names, found.groups())}
        info["pattern"] = template
        return info
    return None

def get_datasets():
    """List the data sets found under ``./data_sets``.

    Every entry of the directory is passed through ``parse_dataset_name``;
    entries whose name is not recognised are skipped.

    Returns:
        The parsed info dicts, ordered by ascending ``dataset_id``.
    """
    parsed = (parse_dataset_name(entry) for entry in os.listdir('./data_sets'))
    recognised = [info for info in parsed if info]
    recognised.sort(key=lambda info: info['dataset_id'])
    return recognised

def fstr(template):
    """Evaluate `template` as the body of a triple-quoted f-string and return the result.

    The template text is spliced into ``f\"\"\"...\"\"\"`` source and passed to ``eval``,
    so every ``{expression}`` inside it is executed as Python code.
    """
    # NOTE(review): eval() runs arbitrary code; do not pass text from untrusted
    # sources. A template containing three consecutive double quotes would also end
    # the literal early. No caller is visible in this notebook - confirm it is still needed.
    return eval(f'f"""{template}"""')

def experiment(data_info):
    """Run a SERGIO simulation for the data set described by `data_info`.

    Args:
        data_info: Dict as returned by ``parse_dataset_name`` (keys
            ``number_genes``, ``number_bins``, ``number_sc``, ``dynamics``,
            ``dataset_id`` and ``pattern``).

    Returns:
        Tuple ``(sim, expr, expr_clean)``: the simulator object, the value of
        ``sim.getExpressions()``, and that value concatenated along axis 1.
    """
    sim = sergio.sergio(
        number_genes=data_info["number_genes"],
        number_bins=data_info["number_bins"],
        number_sc=data_info["number_sc"],
        noise_params=1,
        decays=0.8,
        sampling_state=15,
        noise_type='dpd'
    )

    # Rebuild the folder name from the parsed fields via the stored template.
    name_fields = ("number_genes", "number_bins", "number_sc", "dynamics", "dataset_id")
    field_values = {field: data_info[field] for field in name_fields}
    folder_name = data_info["pattern"].format(**field_values)

    dataset_dir = f'./data_sets/{folder_name}'
    cid = data_info["dynamics"]

    # `input_file_taregts` (sic) is the keyword spelling used by the SERGIO call.
    sim.build_graph(
        input_file_taregts=f'{dataset_dir}/Interaction_cID_{cid}.txt',
        input_file_regs=f'{dataset_dir}/Regs_cID_{cid}.txt',
        shared_coop_state=2
    )
    sim.simulate()

    expr = sim.getExpressions()
    return sim, expr, np.concatenate(expr, axis=1)

def save_data(dataset_id, expr_clean, expr, sim, iter=0):
    """Save the clean simulation outputs for one data set and iteration.

    Writes three ``.npy`` files into ``./imputation_data/DS<dataset_id>/``:
    ``DS6_clean_iter_<iter>`` (`expr_clean`), ``DS6_expr_iter_<iter>`` (`expr`)
    and ``DS6_clean_counts_iter_<iter>`` (``sim.convert_to_UMIcounts(expr)``
    concatenated along axis 1). The ``DS6_`` file-name prefix is fixed and does
    not depend on `dataset_id`.

    Args:
        dataset_id: Identifier used for the output directory name.
        expr_clean: Array to store as the clean expression matrix.
        expr: Expression data passed to ``sim.convert_to_UMIcounts``.
        sim: Object providing ``convert_to_UMIcounts``.
        iter: Iteration number embedded in the file names.
    """
    print(f"DS{dataset_id}: {expr_clean.shape}")
    out_dir = f'./imputation_data/DS{dataset_id}'
    # Fix: the original path literal lacked the f-prefix, which created a directory
    # literally named "DS{dataset_id}" and left the real target directory missing.
    os.makedirs(out_dir, exist_ok=True)
    np.save(f'{out_dir}/DS6_clean_iter_{iter}', expr_clean)
    np.save(f'{out_dir}/DS6_expr_iter_{iter}', expr)
    cmat_clean = sim.convert_to_UMIcounts(expr)
    cmat_clean = np.concatenate(cmat_clean, axis=1)
    np.save(f'{out_dir}/DS6_clean_counts_iter_{iter}', cmat_clean)

def sparse_ratio(data):
    """Return the fraction of entries of the ndarray `data` that are zero."""
    nonzero_fraction = np.count_nonzero(data) / data.size
    return 1 - nonzero_fraction

def get_sparsity_of_binary_ind(sim, expr, expr_clean, percentile=45, dataset_id=6, iter=0):
    """Apply the simulator's technical-noise steps to `expr` and save the noisy counts.

    The steps run in order: outlier effect, library-size effect, dropout mask,
    conversion to UMI counts, concatenation along axis 1. The result is saved
    to ``./imputation_data/DS<dataset_id>/DS6_45_iter_<iter>`` (the ``DS6_45``
    part of the name is fixed, whatever `percentile` is).

    Args:
        sim: Object providing ``outlier_effect``, ``lib_size_effect``,
            ``dropout_indicator`` and ``convert_to_UMIcounts``.
        expr: Expression data to corrupt.
        expr_clean: Not used by this function.
        percentile: Forwarded to ``sim.dropout_indicator``.
        dataset_id: Identifier used for the output directory name.
        iter: Iteration number embedded in the output file name.

    Returns:
        Tuple ``(sparsity, expr_O, libFactor, expr_O_L, binary_ind, count_matrix)``
        where ``sparsity`` is ``sparse_ratio(binary_ind)``.
    """
    # Add outlier genes.
    with_outliers = sim.outlier_effect(expr, outlier_prob = 0.01, mean = 5, scale = 1)

    # Add the library-size effect.
    lib_factor, with_lib_size = sim.lib_size_effect(with_outliers, mean = 4.5, scale = 0.7)

    # Add dropouts: the indicator is multiplied element-wise into the expression.
    dropout_mask = sim.dropout_indicator(with_lib_size, shape = 8, percentile = percentile)
    with_dropouts = np.multiply(dropout_mask, with_lib_size)

    # Convert to UMI counts, then flatten into a single 2-D gene expression matrix.
    umi_counts = sim.convert_to_UMIcounts(with_dropouts)
    counts_2d = np.concatenate(umi_counts, axis = 1)

    target_dir = os.path.dirname(f'./imputation_data/DS{dataset_id}/DS6_45')
    os.makedirs(target_dir, exist_ok=True)
    np.save(f'./imputation_data/DS{dataset_id}/DS6_45_iter_{iter}', counts_2d)
    print(counts_2d.shape)

    sparsity = sparse_ratio(dropout_mask)
    return sparsity, with_outliers, lib_factor, with_lib_size, dropout_mask, counts_2d

def compute_checksum(data):
    """Return the MD5 hex digest of `data`.

    NumPy arrays are hashed from their raw bytes (``tobytes()``); every other
    value, lists included, is hashed from the UTF-8 encoding of ``str(data)``.
    """
    if isinstance(data, np.ndarray):
        payload = data.tobytes()
    else:
        payload = str(data).encode()
    return hashlib.md5(payload).hexdigest()

def compute_stats(data):
    """Summarise `data` for comparison between runs.

    Returns:
        For a NumPy array or a list: a dict with ``checksum``
        (``compute_checksum(data)``), ``mean`` and ``std``.
        For anything else: a dict with ``checksum`` and the raw ``value``.
    """
    summary = {'checksum': compute_checksum(data)}
    if isinstance(data, (np.ndarray, list)):
        summary['mean'] = np.mean(data)
        summary['std'] = np.std(data)
    else:
        summary['value'] = data
    return summary

def compare_attempts(attempts):
    """Report, per key, whether the checksums differ across attempts.

    Args:
        attempts: Sequence of dicts mapping a key to a stats dict that has a
            ``'checksum'`` entry. The keys of the first attempt define which
            keys are compared.

    Returns:
        Dict mapping each key to ``True`` when at least two attempts have
        different checksums for it, ``False`` otherwise. An empty `attempts`
        yields an empty dict instead of raising ``IndexError``.

    Raises:
        KeyError: If a later attempt lacks a key present in the first one.
    """
    if not attempts:
        return {}
    return {
        key: len({attempt[key]['checksum'] for attempt in attempts}) > 1
        for key in attempts[0]
    }

In [3]:
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error

def run_exp1(x_path, y_path, ind):
    """Impute zeros in the noisy matrix cluster by cluster and save the result.

    The noisy matrix is loaded from `y_path` and transposed, then its rows are
    clustered with KMeans (``random_state=42``). The number of clusters comes
    from ``get_num_cell_types(y, cells_per_cluster=300)``. Within each cluster,
    every zero of a gene is replaced by a draw from a normal distribution whose
    mean and standard deviation are those of the gene's non-zero values in
    that cluster. Genes with no non-zero value in a cluster are left untouched.
    The draws use NumPy's global random state, which is not seeded here.

    The result is written to ``./imputation_data/DS<ind>/yhat_exp1`` (NumPy
    appends ``.npy``).

    Args:
        x_path: Path of the clean matrix. Kept for signature compatibility;
            the clean data is not needed by this method and is no longer loaded.
        y_path: Path of the noisy ``.npy`` matrix.
        ind: Data-set index used in log messages and the output directory.
    """
    ds_str = 'DS' + str(ind)
    save_path = './imputation_data/' + ds_str
    print(f"Loading data for DS{ind}")
    y = np.transpose(np.load(y_path))  # Noisy data

    # One cluster per 300 rows of the transposed matrix.
    n_clusters = get_num_cell_types(y, cells_per_cluster=300)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(y)

    print(f"Running Exp1 imputation for DS{ind}")
    # Work on a floating-point copy: with an integer count matrix, assigning the
    # normal draws into an integer array would silently truncate them.
    imputed_y = y.astype(np.result_type(y.dtype, np.float32))
    for cluster in np.unique(clusters):
        cluster_indices = np.where(clusters == cluster)[0]
        # Fancy indexing copies, so this snapshot is unaffected by the writes below.
        cluster_data = imputed_y[cluster_indices, :]
        for gene_idx in range(cluster_data.shape[1]):
            gene_values = cluster_data[:, gene_idx]
            observed = gene_values[gene_values != 0]  # zeros are treated as missing
            if observed.size == 0:
                continue
            missing_indices = np.where(gene_values == 0)[0]
            imputed_values = np.random.normal(
                np.mean(observed), np.std(observed), size=missing_indices.shape[0]
            )
            imputed_y[cluster_indices[missing_indices], gene_idx] = imputed_values

    print(f"Saving imputed data for DS{ind}")
    os.makedirs(save_path, exist_ok=True)  # np.save does not create directories
    save_str = '/yhat_exp1'
    np.save(save_path + save_str, imputed_y)
    print(f"Exp1 imputation completed for DS{ind} and results saved.")

def _subset_means(data, start_row, n_subsets, subset_size):
    """Return the per-column mean of each consecutive `subset_size`-row slice from `start_row`."""
    return [
        np.mean(data[start_row + s * subset_size: start_row + (s + 1) * subset_size, :], axis=0)
        for s in range(n_subsets)
    ]


def run_exp1_wo_cluster(x_path, y_path, ind, cells_per_block=300, n_subsets=5):
    """Impute zeros in the noisy matrix block by block (no clustering) and save it.

    Both matrices are loaded and transposed. Rows are processed in consecutive
    blocks of `cells_per_block`; rows beyond the last full block are left as
    loaded. For each block:

    * Diagnostics on the clean data are printed: the MSE between the mean
      vectors of `n_subsets` equal slices of the block (intra-block) and
      between matching slices of this block and the next one (inter-block).
    * Every zero of a gene in the noisy block is replaced by a draw from a
      normal distribution fitted to that gene's non-zero values in the block,
      clipped to their [min, max] range. A gene with no non-zero value in the
      block stays zero, and a warning is printed for the block.

    The draws use NumPy's global random state, which is not seeded here. The
    result is written to ``./imputation_data/DS<ind>/yhat_exp1.npy``.

    Args:
        x_path: Path of the clean ``.npy`` matrix (used for diagnostics only).
        y_path: Path of the noisy ``.npy`` matrix.
        ind: Data-set index used in log messages and the output directory.
        cells_per_block: Number of rows per block (default 300).
        n_subsets: Number of slices per block for the MSE diagnostics
            (default 5); each slice has ``cells_per_block // n_subsets`` rows.
    """
    ds_str = 'DS' + str(ind)
    save_path = './imputation_data/' + ds_str

    # load clean and noisy data
    print(f"Loading data for DS{ind}")
    x_clean = np.transpose(np.load(x_path))
    y_noisy = np.transpose(np.load(y_path))
    num_blocks = y_noisy.shape[0] // cells_per_block
    subset_size = cells_per_block // n_subsets

    # Floating-point copy: with an integer count matrix the imputed draws would
    # otherwise be truncated when written back.
    imputed_y = y_noisy.astype(np.result_type(y_noisy.dtype, np.float32))

    for block in range(num_blocks):
        start_row = block * cells_per_block
        end_row = start_row + cells_per_block

        print(f"Verifying means and variances of rows {start_row} to {end_row} in clean data")
        set_means = _subset_means(x_clean, start_row, n_subsets, subset_size)

        # Intra-block MSE, compare mean vectors of sets within the same block
        for i in range(n_subsets):
            for j in range(i + 1, n_subsets):
                mse = mean_squared_error(set_means[i], set_means[j])
                print(f"Intra-block MSE between set {i+1} and set {j+1} in block {block + 1}: {mse}")

        # Inter-block MSE, compare mean vectors of this block with the next block
        if block < num_blocks - 1:
            next_block_means = _subset_means(x_clean, end_row, n_subsets, subset_size)
            for i in range(n_subsets):
                mse = mean_squared_error(set_means[i], next_block_means[i])
                print(f"Inter-block MSE between set {i+1} of block {block + 1} and set {i+1} of block {block + 2}: {mse}")

        y_block = y_noisy[start_row:end_row, :]
        print(f"Total zeros in block {block + 1} before imputation: {np.sum(y_block == 0)}")
        print(f"Computing mean and variance from the noisy data for rows {start_row} to {end_row}")
        # imputation by fill in zeros with normal distribution
        print(f"Imputing missing values (zeros) for rows {start_row} to {end_row}")
        block_imputed_y = imputed_y[start_row:end_row, :].copy()
        for gene_idx in range(block_imputed_y.shape[1]):
            column = y_block[:, gene_idx]
            missing_indices = np.where(column == 0)[0]
            if len(missing_indices) == 0:
                continue
            observed = column[column != 0]
            if len(observed) > 0:
                mean_val = np.mean(observed)
                std_val = np.sqrt(np.var(observed))
                low, high = np.min(observed), np.max(observed)
            else:
                # Nothing observed for this gene in the block: the draw collapses to 0.
                mean_val = std_val = low = high = 0.0
            imputed_values = np.random.normal(mean_val, std_val, size=len(missing_indices))
            imputed_values = np.clip(imputed_values, low, high)
            print(f"Imputed values for gene {gene_idx}: min={np.min(imputed_values)}, max={np.max(imputed_values)}")
            block_imputed_y[missing_indices, gene_idx] = imputed_values

        if np.any(block_imputed_y == 0):
            print(f"Warning: Block {block + 1} still contains zero values after imputation!")

        imputed_y[start_row:end_row, :] = block_imputed_y

    print(f"imputed y in the method is: {imputed_y}")
    print(f"Saving final imputed data for DS{ind}")
    save_str = '/yhat_exp1.npy'
    np.save(save_path + save_str, imputed_y)
    print(f"Exp1 imputation completed for DS{ind} and final dataset saved.")

In [4]:
def get_num_cell_types(y, cells_per_cluster):
    """Return how many clusters of `cells_per_cluster` rows fit into `y`.

    The result is ``y.shape[0] // cells_per_cluster`` and never less than 1.
    """
    whole_clusters = y.shape[0] // cells_per_cluster
    return whole_clusters if whole_clusters > 1 else 1

In [6]:
import importlib
import json

# Evaluate GENIE3 on the clean, noisy and Exp1-imputed matrices of every data set.
# Each data set is scored by ROC-AUC against the ground-truth interaction file, and
# the scores are appended to ./imputation_data/DS<i>/precision_recall_data.json.
#
# The input .npy files (DS6_clean_iter_<n>.npy, DS6_45_iter_<n>.npy) must already
# exist under ./imputation_data/DS<i>/; this cell does not run the simulation.
datasets = get_datasets()
# Wrap the list (not the enumerate object) so tqdm knows the total.
for k, dataset in enumerate(tqdm(datasets)):
    i = k + 1  # 1-based position of the data set; also read by later cells
    #  dataset info
    dataset_id = dataset['dataset_id']
    number_genes = dataset["number_genes"]
    number_bins = dataset["number_bins"]
    number_sc = dataset["number_sc"]
    dynamics = dataset["dynamics"]
    pattern = dataset["pattern"]
    folder_name = pattern.format(number_genes=number_genes, number_bins=number_bins,
                                 number_sc=number_sc, dynamics=dynamics, dataset_id=dataset_id)
    input_file = f'./data_sets/{folder_name}/Regs_cID_{dataset["dynamics"]}.txt'

    # prepare genie3
    individual_results = {}
    # NOTE(review): the ground-truth files are chosen by loop position, not from
    # `dataset`/`folder_name`; any data set past the second uses the DS3 files.
    # Confirm this is intended before adding more data sets.
    if i == 1:
        target_file = './data_sets/De-noised_100G_9T_300cPerT_4_DS1/Interaction_cID_4.txt'
        regs_path = './data_sets/De-noised_100G_9T_300cPerT_4_DS1/Regs_cID_4.txt'
    elif i == 2:
        target_file = './data_sets/De-noised_400G_9T_300cPerT_5_DS2/Interaction_cID_5.txt'
        regs_path = './data_sets/De-noised_400G_9T_300cPerT_5_DS2/Regs_cID_5.txt'
    else:
        target_file = './data_sets/De-noised_1200G_9T_300cPerT_6_DS3/Interaction_cID_6.txt'
        regs_path = './data_sets/De-noised_1200G_9T_300cPerT_6_DS3/Regs_cID_6.txt'
    ds_str = 'DS' + str(i)
    save_path = './imputation_data/' + ds_str
    # Fix: create the data-set directory itself (the original created only its parent).
    os.makedirs(save_path, exist_ok=True)

    # run Exp1 imputation method
    print(f"---> Running Exp1 on DS{i}")
    # TODO: change iter num
    for iter_num in range(1):
        clean_path = save_path + f'/DS6_clean_iter_{iter_num}.npy'
        noisy_path = save_path + f'/DS6_45_iter_{iter_num}.npy'
        run_exp1_wo_cluster(clean_path, noisy_path, i)

        # load noisy and clean data
        y = np.transpose(np.load(noisy_path))
        print("what is y?: ", y)
        x = np.transpose(np.load(clean_path))

        # get true regulator genes from SERGIO data
        reg_file = regs_path
        master_regs = pd.read_table(reg_file, header=None, sep=',')
        master_regs = master_regs[0].values.astype(int).astype(str).tolist()

        # Each row of the interaction file is read as: target, number of regulators,
        # then that many regulator ids.
        regulators = []
        with open(target_file, 'r') as regulator_file:
            for line in regulator_file:
                row = line.split(',')
                num_regs_row = int(float(row[1]))
                # Fix: use a dedicated loop variable; the original reused `i` here and
                # clobbered the data-set index used in every label below.
                for col in range(2, num_regs_row + 2):
                    regulators.append(str(int(float(row[col]))))
        regs = [reg for reg in set(regulators) if reg not in master_regs]

        # Run GENIE3 on Clean Data
        print(f"---> Running GENIE3 on Clean Data for DS{i}")
        # GENIE3 is run without a regulator restriction, so the lists computed
        # above are not passed on.
        regs = None
        gene_names = None

        VIM_CLEAN = GENIE3(x, nthreads=12, ntrees=100)
        gt, rescaled_vim = gt_benchmark(VIM_CLEAN, target_file)
        # Aucroc
        roc_score = roc_auc_score(gt.flatten(), rescaled_vim.flatten())
        individual_results['DS' + str(i) + ' GENIE3 Clean ROC_AUC'] = float('%.2f'%(roc_score))

        # Run GENIE3 on Noisy Data
        print(f"---> Running GENIE3 on Noisy Data for DS{i}")
        VIM_NOISY = GENIE3(y, nthreads=12, ntrees=100)
        gt, rescaled_vim = gt_benchmark(VIM_NOISY, target_file)
        # Aucroc
        roc_score = roc_auc_score(gt.flatten(), rescaled_vim.flatten())
        individual_results['DS' + str(i) + ' GENIE3 Noisy ROC_AUC'] = float('%.2f'%(roc_score))

        # Run GENIE3 on Exp1 Data: basic imputation, no clustering; zeros are filled
        # from a normal distribution clipped to the range of the non-zero data.
        y_hat_exp1 = np.load(save_path + '/yhat_exp1.npy')
        print(y_hat_exp1)
        print(f"---> Running GENIE3 on exp1 Data for DS{i}")
        VIM_exp1 = GENIE3(y_hat_exp1, nthreads=12, ntrees=100)
        gt, rescaled_vim = gt_benchmark(VIM_exp1, target_file)
        np.save(save_path + '/VIM_exp1.npy', rescaled_vim)
        np.save(save_path + '/gt_exp1.npy', gt)
        print("saved exp1 files")
        # Aucroc
        roc_score = roc_auc_score(gt.flatten(), rescaled_vim.flatten())
        individual_results['DS' + str(i) + ' GENIE3 exp1 ROC_AUC'] = float('%.2f'%(roc_score))

        # Append this run's scores to the JSON history (a list of result dicts).
        results_path = save_path + '/precision_recall_data.json'
        if os.path.exists(results_path):
            with open(results_path, 'r') as fp:
                existing_data = json.load(fp)
            if not isinstance(existing_data, list):
                existing_data = [existing_data]
        else:
            existing_data = []
        existing_data.append(individual_results)
        with open(results_path, 'w') as fp:
            json.dump(existing_data, fp)

0it [00:00, ?it/s]

---> Running Exp1 on DS1
Loading data for DS1





FileNotFoundError: [Errno 2] No such file or directory: './imputation_data/DS1/DS6_clean_iter_0.npy'

In [11]:
# Show the ROC-AUC scores stored by the last run of the evaluation loop.
print(individual_results)

{'DS3 GENIE3 Clean ROC_AUC': 0.69, 'DS3 GENIE3 Noisy ROC_AUC': 0.46, 'DS3 GENIE3 exp1 ROC_AUC': 0.44}


{'DS3 GENIE3 Clean ROC_AUC': 0.7, 'DS3 GENIE3 Noisy ROC_AUC': 0.47, 'DS3 GENIE3 exp1 ROC_AUC': 0.46}
{'DS3 GENIE3 Clean ROC_AUC': 0.7, 'DS3 GENIE3 Noisy ROC_AUC': 0.46, 'DS3 GENIE3 exp1 ROC_AUC': 0.42}
{'DS3 GENIE3 Clean ROC_AUC': 0.72, 'DS3 GENIE3 Noisy ROC_AUC': 0.49, 'DS3 GENIE3 exp1 ROC_AUC': 0.46}
{'DS3 GENIE3 Clean ROC_AUC': 0.7, 'DS3 GENIE3 Noisy ROC_AUC': 0.48, 'DS3 GENIE3 exp1 ROC_AUC': 0.48}
{'DS3 GENIE3 Clean ROC_AUC': 0.72, 'DS3 GENIE3 Noisy ROC_AUC': 0.47, 'DS3 GENIE3 exp1 ROC_AUC': 0.45}

In [7]:
# Sanity check: report if the imputed matrix still holds entries equal to zero.
if (y_hat_exp1 == 0).any():
    print(f"DS{i} contains missing or zero data")

In [None]:
# Dump the noisy matrix `y` loaded by the evaluation loop.
print(y)


In [None]:
# Dump the imputed matrix and count how many entries are still exactly zero.
print(y_hat_exp1)
zero_count = int((y_hat_exp1 == 0).sum())
print(f"Number of zeros in y_hat_exp1: {zero_count}")

In [None]:
# Inspect an earlier clean matrix: dump it, flag non-positive entries, show its minimum.
x = np.load('../previous_imputations/DS6_clean_iter_0.npy').T
print(x)
if (x <= 0).any():
    print("x contains missing or zero data")
x.min()

In [15]:
# Compare the two noisy-count files stored for DS1: contents, dtype, shape and minimum.
# Fixes: the labels said "clean data" although both files are the DS6_45 (noisy)
# matrices, and the minimum of x1 was a bare mid-cell expression that never displayed.
x1 = np.transpose(np.load('./imputation_data/DS1/DS6_45_iter_0.npy'))
print(x1)
print(f"Data type of DS6_45_iter_0.npy: {x1.dtype}")
print(x1.shape)
print(f"Min of DS6_45_iter_0.npy: {np.min(x1)}")

x2 = np.transpose(np.load('./imputation_data/DS1/DS6_45.npy'))
print(x2)
print(f"Data type of DS6_45.npy: {x2.dtype}")
print(x2.shape)
np.min(x2)

[[[ 0  2  0 ...  0  0  0]
  [ 1  2  0 ...  0  2  4]
  [ 1  2  0 ...  1  2  1]
  ...
  [ 1  3  0 ...  0  0  3]
  [ 3  5  0 ...  1  7  4]
  [ 3  0  0 ...  0  0  3]]

 [[ 0  4  4 ...  0  0  1]
  [ 2  1  2 ...  2  0  0]
  [ 0  1  3 ...  1  0  0]
  ...
  [ 1  0  3 ...  3  0  0]
  [ 1  2  8 ...  7  0  0]
  [ 0  0  3 ...  3  1  0]]

 [[ 3  3  0 ...  0  2  7]
  [ 1  0  0 ...  0  2  9]
  [ 0  0  0 ...  0  0  4]
  ...
  [ 1  1  0 ...  1  2  5]
  [ 2  4  0 ...  1  6 11]
  [ 3  0  1 ...  0  0  2]]

 ...

 [[ 0  3  0 ...  0  3  0]
  [ 1  0  1 ...  0  7  0]
  [ 0  0  0 ...  1  3  1]
  ...
  [ 0  0  0 ...  0  3  3]
  [ 1  1  1 ...  3  3  2]
  [ 1  0  0 ...  0  2  0]]

 [[ 7  0  0 ...  0  2  1]
  [ 1  2  1 ...  0  6  0]
  [ 0  1  0 ...  0  2  0]
  ...
  [ 0  1  3 ...  2  2  0]
  [ 7  1  5 ...  5  3  5]
  [ 7  1  2 ...  1  1  3]]

 [[ 0  1  0 ...  0  0  1]
  [ 1  2  1 ...  2  2  3]
  [ 0  0  0 ...  0  0  0]
  ...
  [ 2  3  2 ...  3  1  3]
  [ 0  2  0 ...  2  4  9]
  [ 4  0  0 ...  1  0  6]]]
Data type 

0

In [None]:
import h5py

# Print the dtype and contents of every dataset in the file's 'matrix' group.
with h5py.File('./HU_0270_Adipose_GSE136229_gene_count.h5', 'r') as f:
    # Access the 'matrix' group
    matrix_group = f['matrix']

    for key in ('data', 'indices', 'indptr', 'shape', 'barcodes', 'features'):
        data = matrix_group[key][:]
        print(f"Data type of '{key}': {data.dtype}")
        if key == 'data':
            # Only the value array also reports its length.
            print(data, len(data))
        else:
            print(data)
