In [1]:
import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scanpy as sc
import src.mvae.mt.mvae.utils as utils
import torch
import yaml

from scipy.io import mmread, mmwrite
from scipy.sparse import csr_matrix, save_npz, vstack
from src.lightning.gene import GeneModule
from src.mvae.mt.data import GeneDataset
from src.mvae.mt.mvae.distributions import *
from src.mvae.mt.mvae.models.gene_vae import GeneVAE
from src.mvae.mt.mvae.ops.hyperbolics import lorentz_to_poincare
from src.mvae.mt.mvae.ops.spherical import spherical_to_projected
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

In [2]:
# Input shard directory and output directory used by the preprocessing cells.
# NOTE(review): both are machine-specific absolute paths; adjust them before
# running this notebook on another machine.
data_dir = "/home/romainlhardy/Downloads/merlin_cxg_2023_05_15_sf-log1p/train"
save_dir = "/home/romainlhardy/code/hyperbolic-cancer/data/cellxgene"
os.makedirs(save_dir, exist_ok=True)
# os.listdir returns entries in arbitrary, filesystem-dependent order; sorting
# gives any sampling done over this list a stable starting point.
parquet_files = sorted(os.listdir(data_dir))

In [3]:
# Select highly variable genes
#
# Estimates the highly variable gene (HVG) set from a random subset of the
# input files and saves the resulting boolean mask to
# `save_dir/highly_variable_mask.npy`.
file_samples = 5
hvg_seed = 0  # fixed seed so the sampled files (and hence the mask) are reproducible
rng = np.random.default_rng(hvg_seed)
# Sample WITHOUT replacement: the NumPy default (replace=True) may draw the same
# file several times, duplicating its cells in the statistics. The size is
# capped so that a directory with fewer files than requested still works.
sampled_files = rng.choice(
    parquet_files,
    size=min(file_samples, len(parquet_files)),
    replace=False,
)

# Collect one CSR block per successfully parsed file and stack them once at the
# end; stacking inside the loop re-copies the accumulated matrix every time.
sparse_blocks = []
for i, file in enumerate(sampled_files):
    df = pd.read_parquet(os.path.join(data_dir, file), engine="pyarrow", use_threads=True)
    # Each row of the `X` column holds one expression vector; rows that are not
    # already ndarrays (e.g. lists) are converted so they can be stacked.
    if not isinstance(df.X.iloc[0], np.ndarray):
        arrays = [np.array(x) for x in df.X]
    else:
        arrays = df.X.values

    try:
        X = np.stack(arrays, axis=0)
        # Invert log1p; presumably the stored values are log1p-transformed
        # (the directory name ends in "sf-log1p") -- TODO confirm.
        X = np.expm1(X)
        X_sparse = csr_matrix(X)
    except ValueError as e:
        print(f"Error processing file {file} at index {i}: {e}")
        continue
    except TypeError as e:
        print(f"Error processing file {file} at index {i} (TypeError, possibly non-numeric data in X): {e}")
        continue

    sparse_blocks.append(X_sparse)

if not sparse_blocks:
    raise RuntimeError("No files could be processed; cannot select highly variable genes.")

# Build AnnData from ALL accumulated cells. Previously only the dense matrix of
# the last file was used, so the other sampled files never contributed.
mtx = vstack(sparse_blocks, format="csr")
n_cells, n_genes = mtx.shape
adata = ad.AnnData(X=mtx)
adata.var_names = [f"g{i}" for i in range(n_genes)]
adata.obs_names = [f"c{i}" for i in range(n_cells)]

sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=5000)

# Boolean vector over genes, reused later to subset the expression columns.
highly_variable_mask = adata.var.highly_variable.values
np.save(os.path.join(save_dir, "highly_variable_mask.npy"), highly_variable_mask)

In [None]:
# Export a random sample of files, restricted to the highly variable genes, as
# a Matrix Market file (genes x cells) plus tab-separated batch and cell-type
# label files in `save_dir`.
file_samples = 50
export_seed = 0  # fixed seed so the exported sample is reproducible
rng = np.random.default_rng(export_seed)
# Sample WITHOUT replacement: the NumPy default (replace=True) may draw the same
# file several times, which would write duplicated cells into the export.
sampled_files = rng.choice(
    parquet_files,
    size=min(file_samples, len(parquet_files)),
    replace=False,
)
batch_columns = ["dataset_id", "donor_id"]
mtx = None
batch = None
cell_type = None

# Per-file pieces are collected in lists and combined once after the loop;
# growing the arrays inside the loop re-copies everything on each iteration.
mtx_blocks = []
batch_blocks = []
cell_type_blocks = []

for i, file in enumerate(tqdm(sampled_files)):
    df = pd.read_parquet(os.path.join(data_dir, file), engine="pyarrow", use_threads=True)
    # Each row of the `X` column holds one expression vector; rows that are not
    # already ndarrays (e.g. lists) are converted so they can be stacked.
    if not isinstance(df.X.iloc[0], np.ndarray):
        arrays = [np.array(x) for x in df.X]
    else:
        arrays = df.X.values

    try:
        X = np.stack(arrays, axis=0).astype(np.float32)
        # Invert log1p (presumably the stored transform -- TODO confirm), then
        # keep only the columns flagged in `highly_variable_mask`.
        X = np.expm1(X)[:, highly_variable_mask]
        X_sparse = csr_matrix(X)
    except ValueError as e:
        print(f"Error processing file {file} at index {i}: {e}")
        continue
    except TypeError as e:
        print(f"Error processing file {file} at index {i} (TypeError, possibly non-numeric data in X): {e}")
        continue

    # Labels are appended only for files whose matrix parsed, so rows of
    # `mtx`, `batch` and `cell_type` stay aligned.
    mtx_blocks.append(X_sparse)
    batch_blocks.append(df[batch_columns].values)
    cell_type_blocks.append(df["cell_type"].values)

if mtx_blocks:
    mtx = vstack(mtx_blocks, format="csr")
    batch = np.vstack(batch_blocks)
    cell_type = np.concatenate(cell_type_blocks)

if mtx is not None:
    print(f"Finished processing {len(mtx_blocks)} of {len(sampled_files)} files. Saving final data...")
    try:
        mtx_path = os.path.join(save_dir, "cellxgene_counts.mtx")
        batch_path = os.path.join(save_dir, "cellxgene_batch.tsv")
        cell_type_path = os.path.join(save_dir, "cellxgene_cell_type.tsv")

        # Transposed on write, so the file is laid out genes x cells.
        mmwrite(mtx_path, mtx.T)
        # fmt="%s" writes each label verbatim. The default float format raises
        # TypeError for string/object labels and would turn integer codes into
        # scientific notation.
        np.savetxt(batch_path, batch, delimiter="\t", fmt="%s")
        np.savetxt(cell_type_path, cell_type, delimiter="\t", fmt="%s")
        print("Data saved successfully.")
    except Exception as e:
        # Best-effort save: report the failure instead of aborting the notebook.
        print(f"Error saving final data: {e}")
else:
    print("No data processed or accumulated.")