# Transformer on CITE-seq

In [1]:
import os
import gc

os.environ["NUMBA_CACHE_DIR"] = "/scratch/st-jiaruid-1/yinian/tmp/"  # https://github.com/scverse/scanpy/issues/2113
from os.path import basename, join
from os import makedirs
from pathlib import Path
import yaml

import logging
import anndata as ad
import pickle
import numpy as np
import pandas as pd
import scanpy as sc
import scipy

import h5py
import hdf5plugin
import tables

from sklearn.preprocessing import binarize
from sklearn.decomposition import TruncatedSVD

Matplotlib created a temporary config/cache directory at /tmp/pbs.4263223.pbsha.ib.sockeye/matplotlib-246i220g because the default path (/home/jovyan/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


## Load the data

In [2]:
def load_data_as_anndata(filepaths, metadata_path, n_chunks=100):
    """
    Loads the files in <filepaths> as AnnData objects

    Each file is opened with h5py and must contain a group named after the
    file (the basename with its last three characters, e.g. ``.h5``,
    stripped). That group must hold the datasets ``axis0`` (feature names),
    ``axis1`` (cell ids) and ``block0_values`` (the dense value matrix, one
    row per cell).

    Parameters
    ----------
    filepaths : dict
        Maps a dataset name to the path of its HDF5 file. The names become
        the keys of the returned dictionary.
    metadata_path : str or path-like
        CSV file with a ``cell_id`` column (used as index) and a
        ``technology`` column.
    n_chunks : int, default 100
        Number of row chunks the dense matrix is split into while it is
        converted to a sparse matrix, so that only one chunk is held densely
        in memory at a time.

    Returns
    -------
    dict
        ``{name: AnnData}`` with a CSR matrix as ``X``, the matching metadata
        rows as ``obs`` and the feature names as the ``var`` index.

    Raises
    ------
    ValueError
        If the cells of one file do not all share a single technology.

    Source: https://github.com/openproblems-bio/neurips_2022_saturn_notebooks/blob/main/notebooks/loading_and_visualizing_all_data.ipynb
    """
    metadata_df = pd.read_csv(metadata_path).set_index("cell_id")

    adatas = {}
    for name, filepath in filepaths.items():
        filename = basename(filepath)[:-3]
        logging.info(f"Loading {filename}")

        # The context manager guarantees the HDF5 handle is released even if
        # loading fails part-way through.
        with h5py.File(filepath, "r") as h5_file:
            h5_data = h5_file[filename]

            features = h5_data["axis0"][:].astype(str)
            cell_ids = h5_data["axis1"][:].astype(str)

            # Fail fast (before reading the big matrix) if the file mixes
            # cells from several technologies.
            technologies = metadata_df.loc[cell_ids, "technology"].unique()
            if len(technologies) != 1:
                raise ValueError(
                    f"Expected exactly one technology in {filename}, "
                    f"found {len(technologies)}: {list(technologies)}"
                )

            values = h5_data["block0_values"]
            n_cells = values.shape[0]

            sparse_chunks = []
            for rows in np.array_split(np.arange(n_cells), n_chunks):
                if rows.size == 0:
                    # More chunks requested than rows available.
                    continue
                # Each chunk is a contiguous run of rows, so a plain slice
                # reads the same data as the index array.
                dense_chunk = values[rows[0]:rows[-1] + 1]
                sparse_chunks.append(scipy.sparse.csr_matrix(dense_chunk))

            if sparse_chunks:
                X = scipy.sparse.vstack(sparse_chunks)
            else:
                # No rows at all: keep the feature dimension intact.
                X = scipy.sparse.csr_matrix(values.shape, dtype=values.dtype)

        adatas[name] = ad.AnnData(
            X=X,
            obs=metadata_df.loc[cell_ids],
            var=pd.DataFrame(index=features),
        )

    return adatas

In [3]:
# Experiment configuration: `paths` maps dataset names to their HDF5 files and
# `metadata` points to the per-cell metadata CSV.
config_file = Path('/scratch/st-jiaruid-1/yinian/my_jupyter/scRNA-competition/experiments/basic-nn-cite.yaml')
config = yaml.safe_load(config_file.read_text())
adatas = load_data_as_anndata(
    filepaths=config['paths'],
    metadata_path=config['metadata'],
)

In [4]:
# Pull the three loaded datasets out of the dictionary in one go.
x_train, x_test, y_train = (adatas[key] for key in ('x', 'x_test', 'y'))

# Train and test inputs are concatenated so that the embedding computed later
# is fitted on both at once.
combined_data = ad.concat([x_train, x_test])

## Generate PCA-like (truncated SVD) embeddings of dimension 140

In [7]:
def pca_data(data, dimension, random_state=42):
    """
    Reduce ``data.X`` to ``dimension`` components with a truncated SVD.

    Despite the name, this uses ``TruncatedSVD`` (no mean-centering) rather
    than a full PCA.

    Parameters
    ----------
    data : AnnData
        Object whose ``X`` matrix is embedded.
    dimension : int
        Number of components to keep.
    random_state : int, default 42
        Seed forwarded to ``TruncatedSVD`` for reproducible embeddings.

    Returns
    -------
    AnnData
        New object holding the embedding as ``X`` together with the ``obs``
        and ``uns`` of ``data``.
    """
    svd = TruncatedSVD(n_components=dimension, random_state=random_state)
    embedding = svd.fit_transform(data.X)
    # Keyword arguments matter here: the third positional parameter of
    # AnnData is `var`, so passing `data.uns` positionally would file it
    # under the wrong slot.
    return ad.AnnData(X=embedding, obs=data.obs, uns=data.uns)

In [8]:
# Embed train and test cells together into a 140-dimensional space.
pca_combined_data = pca_data(data=combined_data, dimension=140)

In [23]:
# Fraction of all embedded cells that belongs to each cell type.
cell_type_proportions = {
    cell_type: (pca_combined_data.obs['cell_type'] == cell_type).sum() / pca_combined_data.shape[0]
    for cell_type in set(pca_combined_data.obs['cell_type'])
}

## Generate input data

In [41]:
def separate_data(data):
    """
    Group the observation names of ``data`` by cell type and day.

    Reads the ``cell_type`` and ``day`` columns of ``data.obs`` and returns a
    dictionary keyed by ``(cell_type, day)`` whose values are the
    ``obs_names`` of the matching cells. Combinations without any cell are
    left out.
    """
    all_cell_types = set(data.obs['cell_type'])
    all_days = set(data.obs['day'])

    names_by_group = {}
    for cell_type in all_cell_types:
        is_cell_type = data.obs['cell_type'] == cell_type
        for day in all_days:
            subset = data[(data.obs['day'] == day) & is_cell_type]
            if subset.shape[0] > 0:
                names_by_group[(cell_type, day)] = subset.obs_names
    return names_by_group

In [73]:
def generate_sequence(pca_combined_data, cell_type, indices, days=(2, 3, 4)):
    """
    Sample one cell of ``cell_type`` per day and return their embeddings.

    Parameters
    ----------
    pca_combined_data : AnnData
        Embedded cells; indexed by observation name.
    cell_type : str
        Cell type to sample from.
    indices : dict
        Maps ``(cell_type, day)`` to the observation names available for that
        combination (as produced by ``separate_data``).
    days : sequence, default (2, 3, 4)
        Days to sample, in the order they appear in the output.

    Returns
    -------
    numpy.ndarray
        Dense array with one row per entry of ``days``.

    Raises
    ------
    KeyError
        If ``indices`` has no entry for ``(cell_type, day)`` for some day.
    """
    seq = [np.random.choice(indices[(cell_type, day)]) for day in days]
    X = pca_combined_data[seq, :].X
    # Embeddings are usually stored as a dense array, which has no
    # `.toarray()`; only sparse matrices need the conversion.
    if scipy.sparse.issparse(X):
        return X.toarray()
    return np.asarray(X)

In [82]:
def generate_train_data(pca_combined_data, y_train, cell_type_proportions, num_samples=100_000):
    """
    Build ``num_samples`` random sequences of embedded cells.

    For every sample a cell type is drawn according to
    ``cell_type_proportions`` (a mapping of cell type to probability), then
    ``generate_sequence`` picks one cell of that type per day. Candidate
    cells are the observations of ``y_train``, grouped with
    ``separate_data``. The sequences are stacked along a new leading axis.
    """
    indices = separate_data(y_train)
    cell_types = list(cell_type_proportions)
    cell_type_probs = list(cell_type_proportions.values())

    sequences = [
        generate_sequence(
            pca_combined_data,
            np.random.choice(cell_types, p=cell_type_probs),
            indices,
        )
        for _ in range(num_samples)
    ]
    return np.stack(sequences, axis=0)

In [83]:
# Draw 25,000 random training sequences from the embedded cells.
generated_train_data = generate_train_data(
    pca_combined_data,
    y_train,
    cell_type_proportions,
    num_samples=25000,
)