In [1]:
import os
os.environ[ 'NUMBA_CACHE_DIR' ] = '/scratch/st-jiaruid-1/yinian/tmp/' # https://github.com/scverse/scanpy/issues/2113

import logging
import anndata as ad
import pickle
import numpy as np
import pandas as pd
import scanpy as sc
import scipy

import h5py
import hdf5plugin
import tables

from sklearn.preprocessing import binarize
from sklearn.decomposition import TruncatedSVD

Matplotlib created a temporary config/cache directory at /tmp/pbs.4256786.pbsha.ib.sockeye/matplotlib-6hpjxl09 because the default path (/home/jovyan/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


### Preprocessing hyperparameters

In [2]:
CITE_TOP_GENES = 250
MULTI_TOP_GENES = 1000
CITE_STACK = 10
MULTI_STACK = 50

### Load the CITE data

In [3]:
DATA_DIR = "/arc/project/st-jiaruid-1/yinian/multiome/"
FP_CELL_METADATA = os.path.join(DATA_DIR,"metadata.csv")

FP_CITE_TRAIN_INPUTS = os.path.join(DATA_DIR,"train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = os.path.join(DATA_DIR,"train_cite_targets.h5")
FP_CITE_TEST_INPUTS = os.path.join(DATA_DIR,"test_cite_inputs.h5")

FP_MULTIOME_TRAIN_INPUTS = os.path.join(DATA_DIR,"train_multi_inputs.h5")
FP_MULTIOME_TRAIN_TARGETS = os.path.join(DATA_DIR,"train_multi_targets.h5")
FP_MULTIOME_TEST_INPUTS = os.path.join(DATA_DIR,"test_multi_inputs.h5")

FP_SUBMISSION = os.path.join(DATA_DIR,"sample_submission.csv")
FP_EVALUATION_IDS = os.path.join(DATA_DIR,"evaluation_ids.csv")

In [4]:
metadata_df = pd.read_csv(FP_CELL_METADATA)
metadata_df = metadata_df.set_index('cell_id')

In [8]:
base_dir = '/arc/project/st-jiaruid-1/yinian/multiome/'
filenames = [
    'test_cite_inputs', 
#     'test_multi_inputs', 
    'train_cite_inputs',
    'train_cite_targets',
#     'train_multi_inputs',
#     'train_multi_targets',
]

In [7]:
adatas = {}
chunk_size = 10000

for filename in filenames:
    print(f'loading {filename}.h5')
    filepath = base_dir + filename + '.h5'
    
    h5_file = h5py.File(filepath)
    h5_data = h5_file[filename]
    
    features = h5_data['axis0'][:]
    cell_ids = h5_data['axis1'][:]
    
    features = features.astype(str)
    cell_ids = cell_ids.astype(str)
    
    technology = metadata_df.loc[cell_ids, 'technology'].unique().item()
    

    sparse_chunks = []
    n_cells = h5_data['block0_values'].shape[0]

    for chunk_indices in np.array_split(np.arange(n_cells), 100):
        chunk = h5_data['block0_values'][chunk_indices]
        sparse_chunk = scipy.sparse.csr_matrix(chunk)
        sparse_chunks.append(sparse_chunk)

    X = scipy.sparse.vstack(sparse_chunks)

    adata = ad.AnnData(
        X=X,
        obs=metadata_df.loc[cell_ids],
        var=pd.DataFrame(index=features),
    )
    
    adatas[filename] = adata

loading test_cite_inputs.h5
loading train_cite_inputs.h5
loading train_cite_targets.h5


### CITE top genes

In [8]:
def gex_de_analysis(adata_GEX, top_genes):
    '''get top DE genes per cell type (multiome)'''
#     adata_GEX = sc.read_h5ad(path)
#     adata_GEX.X = adata_GEX.layers['counts']
#     sc.pp.normalize_per_cell(adata_GEX, counts_per_cell_after=1e6)
#     sc.pp.log1p(adata_GEX)
    sc.pp.filter_cells(adata_GEX, min_genes=200)
    sc.pp.filter_genes(adata_GEX, min_cells=3)
    adata_GEX.var['mt'] = adata_GEX.var_names.str.contains('MT-') 
    sc.pp.calculate_qc_metrics(adata_GEX, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    adata_GEX = adata_GEX[adata_GEX.obs.n_genes_by_counts < 4000, :]
    sc.pp.normalize_total(adata_GEX, target_sum=1e4)
    sc.pp.log1p(adata_GEX)
    sc.pp.highly_variable_genes(adata_GEX, min_mean=0.0125, max_mean=3, min_disp=0.5)
    sc.pp.scale(adata_GEX, max_value=10)
    sc.tl.rank_genes_groups(adata_GEX, 'cell_type', method='wilcoxon')
    cell_types = adata_GEX.obs.cell_type.value_counts().index
    column_names = ['names', 'scores', 'logfoldchanges', 'pvals', 'pvals_adj', 'cell_type']
    df = pd.DataFrame(columns = column_names)
    for cell_type in cell_types:
        dedf = sc.get.rank_genes_groups_df(adata_GEX, group=cell_type)
        dedf['cell_type'] = cell_type
        dedf = dedf.sort_values('scores', ascending=False).iloc[:top_genes]
        df = df.append(dedf, ignore_index=True)
    return df

In [9]:
important_cols = ['ENSG00000114013_CD86', 'ENSG00000120217_CD274', 'ENSG00000196776_CD47', 'ENSG00000117091_CD48', 'ENSG00000101017_CD40', 'ENSG00000102245_CD40LG', 'ENSG00000169442_CD52', 'ENSG00000117528_ABCD3', 'ENSG00000168014_C2CD3', 'ENSG00000167851_CD300A', 'ENSG00000167850_CD300C', 'ENSG00000186407_CD300E', 'ENSG00000178789_CD300LB', 'ENSG00000186074_CD300LF', 'ENSG00000241399_CD302', 'ENSG00000167775_CD320', 'ENSG00000105383_CD33', 'ENSG00000174059_CD34', 'ENSG00000135218_CD36', 'ENSG00000104894_CD37', 'ENSG00000004468_CD38', 'ENSG00000167286_CD3D', 'ENSG00000198851_CD3E', 'ENSG00000117877_CD3EAP', 'ENSG00000074696_HACD3', 'ENSG00000015676_NUDCD3', 'ENSG00000161714_PLCD3', 'ENSG00000132300_PTCD3', 'ENSG00000082014_SMARCD3', 'ENSG00000121594_CD80', 'ENSG00000110651_CD81', 'ENSG00000238184_CD81-AS1', 'ENSG00000085117_CD82', 'ENSG00000112149_CD83', 'ENSG00000066294_CD84', 'ENSG00000114013_CD86', 'ENSG00000172116_CD8B', 'ENSG00000254126_CD8B2', 'ENSG00000177455_CD19', 'ENSG00000105383_CD33', 'ENSG00000173762_CD7', 'ENSG00000125726_CD70', 'ENSG00000137101_CD72', 'ENSG00000019582_CD74', 'ENSG00000105369_CD79A', 'ENSG00000007312_CD79B', 'ENSG00000090470_PDCD7', 'ENSG00000119688_ABCD4', 'ENSG00000010610_CD4', 'ENSG00000101017_CD40', 'ENSG00000102245_CD40LG', 'ENSG00000026508_CD44', 'ENSG00000117335_CD46', 'ENSG00000196776_CD47', 'ENSG00000117091_CD48', 'ENSG00000188921_HACD4', 'ENSG00000150593_PDCD4', 'ENSG00000203497_PDCD4-AS1', 'ENSG00000115556_PLCD4', 'ENSG00000026508_CD44', 'ENSG00000170458_CD14', 'ENSG00000117281_CD160', 'ENSG00000177575_CD163', 'ENSG00000135535_CD164', 'ENSG00000091972_CD200', 'ENSG00000163606_CD200R1', 'ENSG00000206531_CD200R1L', 'ENSG00000182685_BRICD5', 'ENSG00000111731_C2CD5', 'ENSG00000169442_CD52', 'ENSG00000143119_CD53', 'ENSG00000196352_CD55', 'ENSG00000116815_CD58', 'ENSG00000085063_CD59', 'ENSG00000105185_PDCD5', 'ENSG00000255909_PDCD5P1', 'ENSG00000145284_SCD5', 'ENSG00000167775_CD320', 'ENSG00000110848_CD69', 'ENSG00000139187_KLRG1', 'ENSG00000139193_CD27', 'ENSG00000215039_CD27-AS1', 'ENSG00000120217_CD274', 'ENSG00000103855_CD276', 'ENSG00000204287_HLA-DRA', 'ENSG00000196126_HLA-DRB1', 'ENSG00000198502_HLA-DRB5', 'ENSG00000229391_HLA-DRB6', 'ENSG00000116815_CD58', 'ENSG00000168329_CX3CR1', 'ENSG00000272398_CD24', 'ENSG00000122223_CD244', 'ENSG00000198821_CD247', 'ENSG00000122223_CD244', 'ENSG00000177575_CD163', 'ENSG00000112149_CD83', 'ENSG00000185963_BICD2', 'ENSG00000157617_C2CD2', 'ENSG00000172375_C2CD2L', 'ENSG00000116824_CD2', 'ENSG00000091972_CD200', 'ENSG00000163606_CD200R1', 'ENSG00000206531_CD200R1L', 'ENSG00000012124_CD22', 'ENSG00000150637_CD226', 'ENSG00000272398_CD24', 'ENSG00000122223_CD244', 'ENSG00000198821_CD247', 'ENSG00000139193_CD27', 'ENSG00000215039_CD27-AS1', 'ENSG00000120217_CD274', 'ENSG00000103855_CD276', 'ENSG00000198087_CD2AP', 'ENSG00000169217_CD2BP2', 'ENSG00000144554_FANCD2', 'ENSG00000206527_HACD2', 'ENSG00000170584_NUDCD2', 'ENSG00000071994_PDCD2', 'ENSG00000126249_PDCD2L', 'ENSG00000049883_PTCD2', 'ENSG00000186193_SAPCD2', 'ENSG00000108604_SMARCD2', 'ENSG00000185561_TLCD2', 'ENSG00000075035_WSCD2', 'ENSG00000150637_CD226', 'ENSG00000110651_CD81', 'ENSG00000238184_CD81-AS1', 'ENSG00000134061_CD180', 'ENSG00000004468_CD38', 'ENSG00000012124_CD22', 'ENSG00000150637_CD226', 'ENSG00000135404_CD63', 'ENSG00000135218_CD36', 'ENSG00000137101_CD72', 'ENSG00000125810_CD93', 'ENSG00000010278_CD9', 'ENSG00000125810_CD93', 'ENSG00000153283_CD96', 'ENSG00000002586_CD99', 'ENSG00000102181_CD99L2', 'ENSG00000223773_CD99P1', 'ENSG00000204592_HLA-E', 'ENSG00000085117_CD82', 'ENSG00000134256_CD101']
important_cols = set(important_cols)

In [39]:
x_train = adatas['train_cite_inputs']
x_test = adatas['train_cite_inputs']

In [40]:
genes = gex_de_analysis(x_train.copy(), CITE_TOP_GENES)
selected_genes = set(genes.names).union(important_cols)

  view_to_actual(adata)
  self.stats[group_name, 'logfoldchanges'] = np.log2(
  self.stats[group_name, 'logfoldchanges'] = np.log2(
  self.stats[group_name, 'logfoldchanges'] = np.log2(
  self.stats[group_name, 'logfoldchanges'] = np.log2(
  self.stats[group_name, 'logfoldchanges'] = np.log2(
  self.stats[group_name, 'logfoldchanges'] = np.log2(
  self.stats[group_name, 'logfoldchanges'] = np.log2(
  df = df.append(dedf, ignore_index=True)
  df = df.append(dedf, ignore_index=True)
  df = df.append(dedf, ignore_index=True)
  df = df.append(dedf, ignore_index=True)
  df = df.append(dedf, ignore_index=True)
  df = df.append(dedf, ignore_index=True)
  df = df.append(dedf, ignore_index=True)


In [41]:
subset = selected_genes.intersection(x_train.var_names)
x_train = x_train[:, list(subset)]
x_test = x_test[:, list(subset)]

### CITE stack summary stats

In [43]:
train_total = np.sum(x_train.X.toarray(), axis=1)
test_total = np.sum(x_test.X.toarray(), axis=1)

train_batches = set(x_train.obs.donor)
x_train.obs["batch_median"] = 0
x_train.obs["batch_sd"] = 0
for batch in train_batches:
    x_train.obs["batch_median"][x_train.obs.donor == batch] = np.median(
        train_total[x_train.obs.donor == batch]
    )
    x_train.obs["batch_sd"][x_train.obs.donor == batch] = np.std(
        train_total[x_train.obs.donor == batch]
    )

test_batches = set(x_test.obs.donor)
x_test.obs["batch_median"] = 0
x_test.obs["batch_sd"] = 0

for batch in test_batches:
    x_test.obs["batch_median"][x_test.obs.donor == batch] = np.median(
        test_total[x_test.obs.donor == batch]
    )
    x_test.obs["batch_sd"][x_test.obs.donor == batch] = np.std(
        test_total[x_test.obs.donor == batch]
    )


  x_train.obs["batch_median"] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train.obs["batch_median"][x_train.obs.donor == batch] = np.median(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train.obs["batch_sd"][x_train.obs.donor == batch] = np.std(
  x_test.obs["batch_median"] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test.obs["batch_median"][x_test.obs.donor == batch] = np.median(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https:

### Stack the summary stats on

In [44]:
train_batch_median = x_train.obs["batch_median"]
train_batch_sd = x_train.obs["batch_sd"]
test_batch_median = x_test.obs["batch_median"]
test_batch_sd = x_test.obs["batch_sd"]

In [45]:
x_train = x_train.X.toarray()
x_test = x_test.X.toarray()

In [46]:
for i in range(CITE_STACK):
    x_train = np.column_stack((x_train, train_total))
for i in range(CITE_STACK):
    x_train = np.column_stack((x_train, train_batch_median))
for i in range(CITE_STACK):
    x_train = np.column_stack((x_train, train_batch_sd))

In [47]:
for i in range(CITE_STACK):
    x_test = np.column_stack((x_test, test_total))
for i in range(CITE_STACK):
    x_test = np.column_stack((x_test, test_batch_median))
for i in range(CITE_STACK):
    x_test = np.column_stack((x_test, test_batch_sd))

### Normalize?

In [48]:
x_train = x_train.T
means = np.mean(x_train, axis = 1)
sds = np.std(x_train, axis = 1)
means = means.reshape(len(means), 1)
sds = sds.reshape(len(sds), 1)
info = {"means":means,"sds":sds}

In [49]:
x_train = (x_train - means) / sds
x_train = x_train.T

x_test = x_test.T
x_test = (x_test - info["means"]) / info["sds"]
x_test = x_test.T

### Dump the pickles

In [52]:
cite_train_filename = '/scratch/st-jiaruid-1/yinian/temp/top-genes-cite-train.pkl'
cite_test_filename = '/scratch/st-jiaruid-1/yinian/temp/top-genes-cite-test.pkl'
with open(cite_train_filename, 'wb') as f:
    pickle.dump(x_train, f)
with open(cite_test_filename, 'wb') as f:
    pickle.dump(x_test, f)

### Repeat for Multiome

In [10]:
base_dir = '/arc/project/st-jiaruid-1/yinian/multiome/'
filenames = [
#     'test_cite_inputs', 
    'test_multi_inputs', 
#     'train_cite_inputs',
#     'train_cite_targets',
    'train_multi_inputs',
    'train_multi_targets',
]

In [11]:
adatas = {}
chunk_size = 10000

for filename in filenames:
    print(f'loading {filename}.h5')
    filepath = base_dir + filename + '.h5'
    
    h5_file = h5py.File(filepath)
    h5_data = h5_file[filename]
    
    features = h5_data['axis0'][:]
    cell_ids = h5_data['axis1'][:]
    
    features = features.astype(str)
    cell_ids = cell_ids.astype(str)
    
    technology = metadata_df.loc[cell_ids, 'technology'].unique().item()
    

    sparse_chunks = []
    n_cells = h5_data['block0_values'].shape[0]

    for chunk_indices in np.array_split(np.arange(n_cells), 100):
        chunk = h5_data['block0_values'][chunk_indices]
        sparse_chunk = scipy.sparse.csr_matrix(chunk)
        sparse_chunks.append(sparse_chunk)

    X = scipy.sparse.vstack(sparse_chunks)

    adata = ad.AnnData(
        X=X,
        obs=metadata_df.loc[cell_ids],
        var=pd.DataFrame(index=features),
    )
    
    adatas[filename] = adata

loading test_multi_inputs.h5
loading train_multi_inputs.h5
loading train_multi_targets.h5


In [6]:
def atac_de_analysis(adata, top_genes):
    '''get top DA peaks per cell type'''
    adata.X = binarize(adata.X)
    sc.tl.rank_genes_groups(adata, 'cell_type', method='t-test')
    cell_types = adata.obs.cell_type.value_counts().index
    column_names = ['names', 'scores', 'logfoldchanges', 'pvals', 'pvals_adj', 'cell_type']
    df = pd.DataFrame(columns = column_names)
    for cell_type in cell_types:
        dedf = sc.get.rank_genes_groups_df(adata, group=cell_type)
        dedf['cell_type'] = cell_type
        dedf = dedf.sort_values('scores', ascending=False).iloc[:top_genes]
        df = df.append(dedf, ignore_index=True)
    return df

In [19]:
x_train = adatas['train_multi_inputs']
x_test = adatas['train_multi_inputs']

In [20]:
genes = atac_de_analysis(x_train.copy(), MULTI_TOP_GENES)
selected_genes = set(genes.names)



  df = df.append(dedf, ignore_index=True)
  df = df.append(dedf, ignore_index=True)
  df = df.append(dedf, ignore_index=True)
  df = df.append(dedf, ignore_index=True)
  df = df.append(dedf, ignore_index=True)
  df = df.append(dedf, ignore_index=True)
  df = df.append(dedf, ignore_index=True)


In [21]:
subset = selected_genes.intersection(x_train.var_names)
x_train = x_train[:, list(subset)]
x_test = x_test[:, list(subset)]

In [22]:
train_total = np.sum(x_train.X.toarray(), axis=1)
test_total = np.sum(x_test.X.toarray(), axis=1)

train_batches = set(x_train.obs.donor)
x_train.obs["batch_median"] = 0
x_train.obs["batch_sd"] = 0
for batch in train_batches:
    x_train.obs["batch_median"][x_train.obs.donor == batch] = np.median(
        train_total[x_train.obs.donor == batch]
    )
    x_train.obs["batch_sd"][x_train.obs.donor == batch] = np.std(
        train_total[x_train.obs.donor == batch]
    )

test_batches = set(x_test.obs.donor)
x_test.obs["batch_median"] = 0
x_test.obs["batch_sd"] = 0

for batch in test_batches:
    x_test.obs["batch_median"][x_test.obs.donor == batch] = np.median(
        test_total[x_test.obs.donor == batch]
    )
    x_test.obs["batch_sd"][x_test.obs.donor == batch] = np.std(
        test_total[x_test.obs.donor == batch]
    )


  x_train.obs["batch_median"] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train.obs["batch_median"][x_train.obs.donor == batch] = np.median(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train.obs["batch_sd"][x_train.obs.donor == batch] = np.std(
  x_test.obs["batch_median"] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test.obs["batch_median"][x_test.obs.donor == batch] = np.median(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https:

In [23]:
train_batch_median = x_train.obs["batch_median"]
train_batch_sd = x_train.obs["batch_sd"]
test_batch_median = x_test.obs["batch_median"]
test_batch_sd = x_test.obs["batch_sd"]

In [24]:
x_train = x_train.X.toarray()
x_test = x_test.X.toarray()

In [25]:
for i in range(MULTI_STACK):
    x_train = np.column_stack((x_train, train_total))
for i in range(MULTI_STACK):
    x_train = np.column_stack((x_train, train_batch_median))
for i in range(MULTI_STACK):
    x_train = np.column_stack((x_train, train_batch_sd))

for i in range(MULTI_STACK):
    x_test = np.column_stack((x_test, test_total))
for i in range(MULTI_STACK):
    x_test = np.column_stack((x_test, test_batch_median))
for i in range(MULTI_STACK):
    x_test = np.column_stack((x_test, test_batch_sd))

In [26]:
x_train = x_train.T
means = np.mean(x_train, axis = 1)
sds = np.std(x_train, axis = 1)
means = means.reshape(len(means), 1)
sds = sds.reshape(len(sds), 1)
info = {"means":means,"sds":sds}

In [27]:
x_train = (x_train - means) / sds
x_train = x_train.T

x_test = x_test.T
x_test = (x_test - info["means"]) / info["sds"]
x_test = x_test.T

In [28]:
cite_train_filename = '/scratch/st-jiaruid-1/yinian/temp/top-genes-multi-train.pkl'
cite_test_filename = '/scratch/st-jiaruid-1/yinian/temp/top-genes-multi-test.pkl'
with open(cite_train_filename, 'wb') as f:
    pickle.dump(x_train, f)
with open(cite_test_filename, 'wb') as f:
    pickle.dump(x_test, f)