# Process Reichart et al.

Rather rich metadata

## Set up Env

In [1]:
import os
import pandas as pd
import numpy as np
import scanpy as sc
from scipy.sparse import csr_matrix

In [2]:
import liana as li

  dot = np.dot(x * w, y)


In [3]:
from prep_utils import filter_samples, filter_celltypes, map_gene_symbols

In [11]:
dataset = 'reichart'
groupby = 'celltype'
sample_key = 'Sample'
condition_key = 'disease'
batch_key = 'Sample_Source'

min_cells_per_sample = 700
sample_zcounts_max = 3
sample_zcounts_min = -2

# set filtering parameters
min_cells = 20 # min number of cells per cell type
min_samples = 5 # min number of samples that pass the threshold per cell type

## Preprocess

### Load data

In [19]:
adata = sc.read_h5ad(os.path.join('data', f"{dataset}.h5ad"), backed='r')
adata

AnnData object with n_obs × n_vars = 881081 × 33234 backed at 'data/reichart.h5ad'
    obs: 'Sample', 'donor_id', 'Region_x', 'Primary.Genetic.Diagnosis', 'n_genes', 'n_counts', 'percent_mito', 'percent_ribo', 'scrublet_score_z', 'scrublet_score_log', 'solo_score', 'cell_states', 'Assigned', 'self_reported_ethnicity_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'sex_ontology_term_id', 'assay_ontology_term_id', 'organism_ontology_term_id', 'is_primary_data', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    uns: 'cell_states_colors', 'cell_type_colors', 'cell_type_ontology_term_id_colors', 'leiden', 'neighbors', 'pca', 'schema_version', 'title', 'umap'
    obsm: 'X_pca', 'X_umap'

In [20]:
adata.obs[['Sample', 'donor_id', 'Region_x', 'disease', 'sex', 'self_reported_ethnicity', 'development_stage']].nunique()

Sample                     196
donor_id                    79
Region_x                     2
disease                      4
sex                          2
self_reported_ethnicity      2
development_stage           10
dtype: int64

In [14]:
adata.obs[['Sample', 'donor_id', 'Region_x', 'disease', 'sex', 'self_reported_ethnicity', 'development_stage']].drop_duplicates()

Unnamed: 0,Sample,donor_id,Region_x,disease,sex,self_reported_ethnicity,development_stage
0,BS_DP2_RV0_premrna,DP2,RV,dilated cardiomyopathy,female,European,fifth decade human stage
2428,BS_H25_S00_premrna,H3,LV,normal,male,Asian,sixth decade human stage
7466,ED_DT4_LV0_premrna,DT4,LV,dilated cardiomyopathy,male,European,seventh decade human stage
19122,BS_H15_RV0_premrna,H5,RV,normal,female,European,sixth decade human stage
26042,BS_DP2_S00_premrna,DP2,LV,dilated cardiomyopathy,female,European,fifth decade human stage
...,...,...,...,...,...,...,...
868756,IC_H01_LV0_premrna,IC_H01,LV,dilated cardiomyopathy,male,European,fifth decade human stage
871234,IC_H02_LV0_premrna,IC_H02,LV,dilated cardiomyopathy,male,European,adolescent stage
877014,IC_H03_LV0_premrna,IC_H03,LV,dilated cardiomyopathy,male,European,fourth decade human stage
878080,IC_H04_LV0_premrna,IC_H04,LV,dilated cardiomyopathy,female,European,seventh decade human stage


In [25]:
adata.obs[['Sample', 'disease']].drop_duplicates().groupby('disease').count()

Unnamed: 0_level_0,Sample
disease,Unnamed: 1_level_1
non-compaction cardiomyopathy,3
dilated cardiomyopathy,107
arrhythmogenic right ventricular cardiomyopathy,22
normal,64


#### Filter samples acc to QC

In [None]:
adata = filter_samples(adata, sample_key, condition_key, min_cells_per_sample, sample_zcounts_max, sample_zcounts_min)

### Randomly pick samples from the condition

In [17]:
adata.obs[[condition_key, sample_key]].drop_duplicates().groupby(adata.obs[condition_key]).count()

Unnamed: 0_level_0,disease,Sample
disease,Unnamed: 1_level_1,Unnamed: 2_level_1
non-compaction cardiomyopathy,3,3
dilated cardiomyopathy,107,107
arrhythmogenic right ventricular cardiomyopathy,22,22
normal,64,64


In [18]:
# randomly select samples equal to the number of samples in the smallest group
min_group_n = adata.obs[[condition_key, sample_key]].drop_duplicates().groupby(adata.obs[condition_key]).count().min()[0]
# pick ILD samples equal to min_group_n
ild_samples = adata.obs[[condition_key, sample_key]].drop_duplicates().query(f"{condition_key} == 'ILD'")[sample_key].sample(min_group_n, random_state=1337).values
ild_samples

ValueError: a must be greater than 0 unless no samples are taken

In [None]:
ctrl_samples = adata.obs[[condition_key, sample_key]].drop_duplicates().query(f"{condition_key} == 'Control'")[sample_key].sample(min_group_n, random_state=1337).values
ctrl_samples

#### Filter Samples & Read to Memory

In [None]:
adata = adata.to_memory()[adata.obs[sample_key].isin(np.union1d(ild_samples, ctrl_samples))]

In [None]:
adata.obs[[condition_key, sample_key]].drop_duplicates().groupby(adata.obs[condition_key]).count()

##### Filter cell types, and genes

In [None]:
adata = filter_celltypes(adata=adata, groupby=groupby, sample_key=sample_key, min_cells=min_cells, min_samples=min_samples)

In [None]:
# Remove genes expressed in few cells
sc.pp.filter_genes(adata, min_cells=30)

### Normalize

In [None]:
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

## Run LIANA

In [None]:
li.mt.rank_aggregate.by_sample(adata, groupby=groupby, use_raw=False, sample_key=sample_key, verbose=True, n_perms=None)

In [None]:
dataset

### Add Metadata & Write

In [None]:
adata.uns['sample_key'] = sample_key
adata.uns['batch_key'] = batch_key
adata.uns['condition_key'] = condition_key

In [None]:
assert np.isin(['sample_key', 'batch_key', 'condition_key'], adata.uns_keys()).all()

In [None]:
adata.write_h5ad(os.path.join('data', 'interim', f"{dataset}_processed.h5ad"))

## Classify

In [None]:
import scanpy as sc
from classify_utils import classifier_pipe



In [None]:
import torch
torch.cuda.is_available()

True

In [None]:
adata = sc.read_h5ad(os.path.join('data', 'interim', f"{dataset}_processed.h5ad"), backed='r')

In [None]:
classifier_pipe(adata, dataset, use_gpu=False) # Not enough memory on local

Creating views with: lr_means


100%|██████████| 636/636 [01:15<00:00,  8.48it/s]



        #########################################################
        ###           __  __  ____  ______                    ### 
        ###          |  \/  |/ __ \|  ____/\    _             ### 
        ###          | \  / | |  | | |__ /  \ _| |_           ### 
        ###          | |\/| | |  | |  __/ /\ \_   _|          ###
        ###          | |  | | |__| | | / ____ \|_|            ###
        ###          |_|  |_|\____/|_|/_/    \_\              ###
        ###                                                   ### 
        ######################################################### 
       
 
        
Loaded view='Lymphatic Endothelial Cells&cDCs' group='group1' with N=20 samples and D=230 features...
Loaded view='Lymphatic Endothelial Cells&Macrophages' group='group1' with N=20 samples and D=270 features...
Loaded view='Lymphatic Endothelial Cells&Mast Cells' group='group1' with N=20 samples and D=96 features...
Loaded view='NK Cells&NK Cells' group='group1' with N=20 sample

100%|██████████| 20/20 [06:01<00:00, 18.08s/it]


Running Tensor Factorization
Creating views with: expr_prod


100%|██████████| 636/636 [01:11<00:00,  8.86it/s]


In [None]:
adata.uns['auc']