# Process Velmeshev et al.

## Set up Env

In [2]:
import os
import pandas as pd
import numpy as np
import scanpy as sc
from scipy.sparse import csr_matrix

In [3]:
import liana as li

In [4]:
from prep_utils import filter_samples, filter_celltypes, map_gene_symbols

In [5]:
# Dataset identifier; used to build the input/output file names under data/.
dataset = 'velmeshev'
# obs column with the cell-type labels (passed as `groupby` to the cell-type filter and to LIANA).
groupby = 'cluster'
# obs column identifying each sample.
sample_key = 'sample'
# obs column holding the condition label of each sample.
condition_key = 'diagnosis'
# obs column recorded as the batch covariate in adata.uns before writing.
batch_key = 'sex'

# sample filtering parameters (passed positionally to filter_samples)
min_cells_per_sample = 700
# presumably upper/lower z-score bounds on per-sample counts — TODO confirm in prep_utils
sample_zcounts_max = 3
sample_zcounts_min = -2

# cell-type filtering parameters (passed to filter_celltypes)
min_cells = 10 # min number of cells per cell type
min_samples = 5 # min number of samples that pass the threshold per cell type

## Preprocess

### Load data

In [25]:
# Read the raw dataset from data/<dataset>.h5ad and display its summary.
raw_path = os.path.join('data', f"{dataset}.h5ad")
adata = sc.read_h5ad(raw_path)
adata

AnnData object with n_obs × n_vars = 104559 × 36501
    obs: 'cluster', 'sample', 'individual', 'region', 'age', 'sex', 'diagnosis', 'Capbatch', 'Seqbatch', 'post-mortem interval (hours)', 'RNA Integrity Number', 'genes', 'UMIs', 'RNA mitochondr. percent', 'RNA ribosomal percent'

In [26]:
# Count the distinct samples, individuals and conditions in the loaded data.
# Uses the configured key names so this check stays in sync with the rest of the notebook.
adata.obs[[sample_key, 'individual', condition_key]].drop_duplicates().nunique()

sample        41
individual    31
diagnosis      2
dtype: int64

### Convert to Genesymbols

In [28]:
# Variable names arrive as "<Ensembl ID>|<gene symbol>"; split them into two columns.
# Reading from `adata.var_names` (instead of `reset_index()['index']`) does not depend
# on the var index being unnamed.
var_labels = pd.Series(adata.var_names.to_numpy())
df = var_labels.str.split('\\|', expand=True).rename(columns={0:'ensembl', 1:'genesymbol'})
# Fail loudly if the names did not split into exactly an ID and a symbol.
assert list(df.columns) == ['ensembl', 'genesymbol'], "expected var names of the form '<ensembl>|<symbol>'"
# Index the variables by Ensembl ID, keeping the symbol as a var column.
adata.var = df.set_index('ensembl')
# Mapping table handed to map_gene_symbols; presumably it expects 'alias'/'gene' columns — TODO confirm.
map_df = df.rename(columns={'ensembl':'alias', 'genesymbol':'gene'})
map_df

Unnamed: 0,alias,gene
0,ENSG00000227232,WASH7P
1,ENSG00000243485,RP11-34P13.3
2,ENSG00000238009,RP11-34P13.7
3,ENSG00000233750,CICP27
4,ENSG00000268903,RP11-34P13.15
...,...,...
36496,ENSG00000198786,MT-ND5
36497,ENSG00000198695,MT-ND6
36498,ENSG00000198727,MT-CYB
36499,ENSG00000210195,MT-TT


In [29]:
# Apply the alias -> gene mapping built above via the prep_utils helper.
# The summaries shown before and after this step report n_vars going from 36501 to 36254.
adata = map_gene_symbols(adata, map_df)

In [30]:
# Display the object summary after gene-symbol mapping.
adata

AnnData object with n_obs × n_vars = 104559 × 36254
    obs: 'cluster', 'sample', 'individual', 'region', 'age', 'sex', 'diagnosis', 'Capbatch', 'Seqbatch', 'post-mortem interval (hours)', 'RNA Integrity Number', 'genes', 'UMIs', 'RNA mitochondr. percent', 'RNA ribosomal percent'

### Filter samples, cell types, and genes

In [31]:
# Filter samples with the thresholds from the configuration cell (arguments are positional).
# The summary printed below lists 40 samples (22 ASD, 18 Control), versus 41 in the loaded data.
adata = filter_samples(adata, sample_key, condition_key, min_cells_per_sample, sample_zcounts_max, sample_zcounts_min)

           sample
diagnosis        
ASD            22
Control        18


In [32]:
# Filter cell types using the per-cell-type thresholds from the configuration cell.
adata = filter_celltypes(
    adata=adata,
    groupby=groupby,
    sample_key=sample_key,
    min_cells=min_cells,
    min_samples=min_samples,
)

In [33]:
# Remove genes detected in fewer than 30 cells (normalization happens in the next section)
sc.pp.filter_genes(adata, min_cells=30)



### Normalize

In [34]:
# Scale every cell to a total of 1e4 counts, then apply log1p to the scaled values.
# Both calls are used for their effect on `adata`; their return values are not kept.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

## Run LIANA

In [36]:
# Run LIANA's rank_aggregate once per sample, grouping cells by the cell-type column.
# n_perms=None presumably disables permutation testing — confirm against the LIANA docs.
li.mt.rank_aggregate.by_sample(
    adata,
    groupby=groupby,
    sample_key=sample_key,
    use_raw=False,
    n_perms=None,
    verbose=True,
)

Now running: 6033_BA24: 100%|██████████| 40/40 [15:34<00:00, 23.36s/it]


In [37]:
# Echo the dataset identifier that the output file name below is built from.
dataset

'velmeshev'

### Add Metadata & Write

In [38]:
# Record which obs columns hold the sample, batch and condition labels so that
# they are saved alongside the data in adata.uns.
for uns_name, uns_value in (
    ('sample_key', sample_key),
    ('batch_key', batch_key),
    ('condition_key', condition_key),
):
    adata.uns[uns_name] = uns_value

In [39]:
# Sanity check: all three metadata entries must be present in adata.uns before writing.
required_uns = ('sample_key', 'batch_key', 'condition_key')
assert all(name in adata.uns_keys() for name in required_uns)

In [40]:
# Save the processed object to data/<dataset>_processed.h5ad.
processed_file = os.path.join('data', f"{dataset}_processed.h5ad")
adata.write_h5ad(processed_file)

## Classify

In [1]:
import scanpy as sc
from classify_utils import classifier_pipe

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Re-open the processed file written above in backed read-only mode.
processed_path = os.path.join('data', f"{dataset}_processed.h5ad")
adata = sc.read_h5ad(processed_path, backed='r')

In [8]:
# Run the classification pipeline from classify_utils on the processed object.
# Per the log below it builds views from 'lr_means' and loads them into MOFA+;
# the saved output ends at 8/40 of the final progress bar.
classifier_pipe(adata, dataset)

Creating views with: lr_means


100%|██████████| 289/289 [00:59<00:00,  4.84it/s]



        #########################################################
        ###           __  __  ____  ______                    ### 
        ###          |  \/  |/ __ \|  ____/\    _             ### 
        ###          | \  / | |  | | |__ /  \ _| |_           ### 
        ###          | |\/| | |  | |  __/ /\ \_   _|          ###
        ###          | |  | | |__| | | / ____ \|_|            ###
        ###          |_|  |_|\____/|_|/_/    \_\              ###
        ###                                                   ### 
        ######################################################### 
       
 
        
Loaded view='Oligodendrocytes&Oligodendrocytes' group='group1' with N=40 samples and D=90 features...
Loaded view='Oligodendrocytes&Neu-mat' group='group1' with N=40 samples and D=118 features...
Loaded view='OPC&OPC' group='group1' with N=40 samples and D=228 features...
Loaded view='Neu-NRGN-II&Neu-NRGN-II' group='group1' with N=40 samples and D=89 features...
Loaded view='AST

 20%|██        | 8/40 [01:29<05:51, 11.00s/it]

In [None]:
# Look up the 'auc' entry in adata.uns — presumably written by classifier_pipe (TODO confirm).
# This cell has no execution count, i.e. it was not run in the saved notebook.
adata.uns['auc']