# Process Velmeshev et al.

## Set up Env

In [2]:
import os
import pandas as pd
import numpy as np
import scanpy as sc
from scipy.sparse import csr_matrix

In [3]:
import liana as li

In [4]:
from prep_utils import filter_samples, filter_celltypes

In [5]:
# Dataset identifier; used below to build the input/output .h5ad file names.
dataset = 'velmeshev'

# Column names (keys) used throughout the notebook.
groupby, sample_key = 'cluster', 'sample'
condition_key, batch_key = 'diagnosis', 'sex'

# Sample-level thresholds handed to `filter_samples`.
min_cells_per_sample = 700
sample_zcounts_min, sample_zcounts_max = -2, 3

# Cell-type-level thresholds handed to `filter_celltypes`.
min_samples = 5  # min number of samples that pass the threshold per cell type
min_cells = 10  # min number of cells per cell type

## Preprocess

### Load data

In [None]:
# Read data/<dataset>.h5ad into memory; the trailing bare expression makes
# the notebook display the AnnData summary.
adata = sc.read_h5ad(
    os.path.join('data', f"{dataset}.h5ad")
)
adata

In [None]:
# Number of distinct values per column among the unique
# (sample, individual, diagnosis) combinations.
(
    adata.obs[['sample', 'individual', 'diagnosis']]
    .drop_duplicates()
    .nunique()
)

In [None]:
# Subsample: continue with a copy holding 10% of the observations.
adata = sc.pp.subsample(
    adata,
    fraction=0.1,
    copy=True,
)

### Convert to Gene Symbols

In [None]:
def map_gene_symbols(adata, map_df):
    """Maps gene symbols from aliases to standard symbols

    Variables whose name already occurs in ``map_df["gene"]`` keep their
    name, variables whose name occurs in ``map_df["alias"]`` are renamed to
    the mapped gene, and all remaining variables keep their original name.

    Genes that map many-to-one are summed.
    Genes that map one-to-many are duplicated.

    Parameters
    ----------
    adata : anndata.AnnData
    map_df :
        dataframe containing gene symbol map with two columns, `gene` and `alias`

    Returns
    -------
    adata : anndata.AnnData
        A new object whose ``X`` and layers are CSR matrices: first the
        columns of uniquely named genes, then one summed column per
        many-to-one gene. ``obs``, ``uns`` and ``obsm`` are passed through
        from the input; ``var`` only carries the new names as its index.
    """

    import collections

    import anndata
    import scipy.sparse

    def _summed_column(matrix):
        # Row-wise sum as an (n_obs, 1) sparse column. The reshape evens out
        # the different shapes returned by sparse matrices ((n, 1)) and by
        # dense arrays / sparse arrays ((n,)), which would otherwise be
        # interpreted as a single row and break the hstack below.
        return scipy.sparse.coo_matrix(
            np.asarray(matrix.sum(axis=1)).reshape(-1, 1)
        )

    # Column-less frame that only carries the current variable names.
    var = adata.var.rename_axis("alias", axis=0)[[]]

    # Names that already are standard symbols.
    gene_match_idx = np.isin(var.index, map_df["gene"])
    var_gene_match, var = var.loc[gene_match_idx].copy(), var.loc[~gene_match_idx]

    # Of the remainder: names that are known aliases vs. names without a map.
    alias_match_idx = np.isin(var.index, map_df["alias"])
    var_alias_match, var_no_map = (
        var.loc[alias_match_idx].copy(),
        var.loc[~alias_match_idx].copy(),
    )

    # fill 'gene' column
    # (the left merge yields one row per (alias, gene) pair, which is what
    # duplicates one-to-many aliases)
    var_alias_match = var_alias_match.reset_index().merge(
        map_df, on="alias", how="left"
    )
    var_gene_match["gene"] = var_gene_match.index
    var_no_map["gene"] = var_no_map.index

    # ignore_index gives unique row labels so the boolean masks below never
    # have to align against duplicated labels.
    var_dealiased = pd.concat(
        [var_gene_match.reset_index(), var_no_map.reset_index(), var_alias_match],
        ignore_index=True,
    )
    duplicate_idx = var_dealiased["gene"].duplicated(keep=False)
    var_dealiased_many_to_one, var_dealiased_one_to_any = (
        var_dealiased.loc[duplicate_idx],
        var_dealiased.loc[~duplicate_idx],
    )

    # Only X and the layers of this selection are read, so no renaming (and
    # hence no copy) of the subset is needed.
    adata_one_to_any = adata[:, var_dealiased_one_to_any["alias"]]
    one_to_any_genes = var_dealiased_one_to_any["gene"].to_numpy()

    many_to_one_genes = var_dealiased_many_to_one["gene"].unique()
    many_to_one_X = []
    many_to_one_layers = collections.defaultdict(list)
    for gene in many_to_one_genes:
        gene_aliases = var_dealiased_many_to_one.loc[
            var_dealiased_many_to_one["gene"] == gene, "alias"
        ]
        adata_gene = adata[:, gene_aliases]
        many_to_one_X.append(_summed_column(adata_gene.X))
        for layer_name, layer in adata_gene.layers.items():
            # Sum the layer itself (previously X was summed here, which
            # overwrote every layer's merged columns with X's values).
            many_to_one_layers[layer_name].append(_summed_column(layer))

    return anndata.AnnData(
        X=scipy.sparse.hstack([adata_one_to_any.X] + many_to_one_X).tocsr(),
        obs=adata.obs,
        var=pd.DataFrame(
            index=np.concatenate([one_to_any_genes, many_to_one_genes])
        ),
        layers={
            layer_name: scipy.sparse.hstack(
                [adata_one_to_any.layers[layer_name]] + many_to_one_layers[layer_name]
            ).tocsr()
            for layer_name in adata.layers
        },
        uns=adata.uns,
        obsm=adata.obsm,
    )

In [None]:
def map_gene_symbols(adata, map_df):
    """Maps gene symbols from aliases to standard symbols

    Variables whose name already occurs in ``map_df["gene"]`` keep their
    name, variables whose name occurs in ``map_df["alias"]`` are renamed to
    the mapped gene, and all remaining variables keep their original name.

    Genes that map many-to-one are summed.
    Genes that map one-to-many are duplicated.

    Parameters
    ----------
    adata : anndata.AnnData
    map_df :
        dataframe containing gene symbol map with two columns, `gene` and `alias`

    Returns
    -------
    adata : anndata.AnnData
        A new object whose ``X`` and layers are CSR matrices: first the
        columns of uniquely named genes, then one summed column per
        many-to-one gene. ``obs``, ``uns`` and ``obsm`` are passed through
        from the input; ``var`` only carries the new names as its index.
    """

    import collections

    import anndata
    import scipy.sparse

    def _summed_column(matrix):
        # Row-wise sum as an (n_obs, 1) sparse column. The reshape evens out
        # the different shapes returned by sparse matrices ((n, 1)) and by
        # dense arrays / sparse arrays ((n,)), which would otherwise be
        # interpreted as a single row and break the hstack below.
        return scipy.sparse.coo_matrix(
            np.asarray(matrix.sum(axis=1)).reshape(-1, 1)
        )

    # Column-less frame that only carries the current variable names.
    var = adata.var.rename_axis("alias", axis=0)[[]]

    # Names that already are standard symbols.
    gene_match_idx = np.isin(var.index, map_df["gene"])
    var_gene_match, var = var.loc[gene_match_idx].copy(), var.loc[~gene_match_idx]

    # Of the remainder: names that are known aliases vs. names without a map.
    alias_match_idx = np.isin(var.index, map_df["alias"])
    var_alias_match, var_no_map = (
        var.loc[alias_match_idx].copy(),
        var.loc[~alias_match_idx].copy(),
    )

    # fill 'gene' column
    # (the left merge yields one row per (alias, gene) pair, which is what
    # duplicates one-to-many aliases)
    var_alias_match = var_alias_match.reset_index().merge(
        map_df, on="alias", how="left"
    )
    var_gene_match["gene"] = var_gene_match.index
    var_no_map["gene"] = var_no_map.index

    # ignore_index gives unique row labels so the boolean masks below never
    # have to align against duplicated labels.
    var_dealiased = pd.concat(
        [var_gene_match.reset_index(), var_no_map.reset_index(), var_alias_match],
        ignore_index=True,
    )
    duplicate_idx = var_dealiased["gene"].duplicated(keep=False)
    var_dealiased_many_to_one, var_dealiased_one_to_any = (
        var_dealiased.loc[duplicate_idx],
        var_dealiased.loc[~duplicate_idx],
    )

    # Only X and the layers of this selection are read, so no renaming (and
    # hence no copy) of the subset is needed.
    adata_one_to_any = adata[:, var_dealiased_one_to_any["alias"]]
    one_to_any_genes = var_dealiased_one_to_any["gene"].to_numpy()

    many_to_one_genes = var_dealiased_many_to_one["gene"].unique()
    many_to_one_X = []
    many_to_one_layers = collections.defaultdict(list)
    for gene in many_to_one_genes:
        gene_aliases = var_dealiased_many_to_one.loc[
            var_dealiased_many_to_one["gene"] == gene, "alias"
        ]
        adata_gene = adata[:, gene_aliases]
        many_to_one_X.append(_summed_column(adata_gene.X))
        for layer_name, layer in adata_gene.layers.items():
            # Sum the layer itself (previously X was summed here, which
            # overwrote every layer's merged columns with X's values).
            many_to_one_layers[layer_name].append(_summed_column(layer))

    return anndata.AnnData(
        X=scipy.sparse.hstack([adata_one_to_any.X] + many_to_one_X).tocsr(),
        obs=adata.obs,
        var=pd.DataFrame(
            index=np.concatenate([one_to_any_genes, many_to_one_genes])
        ),
        layers={
            layer_name: scipy.sparse.hstack(
                [adata_one_to_any.layers[layer_name]] + many_to_one_layers[layer_name]
            ).tocsr()
            for layer_name in adata.layers
        },
        uns=adata.uns,
        obsm=adata.obsm,
    )

In [None]:
# Split each variable name on '|' into two fields, labelled 'ensembl' and
# 'genesymbol' (presumably names look like "<ensembl>|<genesymbol>" -- verify
# against the input file).
df = (
    adata.var
    .reset_index()['index']
    .str.split('\\|', expand=True)
    .rename(columns={0: 'ensembl', 1: 'genesymbol'})
)
# Re-index the variables by the first field only.
adata.var = df.set_index('ensembl')
# Alias -> gene table in the column layout `map_gene_symbols` expects.
map_df = df.rename(columns={'genesymbol': 'gene', 'ensembl': 'alias'})
map_df

In [None]:
# Rename variables to gene symbols (many-to-one genes are summed).
adata = map_gene_symbols(
    adata=adata,
    map_df=map_df,
)

In [None]:
# Display the AnnData summary after the gene-symbol mapping.
adata

### Filter samples, cell types, and genes

In [None]:
# Sample-level filtering with the thresholds configured at the top of the
# notebook (`filter_samples` comes from prep_utils).
adata = filter_samples(
    adata,
    sample_key,
    condition_key,
    min_cells_per_sample,
    sample_zcounts_max,
    sample_zcounts_min,
)

In [None]:
# Cell-type-level filtering (`filter_celltypes` comes from prep_utils).
adata = filter_celltypes(
    adata=adata,
    groupby=groupby,
    sample_key=sample_key,
    min_cells=min_cells,
    min_samples=min_samples,
)

In [None]:
# Remove genes expressed in fewer than 30 cells
# (normalization happens in the next section).
sc.pp.filter_genes(
    adata,
    min_cells=30,
)

### Normalize

In [None]:
# Scale every cell to a total of 1e4 counts ...
sc.pp.normalize_total(
    adata,
    target_sum=1e4,
)
# ... then log(1 + x)-transform the scaled values in place.
sc.pp.log1p(adata)

## Run LIANA

In [None]:
# Run LIANA's rank_aggregate separately for every sample, reading from
# adata.X (use_raw=False) and without permutations (n_perms=None).
li.mt.rank_aggregate.by_sample(
    adata,
    groupby=groupby,
    use_raw=False,
    sample_key=sample_key,
    verbose=True,
    n_perms=None,
)

### Add Metadata & Write

In [None]:
# Record which columns hold the condition, batch and sample labels so the
# written file carries them along in `uns`.
adata.uns['condition_key'] = condition_key
adata.uns['batch_key'] = batch_key
adata.uns['sample_key'] = sample_key

In [None]:
# Sanity check: all three metadata keys must be present in `uns`.
assert all(
    key in adata.uns_keys()
    for key in ('sample_key', 'batch_key', 'condition_key')
)

In [None]:
# Persist the processed object as data/<dataset>_processed.h5ad.
adata.write_h5ad(
    os.path.join('data', f"{dataset}_processed.h5ad")
)

## Classify

In [None]:
from classify_utils import classifier_pipe

In [None]:
# Re-open the processed file in read-only backed mode.
adata = sc.read_h5ad(
    os.path.join('data', f"{dataset}_processed.h5ad"),
    backed='r',
)

In [None]:
# Run the classification pipeline from classify_utils on this dataset.
# NOTE(review): presumably stores its results in adata.uns['auc'], which the
# next cell reads -- confirm against classify_utils.
classifier_pipe(adata, dataset)

In [None]:
# Display the 'auc' entry of `uns`.
adata.uns['auc']