In [1]:
import subprocess
import os
import sys
import matplotlib.backends.backend_pdf
import scanpy as sc
import matplotlib.pyplot as pl
import anndata as ad
import pandas as pd
import numpy as np
import seaborn as sns
import scvelo as scv
scv.settings.verbosity=1

from pathlib import Path

# Jupyter stuff
from tqdm.auto import tqdm
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import clear_output, display, HTML
# Inject CSS that widens elements with class "container" to 95% of the page width.
display(HTML("<style>.container { width:95% !important; }</style>"))

%matplotlib inline

# Custom functions
sys.path.insert(1, '../..')
%load_ext autoreload
%autoreload 2
from utils import *

# scperturb package
sys.path.insert(1, '../../package/src/')
from scperturb import *

from pathlib import Path
figure_path = Path('../../figures/')

In [2]:
# Data locations. The defaults are the original cluster paths; set the environment
# variables SCPERTURB_DATADIR / SCPERTURB_TEMPDIR to run the notebook on another
# machine without editing this cell.
DATADIR = Path(os.environ.get('SCPERTURB_DATADIR', '/data/gpfs-1/users/peidlis_c/work/data/perturbation_resource_paper'))
TEMPDIR = Path(os.environ.get('SCPERTURB_TEMPDIR', "/fast/scratch/users/peidlis_c/perturbation_resource_paper/"))

In [3]:
from joblib import Parallel, delayed
def onesided_pca_distances_(adata, obs_key, control, obsm_key='X_pca',
                           dist='sqeuclidean', correction_factor=False, n_jobs=1,
                           verbose=True):
    """Average of pairwise PCA distances between cells of each group in obs_key with control group.
    For each group defined in adata.obs[obs_key] (e.g. perturbations)
    computes all pairwise distances between cells in adata.obsm[obsm_key] (e.g. PCA space)
    and averages them per group-control-pair. This results in a distance vector with a value for each group.

    Arguments
    ---------
    adata: :class:`~anndata.AnnData`
        Annotated data matrix.
    obs_key: `str` in adata.obs.keys()
        Key in adata.obs specifying the groups to consider.
    control: `str` of a category in adata.obs[obs_key]
        Group in obs_key for control cells.
    obsm_key: `str` in adata.obsm (default: `adata.obsm['X_pca']`)
        Key for embedding coordinates to use.
    dist: `str` for any distance in scipy.spatial.distance (default: `sqeuclidean`)
        Distance metric to use in embedding space.
    correction_factor: `bool` (default: `False`)
        Whether make the estimator for sigma more unbiased (dividing by N-1 instead of N, similar to sample and population variance).
        Only affects the control-vs-control entry, which is the only within-group distance computed here.
    n_jobs: `int` (default: `1`)
        Number of jobs passed to :class:`joblib.Parallel`; one task is submitted per group.
    verbose: `bool` (default: `True`)
        Whether to show a progress bar iterating over all groups.

    Returns
    -------
    pwd: pandas.DataFrame
        DataFrame with average PCA distances to control for all groups.
        Indexed by group (index name `obs_key`) with a single float column `distance`.

    Raises
    ------
    AssertionError
        If `control` is not among the groups found in adata.obs[obs_key].
    """
    # Local import so the function does not depend on `warn` leaking in via a star import.
    from warnings import warn

    if obsm_key == 'X_pca' and 'X_pca' not in adata.obsm.keys():
        warn('PCA embedding not found, computing...')
        sc.pp.pca(adata)

    labels = adata.obs[obs_key]
    groups = pd.unique(labels)
    assert control in groups, f'No cells of control group "{control}" were found in groups defined by "{obs_key}".'
    fct = tqdm if verbose else lambda x: x

    # Index the embedding directly with a boolean row mask instead of slicing the
    # whole AnnData object once per group; the selected rows are the same.
    embedding = adata.obsm[obsm_key]
    x1 = embedding[(labels == control).to_numpy()]
    N = x1.shape[0]

    def one_step(group, x2):
        """Return (group, mean distance between the control cells and the cells in x2)."""
        pwd = pairwise_distances(x1, x2, metric=dist)
        M = x2.shape[0]
        # Only the control-vs-control comparison is a within-group distance, so only
        # there (and only on request) normalise by N * (N - 1). Every other group is a
        # plain mean over the N * M cell pairs.
        if correction_factor and group == control:
            M -= 1
        factor = N * M  # Thanks to Garrett Wong for finding this bug
        mean_pwd = np.sum(pwd) / factor
        return group, mean_pwd

    res = Parallel(n_jobs=n_jobs)(
        delayed(one_step)(group, embedding[(labels == group).to_numpy()])
        for group in fct(groups)
    )

    # Build the label index separately so that `dtype=float` applies to the distance
    # column only and never attempts to cast the group labels.
    df = pd.DataFrame(
        {'distance': [mean_pwd for _, mean_pwd in res]},
        index=pd.Index([group for group, _ in res], name=obs_key),
        dtype=float,
    )
    df.name = f'PCA distances to {control}'
    return df

In [4]:
from scperturb import edist, onesided_pca_distances, etest, self_pca_distances

# Dataset selection: exactly one assignment is live. The alternative is kept as a
# comment instead of a dead assignment that is immediately overwritten.
# dataset = 'PapalexiSatija2021_eccite_RNA'
dataset = 'ReplogleWeissman2022_K562_gwps'

# Read the file tmp_data_<dataset>.h5 from TEMPDIR.
adata = sc.read(TEMPDIR / f'tmp_data_{dataset}.h5')

In [28]:
# Run the packaged scperturb implementation (imported from `scperturb` in the previous cell).
res_0 = onesided_pca_distances(adata, 'perturbation', 'control')

100%|██████████| 4097/4097 [00:27<00:00, 149.34it/s]


In [29]:
# Run the notebook's joblib-based variant (trailing underscore) with a single job.
res_1 = onesided_pca_distances_(adata, 'perturbation', 'control', n_jobs=1)

100%|██████████| 4097/4097 [00:26<00:00, 151.77it/s]


In [5]:
# Same call as above, but with 4 joblib jobs.
res_4 = onesided_pca_distances_(adata, 'perturbation', 'control', n_jobs=4)

100%|██████████| 4097/4097 [00:39<00:00, 105.02it/s]


In [6]:
from tqdm.auto import tqdm
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import pairwise_distances

In [None]:
# Serial permutation loop: each of the `runs` iterations builds one DataFrame of
# E-distances computed under shuffled group/control labels and appends it to `res`.
# NOTE(review): `fct`, `runs`, `groups`, `control`, `obs_key`, `pwds`, `flavor`, `M`
# and `res` are not defined at notebook level in the visible cells (`fct` and `groups`
# only exist as locals of a function above), so this cell cannot run on a fresh
# kernel as-is — confirm where these names are meant to come from.
for i in fct(range(runs)):
    # per perturbation, shuffle with control and compute e-distance
    df = pd.DataFrame(index=groups, columns=['edist'], dtype=float)
    for group in groups:
        if group==control:
            # control against itself: the entry is set to 0 and no shuffling is done
            df.loc[group] = [0]
            continue
        # number of cells labelled `group`
        N = np.sum(adata.obs[obs_key]==group)
        # shuffle the labels
        labels = adata.obs[obs_key].values[adata.obs[obs_key].isin([group, control])]
        shuffled_labels = np.random.permutation(labels)

        # use precomputed pairwise distances
        sc_pwd = pwds[group]  # precomputed pairwise distances between single cells
        # boolean mask of the cells that carry the `group` label after shuffling
        idx = shuffled_labels==group

        # Note that this is wrong: sc_pwd[idx, ~idx] but this is correct: sc_pwd[idx, :][:, ~idx]
        # The first produces a vector, the second a matrix (we need the matrix)
        # flavor 1 rescales the within-set means by n / (n - 1); any other flavor leaves them as-is
        factor = N / (N-1) if flavor==1 else 1
        factor_c = M / (M-1) if flavor==1 else 1
        # delta: between-set mean; sigma / sigma_c: within-set means for the two label sets
        # NOTE(review): M is presumably the number of control cells — confirm
        delta = np.sum(sc_pwd[idx, :][:, ~idx]) / (N * M)
        sigma = np.sum(sc_pwd[idx, :][:, idx]) / (N * N) * factor
        sigma_c = np.sum(sc_pwd[~idx, :][:, ~idx]) / (M * M) * factor_c

        edistance = 2 * delta - sigma - sigma_c

        df.loc[group] = edistance
    res.append(df.sort_index())

In [9]:
def one_step():
    """Compute one sample of label-shuffled E-distances, one value per group.

    Takes no arguments; reads ``adata``, ``obs_key``, ``groups``, ``control``,
    ``pwds``, ``flavor`` and ``M`` from the surrounding notebook namespace.
    NOTE(review): ``M`` is used as the size of the non-group side in the
    normalisations below — presumably the number of control cells; confirm.

    Returns
    -------
    pandas.DataFrame
        Indexed by the entries of ``groups`` (sorted by index) with a single
        column ``'edist'``. The row for ``control`` is set to 0.
    """
    null_sample = pd.DataFrame(index=groups, columns=['edist'], dtype=float)

    for group in groups:
        # The control group is not compared against itself; its entry is fixed to 0.
        if group == control:
            null_sample.loc[group] = [0]
            continue

        cell_labels = adata.obs[obs_key]
        N = np.sum(cell_labels == group)

        # Keep only the labels of group and control cells, then permute them.
        pair_labels = cell_labels.values[cell_labels.isin([group, control])]
        permuted = np.random.permutation(pair_labels)
        in_group = permuted == group
        in_rest = ~in_group

        # Precomputed single-cell distance matrix for this group.
        dist_matrix = pwds[group]

        # Select rows first, then columns: a single dist_matrix[rows, cols] lookup
        # with two boolean masks would yield a vector rather than the sub-matrix.
        group_rows = dist_matrix[in_group, :]
        rest_rows = dist_matrix[in_rest, :]

        # flavor 1 rescales the within-set means by n / (n - 1).
        if flavor == 1:
            factor = N / (N-1)
            factor_c = M / (M-1)
        else:
            factor = 1
            factor_c = 1

        delta = np.sum(group_rows[:, in_rest]) / (N * M)
        sigma = np.sum(group_rows[:, in_group]) / (N * N) * factor
        sigma_c = np.sum(rest_rows[:, in_rest]) / (M * M) * factor_c

        null_sample.loc[group] = 2 * delta - sigma - sigma_c

    return null_sample.sort_index()

In [None]:
from joblib import Parallel, delayed
# Draw 10 shuffled samples with 2 joblib workers. `delayed` has to wrap the function
# object itself; the wrapped task is then called with its (empty) argument list.
# Writing delayed(one_step() ...) would call one_step eagerly in the parent process,
# and the original line also had unbalanced parentheses.
results = Parallel(n_jobs=2)(delayed(one_step)() for _ in range(10))
print(results)