In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib_venn import venn2
import seaborn as sns
import anndata

## Compute differentially accessible peaks (DAPs)

In [None]:
# NOTE(review): `sc` (presumably scanpy) and the inputs `true_ATAC_babel`,
# `pred_ATAC_babel` and `pred_ATAC_sb` are not defined in the cells visible
# here; they have to come from earlier cells for Restart & Run All to work.

# Mapping from the 'Cell Types' annotation to the two comparison groups.
# Every cell type that is not listed is left without a group and dropped.
GROUP_BY_CELL_TYPE = {
    'B': 'B',
    'Tumor B': 'LYM',
    'Tumor B cycling': 'LYM',
}


def compute_lym_vs_b_daps(adata):
    """Rank peaks for the 'LYM' group against the 'B' reference group.

    Parameters
    ----------
    adata
        Annotated data object whose ``obs`` has a 'Cell Types' column.
        A 'group' column is written into ``adata.obs`` as a side effect
        (same as the original per-dataset code did).

    Returns
    -------
    tuple
        ``(subset, dap)`` where ``subset`` is the log1p-transformed copy
        restricted to the 'B' and 'LYM' cells, and ``dap`` is the result table
        from ``sc.get.rank_genes_groups_df`` for group 'LYM' with an extra
        'neg_log10_padj' column.
    """
    # A callable (not the dict itself) is passed to `map` so that unlisted cell
    # types yield None exactly as before.
    adata.obs['group'] = adata.obs['Cell Types'].map(
        lambda cell_type: GROUP_BY_CELL_TYPE.get(cell_type)
    )

    # Work on an independent copy so the transformation below does not touch
    # the matrix of the input object.
    subset = adata[adata.obs['group'].isin(['B', 'LYM'])].copy()

    # Called for its in-place effect on `subset`; the return value is unused.
    # NOTE(review): assumes the matrix is not already log-transformed - confirm.
    sc.pp.log1p(subset)

    sc.tl.rank_genes_groups(
        subset,
        groupby='group',
        groups=['LYM'],
        reference='B',
        method='wilcoxon',
        pts=True,
    )

    dap = sc.get.rank_genes_groups_df(subset, group='LYM')

    # Adjusted p-values that underflow to exactly 0 are floored at the smallest
    # positive normal float instead of being replaced by NaN. The previous NaN
    # replacement gave the most significant peaks a NaN y-value, and NaN points
    # are not drawn in the volcano plots.
    padj = dap['pvals_adj'].astype(float).clip(lower=np.finfo(float).tiny)
    dap['neg_log10_padj'] = -np.log10(padj)

    return subset, dap


# Measured ATAC data.
true_atac_data, dap_true = compute_lym_vs_b_daps(true_ATAC_babel)

# ATAC data predicted by BABEL.
pred_atac_babel_data, dap_pred_babel = compute_lym_vs_b_daps(pred_ATAC_babel)

# ATAC data predicted by scButterfly.
pred_atac_sb_data, dap_pred_sb = compute_lym_vs_b_daps(pred_ATAC_sb)

Visualize with volcano plots

In [None]:
# Cut-offs used for colouring the points and for the dashed/dotted guide lines.
VOLCANO_LFC_CUTOFF = 1
VOLCANO_PADJ_CUTOFF = 0.001


def _with_volcano_colors(dap_df, lfc_cutoff=VOLCANO_LFC_CUTOFF, padj_cutoff=VOLCANO_PADJ_CUTOFF):
    """Return a copy of ``dap_df`` with a 'color' column for the volcano plot.

    Points are grey by default, orange ('#fcad45') when the adjusted p-value is
    below ``padj_cutoff`` and the log fold change is above ``lfc_cutoff``, and
    blue ('#60b1dd') when the adjusted p-value is below ``padj_cutoff`` and the
    log fold change is below ``-lfc_cutoff``.
    """
    colored = dap_df.copy()
    significant = colored['pvals_adj'] < padj_cutoff
    colored['color'] = 'grey'
    colored.loc[significant & (colored['logfoldchanges'] > lfc_cutoff), 'color'] = '#fcad45'
    colored.loc[significant & (colored['logfoldchanges'] < -lfc_cutoff), 'color'] = '#60b1dd'
    return colored


def _draw_volcano(ax, colored_df, title, lfc_cutoff=VOLCANO_LFC_CUTOFF, padj_cutoff=VOLCANO_PADJ_CUTOFF):
    """Draw one volcano panel on ``ax`` from a frame made by ``_with_volcano_colors``."""
    ax.scatter(
        colored_df['logfoldchanges'],
        colored_df['neg_log10_padj'],
        s=10,
        alpha=0.7,
        c=colored_df['color'],
        edgecolor='none',
    )
    # Guide lines at the significance and fold-change cut-offs.
    ax.axhline(-np.log10(padj_cutoff), color='green', linestyle='--')
    ax.axvline(lfc_cutoff, color='green', linestyle='dotted')
    ax.axvline(-lfc_cutoff, color='green', linestyle='dotted')
    ax.set_xlim(-10, 10)
    ax.set_title(title, fontsize=16)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.set_xlabel('log\u2082(FC)', fontsize=14)
    # The y-values come from 'pvals_adj', so the axis is labelled as adjusted.
    ax.set_ylabel('-log\u2081\u2080(adjusted p-value)', fontsize=14)


# Coloured working copies; the DAP tables themselves are left untouched.
df = _with_volcano_colors(dap_true)
df2 = _with_volcano_colors(dap_pred_babel)
df3 = _with_volcano_colors(dap_pred_sb)

fig, axes = plt.subplots(2, 2, figsize=(14, 12))
axs = axes.flatten()

panels = [
    (df, 'True ATAC data'),
    (df2, 'Predicted BABEL'),
    (df3, 'Predicted scButterfly'),
]
# Panels fill the grid row by row: top-left, top-right, bottom-left.
for ax, (panel_df, panel_title) in zip(axs, panels):
    _draw_volcano(ax, panel_df, panel_title)

# Only three datasets are shown; blank out the unused bottom-right panel.
axs[3].axis('off')

plt.tight_layout()
plt.savefig('/workspace/Benchmarking/data_all/figures/dap_lymphoma.svg', format='svg', bbox_inches='tight')
plt.show()

Calculate DACS: the Jaccard index between the DAP set from the true data and the DAP set from each prediction

In [None]:
# Cut-offs that define a differentially accessible peak for the overlap score.
# NOTE(review): the volcano plots use an adjusted p-value cut-off of 0.001,
# while 0.05 is used here - confirm the difference is intended.
DAP_PADJ_CUTOFF = 0.05
DAP_LFC_CUTOFF = 1


def _significant_peaks(dap_df, padj_cutoff=DAP_PADJ_CUTOFF, lfc_cutoff=DAP_LFC_CUTOFF):
    """Return the set of 'names' with adjusted p-value < ``padj_cutoff`` and
    absolute log fold change > ``lfc_cutoff``."""
    keep = (dap_df['pvals_adj'] < padj_cutoff) & (dap_df['logfoldchanges'].abs() > lfc_cutoff)
    return set(dap_df.loc[keep, 'names'].tolist())


def _jaccard(first, second):
    """Return |first & second| / |first | second|.

    Returns NaN when both sets are empty, since the index is undefined there
    and the plain division would raise ZeroDivisionError.
    """
    union = first | second
    if not union:
        return float('nan')
    return len(first & second) / len(union)


dap_list_true = _significant_peaks(dap_true)
dap_list_pred_babel = _significant_peaks(dap_pred_babel)
dap_list_pred_sb = _significant_peaks(dap_pred_sb)

jaccard_babel = _jaccard(dap_list_true, dap_list_pred_babel)
jaccard_sb = _jaccard(dap_list_true, dap_list_pred_sb)

# Printed order: scButterfly first, then BABEL.
print(jaccard_sb, jaccard_babel)

Plot Venn diagrams to visualize DAP overlap

In [None]:
# Labels and colours shared between the two diagrams and the legend.
true_label, true_color = 'DAPs from True Data', '#60b1dd'
babel_label, babel_color = 'DAPs from Predicted BABEL', '#ef973f'
sb_label, sb_color = 'DAPs from Predicted scButterfly', '#87e175'

fig, axs = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: true data against the BABEL prediction.
venn1 = venn2(
    [dap_list_true, dap_list_pred_babel],
    set_labels=(true_label, babel_label),
    set_colors=(true_color, babel_color),
    alpha=0.7,
    ax=axs[0],
)

# Right panel: true data against the scButterfly prediction.
venn2_plot = venn2(
    [dap_list_true, dap_list_pred_sb],
    set_labels=(true_label, sb_label),
    set_colors=(true_color, sb_color),
    alpha=0.7,
    ax=axs[1],
)

# The set names are shown once in the legend instead, so hide the per-diagram
# labels (entries that are falsy, e.g. None, are skipped).
for diagram in (venn1, venn2_plot):
    for set_label in filter(None, diagram.set_labels):
        set_label.set_visible(False)

legend_elements = [
    Patch(facecolor=patch_color, edgecolor='black', label=patch_label)
    for patch_color, patch_label in (
        (true_color, true_label),
        (babel_color, babel_label),
        (sb_color, sb_label),
    )
]

plt.tight_layout()
# The legend is anchored outside the current axes, to the right of the figure.
plt.legend(
    handles=legend_elements,
    loc='center right',
    bbox_to_anchor=(1.75, 0.5),
    ncol=1,
)
plt.savefig(
    "/workspace/Benchmarking/data_all/figures/venn_dap_comparison.svg",
    format="svg",
    bbox_inches='tight',
)
plt.show()