In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
from matplotlib.patches import Patch
import episcanpy.preprocessing as epi_pp
import episcanpy.tools as epi_tl

### Compute Gene Activity scores for Lymphoma ATAC data

In [None]:
# Every gene-activity matrix in this cell is built with the same annotation file
# and the same options, so the keyword arguments are stated once and reused.
geneactivity_opts = dict(
    gtf_file='/workspace/Benchmarking/data_all/gencode.v48.annotation.gtf',
    feature_type='gene',
    key_added='gene_activity',
)

# Inputs (true_ATAC_sb, pred_ATAC_babel, pred_ATAC_sb) must already exist in the kernel;
# they are not created in this cell.
gene_activity_true = epi_tl.geneactivity(true_ATAC_sb, **geneactivity_opts)
gene_activity_pred_babel = epi_tl.geneactivity(pred_ATAC_babel, **geneactivity_opts)
gene_activity_pred_sb = epi_tl.geneactivity(pred_ATAC_sb, **geneactivity_opts)

Calculate GACS

In [None]:
def _hvg_names(adata):
    """Return the set of var names flagged in ``adata.var['highly_variable']``."""
    flagged = adata.var['highly_variable']
    return set(adata[:, flagged].var_names)


def _jaccard(left, right):
    """Jaccard index of two sets: |intersection| / |union|."""
    return len(left & right) / len(left | right)


# Same preprocessing for each gene-activity matrix, applied in place:
# scale every cell to a total of 1e6, log1p-transform, flag the top 3000 variable genes.
# NOTE: in-place, so re-running this cell normalises the matrices a second time.
for activity in (gene_activity_true, gene_activity_pred_babel, gene_activity_pred_sb):
    sc.pp.normalize_total(activity, target_sum=1e6)
    sc.pp.log1p(activity)
    sc.pp.highly_variable_genes(activity, n_top_genes=3000)

hvg_true = _hvg_names(gene_activity_true)
hvg_pred_babel = _hvg_names(gene_activity_pred_babel)
hvg_pred_sb = _hvg_names(gene_activity_pred_sb)
# The RNA object is expected to carry a 'highly_variable' column already;
# it is not computed anywhere in this cell.
hvg_rna = _hvg_names(true_RNA_sb)

# Overlap between the RNA HVGs and each ATAC-derived HVG set
jaccard_true = _jaccard(hvg_rna, hvg_true)
jaccard_babel = _jaccard(hvg_rna, hvg_pred_babel)
jaccard_sb = _jaccard(hvg_rna, hvg_pred_sb)

for score in (jaccard_true, jaccard_babel, jaccard_sb):
    print(score)

In [None]:
# GACS as computed here: absolute difference between a prediction's Jaccard index and
# the true-ATAC Jaccard index, divided by the true-ATAC Jaccard index.
reference = jaccard_true
gacs_babel = abs(reference - jaccard_babel) / reference
gacs_sb = abs(reference - jaccard_sb) / reference

print(gacs_babel, gacs_sb, sep='\n')

Plot Venn diagrams

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(10, 4))
axs = axs.flatten()

# One palette drives both the Venn circles and the legend so they cannot drift apart.
# Previously the BABEL set was drawn in the colour the legend assigns to scButterfly
# (and vice versa), so the figure attributed each overlap to the wrong method.
rna_color = '#60b1dd'
rna_label = 'HVGs from RNA data'
# (ATAC-derived HVG set, circle colour, label) - one entry per panel
panels = [
    (hvg_true, '#b992cd', 'HVGs from True ATAC'),
    (hvg_pred_babel, '#ef973f', 'HVGs from Predicted BABEL'),
    (hvg_pred_sb, '#87e175', 'HVGs from Predicted scButterfly'),
]

diagrams = []
for ax, (hvg_atac, atac_color, atac_label) in zip(axs, panels):
    # Each panel compares one ATAC-derived HVG set against the RNA HVGs.
    diagram = venn2([hvg_atac, hvg_rna],
                    set_labels=(atac_label, rna_label),
                    set_colors=(atac_color, rna_color),
                    alpha=0.7,
                    ax=ax)
    # The per-circle captions are hidden; the shared legend identifies the sets instead.
    for text in diagram.set_labels:
        if text:
            text.set_visible(False)
    diagrams.append(diagram)

# Keep the original variable names available to any later cell.
venn1, venn2_plot, venn3_plot = diagrams

axs[3].axis('off')  # The fourth panel holds only the legend

legend_elements = [Patch(facecolor=rna_color, edgecolor='black', label=rna_label)]
legend_elements += [
    Patch(facecolor=atac_color, edgecolor='black', label=atac_label)
    for _, atac_color, atac_label in panels
]

plt.tight_layout()
# Attach the legend to the empty fourth panel explicitly instead of relying on
# whichever axes happens to be "current".
axs[3].legend(handles=legend_elements, loc='center right', bbox_to_anchor=(0.8, 0.5), ncol=1)

plt.savefig("/workspace/Benchmarking/data_all/figures/venn_gacs.svg", format="svg", bbox_inches='tight')
plt.show()

### Compute Gene Activity scores for BMMC ATAC data

In [None]:
# Rewrite the first '-' of each feature name to ':' (e.g. 'chr1-100-200' -> 'chr1:100-200').
# Names that already contain ':' before their first '-' do not match and are left unchanged,
# so re-running this cell is harmless.
first_dash = r'^([^:]+?)-'
for atac in (true_ATAC_bmmc_test, pred_ATAC_bmmc_sb):
    atac.var_names = atac.var_names.str.replace(first_dash, r'\1:', regex=True)

# Same annotation file and options for both BMMC gene-activity matrices.
bmmc_geneactivity_opts = dict(
    gtf_file='/workspace/Benchmarking/data_all/gencode.v48.annotation.gtf',
    feature_type='gene',
    key_added='gene_activity',
)
gene_activity_true_bmmc = epi_tl.geneactivity(true_ATAC_bmmc_test, **bmmc_geneactivity_opts)
gene_activity_pred_bmmc_sb = epi_tl.geneactivity(pred_ATAC_bmmc_sb, **bmmc_geneactivity_opts)

In [None]:
# The 'highly_variable' column read below is not created by any visible cell for the
# BMMC gene-activity matrices, so on a fresh kernel this cell would fail with a KeyError.
# Apply the same preprocessing used for the lymphoma matrices (scale each cell to 1e6,
# log1p, top 3000 variable genes). The guard skips matrices that already carry the
# column, so re-running the cell does not normalise twice.
for bmmc_activity in (gene_activity_true_bmmc, gene_activity_pred_bmmc_sb):
    if 'highly_variable' not in bmmc_activity.var:
        sc.pp.normalize_total(bmmc_activity, target_sum=1e6)
        sc.pp.log1p(bmmc_activity)
        sc.pp.highly_variable_genes(bmmc_activity, n_top_genes=3000)

hvg_true_bmmc = set(gene_activity_true_bmmc[:,gene_activity_true_bmmc.var['highly_variable']].var_names)
hvg_pred_bmmc_sb = set(gene_activity_pred_bmmc_sb[:,gene_activity_pred_bmmc_sb.var['highly_variable']].var_names)
# The RNA object is expected to carry 'highly_variable' already - TODO confirm it is
# computed where true_RNA_bmmc_test is loaded.
hvg_rna_bmmc = set(true_RNA_bmmc_test[:,true_RNA_bmmc_test.var['highly_variable']].var_names)

# Jaccard index (|intersection| / |union|) between the RNA HVGs and each ATAC-derived HVG set
jaccard_true_bmmc = len(hvg_rna_bmmc.intersection(hvg_true_bmmc))/len(hvg_rna_bmmc.union(hvg_true_bmmc))
jaccard_sb_bmmc = len(hvg_rna_bmmc.intersection(hvg_pred_bmmc_sb))/len(hvg_rna_bmmc.union(hvg_pred_bmmc_sb))

print(jaccard_true_bmmc)
print(jaccard_sb_bmmc)

In [None]:
# GACS for the BMMC data: absolute difference between the predicted and true-ATAC
# Jaccard indices, divided by the true-ATAC Jaccard index.
gacs_bmmc_sb = abs(jaccard_true_bmmc - jaccard_sb_bmmc)/jaccard_true_bmmc

# Show the score, as the lymphoma GACS cell does; a bare assignment displays nothing.
print(gacs_bmmc_sb)