In [1]:
import numpy as np, pandas as pd, scanpy as sc, matplotlib.pyplot as plt, os
from scipy.stats import hypergeom
import celloracle as co, glob, pickle
from functools import reduce
from tqdm import tqdm
import itertools, math, random
import networkx as nx
from itertools import combinations
from scipy.stats import gmean

# visualization settings required to see plots in jupyter notebook
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
plt.rcParams['figure.figsize'] = [6, 4.5]
plt.rcParams["savefig.dpi"] = 300

wd = '/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/cicero_outs'
out_path = os.path.join(wd, 'out_data')
os.makedirs(f"{out_path}/figures", exist_ok=True)
os.makedirs(f"{out_path}/out_files", exist_ok=True)
sc.settings.figdir = f"{out_path}/figures"
random.seed(42)

In [2]:
# Print the path of the bedtools executable that this kernel's shell resolves.
# NOTE(review): presumably a prerequisite of the celloracle motif analysis below — confirm.
!which bedtools

/opt/packages/bedtools/2.30.0/bedtools


In [3]:
from celloracle import motif_analysis as ma

# Reference genome used by every motif_analysis call in this notebook, and the
# directory where genomepy keeps (or will install) the genome files.
ref_genome = "hg38"
genomes_dir = '/ocean/projects/cis240075p/skeshari/utils_data/genomes/'

# Install the genome only when celloracle reports it as missing.
genome_installation = ma.is_genome_installed(ref_genome=ref_genome, genomes_dir=genomes_dir)
print(ref_genome, "installation: ", genome_installation)
if not genome_installation:
    import genomepy
    genomepy.install_genome(name=ref_genome, provider="UCSC", genomes_dir=genomes_dir)
else:
    print(ref_genome, "is installed already.")

# Cicero outputs (peak list + peak-peak connections) share one directory.
cicero_dir = "/ocean/projects/cis240075p/skeshari/igvf/bcell2/male_donor/out_data/cicero_output"

# `peaks` is the array of values in column "x" of all_peaks.csv; it is passed to
# get_tss_info as the list of peak strings.
peaks = pd.read_csv(os.path.join(cicero_dir, "all_peaks.csv"), header=0, index_col=0)["x"].values

# Rows with any missing value are dropped before integration.
cicero_connections = pd.read_csv(
    os.path.join(cicero_dir, "cicero_connections.csv"), header=0, index_col=0
).dropna()

# Annotate peaks with TSS information using the SAME genome as configured above
# (previously a second, hard-coded "hg38" that could drift from `ref_genome`).
tss_annotated = ma.get_tss_info(peak_str_list=peaks, ref_genome=ref_genome)

# Combine the TSS-annotated peaks with the cicero connections; the result is the
# peak_id / gene_short_name / coaccess table used by the scoring cell.
integrated = ma.integrate_tss_peak_with_cicero(tss_peak=tss_annotated, cicero_connections=cicero_connections)

hg38 installation:  True
hg38 is installed already.


KI270728.1	232189	233167

KI270728.1	232189	233167



que bed peaks: 191255
tss peaks in que: 23030


In [None]:
# Score TF pairs per target gene by the co-accessibility of the peaks they bind.
#
# For every gene in `enrichment_df['common']` this cell:
#   1. collects the gene's peaks and their `coaccess` values from `integrated`,
#   2. scans those peaks for TF motifs with celloracle,
#   3. scores (a) the gene's enriched TF pairs, (b) an equally sized random sample of
#      TF pairs drawn from GRN_TFs, and (c) pairs of the gene's GRN predecessors ranked
#      by out-degree centrality,
# and finally writes the table to CSV.
#
# NOTE(review): enrichment_df, GRN_TFs, grn, combined_network_scores, experiment and
# cluster_fusion are not defined in the visible cells — confirm they exist before this
# cell on a fresh "Restart & Run All".

MOTIF_SCAN_FPR = 0.02   # false-positive rate handed to TFinfo.scan
MOTIF_SCORE_MIN = 10    # motif hits scoring below this are discarded


def _factor_sets(column):
    """Convert a comma-separated factor column into a Series of upper-cased, stripped name sets.

    Missing annotations are treated as empty strings so they yield {''}, which the
    caller filters out, instead of raising inside the set-building lambda.
    """
    return column.fillna('').str.upper().str.split(',').apply(lambda names: set(map(str.strip, names)))


def _factor_to_peaks(scanned_df, min_score=MOTIF_SCORE_MIN):
    """Map each TF name (direct or indirect motif annotation) to the set of peaks with a hit.

    Parameters
    ----------
    scanned_df : DataFrame with at least the columns 'seqname', 'score',
        'factors_direct' and 'factors_indirect'.
    min_score : hits with score below this value are ignored.

    Returns
    -------
    dict[str, set]: TF name -> set of 'seqname' values (peaks) carrying a hit for that TF.
    """
    # .copy() so the column assignment below does not write into a slice of scanned_df
    hits = scanned_df[scanned_df['score'] >= min_score].copy()
    if hits.empty:
        return {}
    hits['factors'] = _factor_sets(hits['factors_direct']).combine(
        _factor_sets(hits['factors_indirect']), lambda direct, indirect: direct.union(indirect)
    )
    hits = hits[['seqname', 'factors']].explode('factors').dropna(subset=['factors'])
    hits = hits[hits['factors'] != '']

    factor2peaks = {}
    for peak, factor in zip(hits['seqname'], hits['factors']):
        factor2peaks.setdefault(factor, set()).add(peak)
    return factor2peaks


def _score_pair(pair, factor2peaks, peak2score):
    """Return (peaks, score) for a TF pair.

    `peaks` is the union of peaks bound by either TF of the pair; `score` is the mean
    co-access value of those peaks (peaks missing from `peak2score` count as 0), or 0
    when neither TF binds any peak.
    """
    bound = factor2peaks.get(pair[0], set()) | factor2peaks.get(pair[1], set())
    if not bound:
        return bound, 0
    return bound, np.mean([peak2score.get(peak, 0) for peak in bound])


# One row per gene with the de-duplicated list of TF entries enriched for it.
genes2TFpairs = (
    enrichment_df[['TF', 'common']]
    .explode('common')
    .groupby('common')
    .agg({'TF': lambda x: list(set(x))})
    .reset_index()
    .rename(columns={'TF': 'TFs'})
)
# Object-dtype placeholder columns; each cell will later hold a list.
for result_col in ('peaks', 'scores', 'random_scores', 'net_scores'):
    genes2TFpairs[result_col] = [None] * len(genes2TFpairs)

# Loop-invariant: the population of candidate pairs for the random baseline.
all_tf_pairs = list(combinations(GRN_TFs, 2))

for idx, row in genes2TFpairs.iterrows():
    gene = row['common']
    n_pairs = len(row['TFs'])

    # Peaks linked to the gene and their co-accessibility values.
    gene_links = integrated[integrated['gene_short_name'] == gene]
    peaks_for_gene = gene_links[['peak_id', 'gene_short_name']].drop_duplicates().reset_index(drop=True)
    peaks2scoredict = gene_links[['peak_id', 'coaccess']].set_index('peak_id')['coaccess'].to_dict()

    # Scan the gene's peaks for TF motifs; with no peaks there is nothing to scan and
    # every pair below scores 0.
    if peaks_for_gene.empty:
        factor2peaks = {}
    else:
        tfi = ma.TFinfo(peak_data_frame=peaks_for_gene, ref_genome=ref_genome, genomes_dir=genomes_dir)
        tfi.scan(fpr=MOTIF_SCAN_FPR, motifs=None, verbose=True)  # motifs=None -> default motifs are loaded
        factor2peaks = _factor_to_peaks(tfi.scanned_df)

    # (a) enriched TF pairs of this gene
    scored_pairs = [_score_pair(pair, factor2peaks, peaks2scoredict) for pair in row['TFs']]
    row_peaks = [bound for bound, score in scored_pairs]
    row_scores = [score for bound, score in scored_pairs]

    # (b) random baseline: as many pairs as in (a); capped so random.sample cannot raise
    rnd_pairs = random.sample(all_tf_pairs, min(n_pairs, len(all_tf_pairs)))
    row_scores_rnd = [_score_pair(pair, factor2peaks, peaks2scoredict)[1] for pair in rnd_pairs]

    # (c) network baseline: pairs of GRN predecessors, highest out-degree centrality first.
    # A gene without predecessors (or absent from the graph) gets an empty list, so the
    # column stays list-valued and no pairs leak in from a previous gene.
    # NOTE(review): `gene in grn` assumes grn is a networkx graph — confirm.
    predecessors = list(grn.predecessors(gene)) if gene in grn else []
    top_predecessors = sorted(
        predecessors,
        key=lambda tf: combined_network_scores.loc[tf, 'degree_centrality_out'],
        reverse=True,
    )
    net_pairs = list(itertools.islice(combinations(top_predecessors, 2), n_pairs))
    row_scores_net = [_score_pair(pair, factor2peaks, peaks2scoredict)[1] for pair in net_pairs]

    genes2TFpairs.at[idx, 'peaks'] = list(row_peaks)
    genes2TFpairs.at[idx, 'scores'] = list(row_scores)
    genes2TFpairs.at[idx, 'random_scores'] = list(row_scores_rnd)
    genes2TFpairs.at[idx, 'net_scores'] = list(row_scores_net)

genes2TFpairs.to_csv(f"{out_path}/out_files/genes2TFpairs_{experiment}_{'_'.join(cluster_fusion)}.csv", index=False)