## Housekeeping Settings

In [1]:
import numpy as np, pandas as pd, scanpy as sc, matplotlib.pyplot as plt,os
import multiprocessing as mp, pickle
from tqdm import tqdm

# visualization settings required to see plots in jupyter notebook
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
plt.rcParams['figure.figsize'] = [6, 4.5]
plt.rcParams["savefig.dpi"] = 300

wd = '/ocean/projects/cis240075p/skeshari/igvf/bcell2/male_donor/'
out_path = os.path.join(wd, 'out_data', 'cicero_lf_enrich')
os.makedirs(f"{out_path}/figures", exist_ok=True)
os.makedirs(f"{out_path}/out_files", exist_ok=True)
sc.settings.figdir = f"{out_path}/figures"

# Creating the Cicero peak-index dictionary for ATAC data

In [2]:
def build_cicero_conns_matrix(batch):
    """Turn a batch of Cicero connections into ordered index/value arrays.

    Every connection is emitted once with its smaller peak index first, so a
    pair can be looked up as ``(smaller_idx, larger_idx)`` regardless of the
    order in which it appears in ``batch``.

    Parameters
    ----------
    batch : pandas.DataFrame
        Provides the columns ``Peak1_idx``, ``Peak2_idx`` (converted to int64)
        and ``coaccess`` (converted to float16). May be empty.

    Returns
    -------
    final_upper_indices : numpy.ndarray of shape (2, n), int64
        Row 0 is the smaller and row 1 the larger peak index of each pair.
        Pairs that were already ordered come first, swapped pairs follow.
    final_upper_values : numpy.ndarray of shape (n,), float16
        ``coaccess`` values aligned with the columns of ``final_upper_indices``.

    Raises
    ------
    AssertionError
        If any emitted pair violates ``row0 <= row1``.
    """
    idx0 = batch['Peak1_idx'].to_numpy(dtype=np.int64)
    idx1 = batch['Peak2_idx'].to_numpy(dtype=np.int64)
    # float16 halves the memory of the scores at the cost of precision.
    values = batch['coaccess'].to_numpy(dtype=np.float16)

    # Rows already in (small, large) order are kept as-is ...
    mask = idx0 <= idx1
    upper_indices = np.stack([idx0[mask], idx1[mask]], axis=0)
    upper_values = values[mask]
    # ... the remaining rows are swapped so the smaller index leads.
    lower_indices = np.stack([idx1[~mask], idx0[~mask]], axis=0)
    lower_values = values[~mask]

    final_upper_indices = np.concatenate([upper_indices, lower_indices], axis=1)
    final_upper_values = np.concatenate([upper_values, lower_values])
    # Element-wise ordering check. The previous form compared two unrelated
    # booleans and indexed values[0], which failed on an empty batch.
    assert (final_upper_indices[0] <= final_upper_indices[1]).all(), "Assertion for upper indices failed"
    return final_upper_indices, final_upper_values

In [3]:
# Load the Cicero connections, discarding incomplete rows and links whose co-accessibility is zero
cicero_conns = pd.read_csv(f"{wd}/out_data/cicero_output/cicero_connections.csv", header=0, index_col=0).dropna()
cicero_conns = cicero_conns[cicero_conns['coaccess'] != 0].reset_index(drop=True)

# Number every distinct peak by its position in sorted order and persist that mapping
cicero_peaks_set = pd.concat([cicero_conns['Peak1'], cicero_conns['Peak2']]).unique()
cicero_peaks_index = dict(zip(sorted(cicero_peaks_set), range(len(cicero_peaks_set))))
num_peaks = len(cicero_peaks_index)
with open(f"{out_path}/out_files/cicero_peaks_indexed.pkl", 'wb') as f:
    pickle.dump(cicero_peaks_index, f)

# Translate both peak columns into their integer ids (adds Peak1_idx / Peak2_idx)
for peak_col in ('Peak1', 'Peak2'):
    cicero_conns[f'{peak_col}_idx'] = cicero_conns[peak_col].map(cicero_peaks_index)

# Convert the table to ordered (index, value) arrays in slices of one million rows
batch_size = int(1e6)
batches = [cicero_conns.iloc[start:start + batch_size] for start in range(0, len(cicero_conns), batch_size)]
matrix = [
    build_cicero_conns_matrix(batch)
    for batch in tqdm(batches, total=len(batches), desc='Building Cicero connections matrix')
]

# Stitch the per-batch arrays together and key every score by its (smaller_idx, larger_idx) pair
upper_indices = np.concatenate([part[0] for part in matrix], axis=1)
upper_values = np.concatenate([part[1] for part in matrix])
index_value_dict = dict(zip(zip(upper_indices[0], upper_indices[1]), upper_values))
with open(f"{out_path}/out_files/matrix_index_value.pkl", 'wb') as f:
    pickle.dump(index_value_dict, f)
# Optional clean-up once the dictionary exists: del cicero_conns, matrix


Building Cicero connections matrix: 100%|██████████| 9/9 [00:00<00:00, 46.47it/s]


In [2]:
from celloracle import motif_analysis as ma

# Peak tables produced by earlier processing steps.
processed_peaks = pd.read_csv(f"{wd}/out_data/out_other_methods/processed_peak_file.csv", header=0, index_col=0)
# Only the values of column `x` of all_peaks.csv are kept.
peaks = pd.read_csv(f"{wd}/out_data/out_other_methods/all_peaks.csv", header=0, index_col=0).x.values

# TSS annotation of the peak list (hg38), followed by integration with the Cicero connections.
cicero_connections = pd.read_csv(f"{wd}/out_data/cicero_output/cicero_connections.csv", header=0, index_col=0).dropna()
tss_annotated = ma.get_tss_info(peak_str_list=peaks, ref_genome="hg38")
# `peak` and `integrated` are two names for the same integrated table.
peak = integrated = ma.integrate_tss_peak_with_cicero(tss_peak=tss_annotated, cicero_connections=cicero_connections)

  def twobit_to_dna(twobit: int, size: int) -> str:
  def dna_to_twobit(dna: str) -> int:
  def twobit_1hamming(twobit: int, size: int) -> List[int]:
KI270728.1	232189	233167

KI270728.1	232189	233167



que bed peaks: 191255
tss peaks in que: 23030


In [4]:
# PLEASE make sure reference genome is correct.
ref_genome = "hg38"
genomes_dir = "/ocean/projects/cis240075p/skeshari/utils_data/genomes"

# Ask CellOracle whether the genome is already present in genomes_dir and report the answer.
genome_installation = ma.is_genome_installed(ref_genome=ref_genome, genomes_dir=genomes_dir)
print(ref_genome, "installation: ", genome_installation)
if genome_installation:
    print(ref_genome, "is installed.")
else:
    # genomepy is only needed for the download, so it is imported lazily.
    import genomepy
    genomepy.install_genome(name=ref_genome, provider="UCSC", genomes_dir=genomes_dir)

# Rebinds `peaks` to the result of the format check on processed_peaks.
peaks = ma.check_peak_format(processed_peaks, ref_genome, genomes_dir=genomes_dir)

genome hg38 is not installed in this environment.
Please install genome using genomepy.
e.g.
    >>> import genomepy
    >>> genomepy.install_genome(name="hg38", provider="UCSC", genomes_dir=/ocean/projects/cis240075p/skeshari/utils_data/genomes)
hg38 installation:  False


[32m09:32:16[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from UCSC. Target URL: https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz...


Download:   0%|          | 0.00/938M [00:00<?, ?B/s]

[32m09:32:38[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m09:32:53[0m [1m|[0m [34mINFO[0m [1m|[0m name: hg38
[32m09:32:53[0m [1m|[0m [34mINFO[0m [1m|[0m local name: hg38
[32m09:32:53[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /ocean/projects/cis240075p/skeshari/utils_data/genomes/hg38/hg38.fa


Filtering Fasta: 0.00 lines [00:00, ? lines/s]

Peaks before filtering:  21158
Peaks with invalid chr_name:  0
Peaks with invalid length:  0
Peaks after filtering:  21158


In [13]:
# Build a TFinfo object from `peaks`, `ref_genome` and `genomes_dir`, then run its motif scan.
tfi = ma.TFinfo(peak_data_frame=peaks, ref_genome=ref_genome, genomes_dir=genomes_dir) 
# If you enter None, default motifs will be loaded
# NOTE(review): fpr=0.00 is passed to the scan; confirm that a zero false-positive rate is intended.
# NOTE(review): the recorded run of this cell ended with
#   TypeError: set_background() got an unexpected keyword argument 'length'
# presumably a version mismatch between celloracle and its motif-scanning backend; TODO confirm.
tfi.scan(fpr=0.00, motifs=None, verbose=True, n_cpus=2)
# Save tfinfo object
# tfi.to_hdf5(file_path=f"{out_path}/out_files/tfi.celloracle.tfinfo")

No motif data entered. Loading default motifs for your species ...
 Default motif for vertebrate: gimme.vertebrate.v5.0. 
 For more information, please see https://gimmemotifs.readthedocs.io/en/master/overview.html 

Initiating scanner... 



2025-05-16 11:53:27,296 - DEBUG - using background: genome hg38 with size 200


TypeError: set_background() got an unexpected keyword argument 'length'

In [20]:
# One entry per experiment; every list below has one element per experiment.
input_dict = dict(
    experiment=['PRDM1_KO', 'IRF4_KO', 'GC_PB', 'PB_ABC', 'GC_ABC'],
    slide_starting_genes=[4472, 4500, 4725, 3420, 4603],
    clusters_of_interest=[['1','2','3','7'], ['1','2','3','7'], ['7','3'], ['1','7'], ['1','3']],
    order_fr_clust=[[1], [1], [2], [2], [2]],
    order_fr_tfcomb=[[1], [1], [1,2], [2], [2]],
    weight=['strength'] * 5,
)
input_df = pd.DataFrame(input_dict)

#### Assign the input parameters
i = 2  # row of input_df to use
experiment, slide_starting_genes, clusters_of_interest, order_fr_clust, order_fr_tfcomb, weight = (
    input_df[column][i]
    for column in ('experiment', 'slide_starting_genes', 'clusters_of_interest',
                   'order_fr_clust', 'order_fr_tfcomb', 'weight')
)

#### Read TFs
cluster_fusion = ('7','3')

def _read_tf_names(prefix):
    """Return the index labels of the lf_enrich CSV for `prefix`, `cluster_fusion` and `experiment`."""
    return pd.read_csv(f"{wd}/out_data/lf_enrich/out_files/{prefix}_{cluster_fusion}_{experiment}.csv", header=0, index_col=0).index.values

slide_tot_TF = _read_tf_names("SLIDE_LF")
net_match_TF = _read_tf_names("Net_match")
net_rnd_TF = _read_tf_names("Net_rnd")

In [22]:
# Display the TF names read from the index of the Net_rnd CSV.
net_rnd_TF

array(['EZH1', 'GTF2B', 'BATF3', 'HDX', 'ID3', 'ESR2', 'ZNF254', 'MAML3',
       'TFAP2E', 'HIVEP2', 'TET3', 'ID2', 'PHF21A', 'MYC', 'FOSB',
       'MLLT3', 'EBF1', 'BBX', 'NFKB2', 'RUNX1', 'HIC2', 'PRDM4', 'SMAD3',
       'CLOCK', 'CTNNB1', 'HDAC4', 'ZNF407', 'ZNF165', 'KDM5B', 'NFIA',
       'ZEB2', 'EGR1', 'FOXO3', 'JUNB', 'DCP1A'], dtype=object)

In [9]:
# Display the table returned by ma.integrate_tss_peak_with_cicero (`peak` is an alias of `integrated`).
peak

Unnamed: 0,peak_id,gene_short_name,coaccess
0,chr10_100006032_100006913,BLOC1S2,0.189921
1,chr10_100006032_100006913,CHUK,0.018551
2,chr10_100006032_100006913,DNMBP,0.129631
3,chr10_100006032_100006913,ERLIN1,0.110668
4,chr10_100006032_100006913,SCD,0.012435
...,...,...,...
530101,chrY_7273456_7274293,PRKY,1.000000
530102,chrY_7722551_7722998,PRKY,0.030742
530103,chrY_7724032_7724886,PRKY,0.402823
530104,chrY_7728766_7729649,PRKY,0.041167


In [6]:
# Display the table read from processed_peak_file.csv.
processed_peaks

Unnamed: 0,peak_id,gene_short_name
0,chr10_100009467_100010353,DNMBP
1,chr10_100185591_100186499,ERLIN1
2,chr10_100229001_100229836,CHUK
3,chr10_100238201_100239112,SNORA12
4,chr10_100267176_100268025,CWF19L1
...,...,...
21153,chrY_2934812_2935448,ZFY
21154,chrY_2935576_2936328,ZFY
21155,chrY_3002530_3003318,LINC00278
21156,chrY_6910256_6911183,TBL1Y


In [7]:
# Experiment settings, one list element per experiment; all experiments use the 'coef_mean' weight.
input_dict = dict(
    experiment=['PRDM1_KO', 'IRF4_KO', 'GC_PB', 'PB_ABC', 'GC_ABC'],
    slide_starting_genes=[4472, 4500, 4725, 3420, 4603],
    clusters_of_interest=[['1','2','3','7'], ['1','2','3','7'], ['7','3'], ['1','7'], ['1','3']],
    order_fr_clust=[[1], [1], [2], [2], [2]],
    order_fr_tfcomb=[[1], [1], [1,2], [1], [1]],
    weight=['coef_mean'] * 5,
)
input_df = pd.DataFrame(input_dict)

In [8]:
def get_binding_peaks_for_motif_pair(motif_name1, motif_name2):
    """Return the Cicero peak indices bound by each of two motifs.

    For each motif, the tab-separated file
    ``{out_path}/out_files/txt_files_for_cicero/{motif}.txt`` is read, its
    unique ``chrom``/``start``/``end`` rows are joined into ``chrom_start_end``
    keys and mapped through the global ``cicero_peaks_index``. Rows that have
    no entry in that mapping are dropped.

    Returns a two-element list of int32 arrays, in the order
    ``[motif_name1, motif_name2]``.
    """
    def _peak_indices(motif):
        # Unique binding regions for this motif
        sites = pd.read_csv(f"{out_path}/out_files/txt_files_for_cicero/{motif}.txt", sep='\t', header=0)
        sites = sites[['chrom', 'start', 'end']].drop_duplicates().reset_index(drop=True)
        # Same "chrom_start_end" key format as the keys of cicero_peaks_index
        sites['concat'] = sites['chrom'].astype(str) + "_" + sites['start'].astype(str) + "_" + sites['end'].astype(str)
        sites['peak_index'] = sites['concat'].map(cicero_peaks_index)
        mapped = sites.dropna()
        return np.array(mapped['peak_index'].values, dtype=np.int32)

    return [_peak_indices(name) for name in (motif_name1, motif_name2)]

def motif_peak_pair_intersection_cicero_connections(motif1_peak_idx_chunk):
    """Collect the Cicero connections between a chunk of motif-1 peaks and all motif-2 peaks.

    Every index in ``motif1_peak_idx_chunk`` is paired with every index in the
    global ``motif2_peak_idx``. Each pair is ordered smaller-index-first and
    looked up in the global ``index_value_dict``; pairs without an entry are
    skipped.

    Returns a list of ``(smaller_idx, larger_idx, value)`` tuples.
    """
    hits = []
    for first in motif1_peak_idx_chunk:
        for second in motif2_peak_idx:
            ordered = (first, second) if first <= second else (second, first)
            if ordered in index_value_dict:
                hits.append((*ordered, index_value_dict[ordered]))
    return hits

In [9]:
# Cluster pairs available for analysis; the loop below currently processes only the first one.
cluster_pairs = [('GC-1','PB-2' ),('PB-2', 'ActB-2'),('GC-1','ActB-2')]
# Minimum |coaccess| for a connection to be counted.
COACCESS_THRESHOLD = 0.05
# os.cpu_count() may return None; fall back to a single worker so array_split still gets an int.
num_chunks = os.cpu_count() or 1
for cluster_pair in cluster_pairs[0:1]:
    # Curating Cicero results
    corr_peak_for_motif_pair = []
    # NOTE(review): `comb_of_interest` is not defined in the visible cells; it must exist before this cell runs.
    for preferred_motif_set_name in tqdm(comb_of_interest[0:1], desc='Processing combinations for Cicero'):
        # motif2_peak_idx is read as a global by the worker function, so it must be bound
        # here before the pool is created.
        motif1_peak_idx, motif2_peak_idx = get_binding_peaks_for_motif_pair(*preferred_motif_set_name)
        union_peak_index = set(np.concatenate([motif1_peak_idx, motif2_peak_idx]))
        print(f"STEP3: Binding peaks for motifs {preferred_motif_set_name} have been loaded")
        # Split motif-1 peaks across workers; each worker scans all motif-2 peaks.
        chunks = np.array_split(motif1_peak_idx, num_chunks)
        with mp.Pool(num_chunks) as pool:
            results = pool.map(motif_peak_pair_intersection_cicero_connections, chunks)
        print(f"STEP4: Pair of peaks found in cicero connections has been counted for {preferred_motif_set_name}")
        result = [item for sublist in results for item in sublist]
        cnt_thr_pos_idx = sum(1 for x in result if abs(x[2]) >= COACCESS_THRESHOLD)
        # Ratio of thresholded connections to the number of distinct peaks bound by either motif.
        # An empty union (no motif peak present in the Cicero index) yields NaN instead of ZeroDivisionError.
        jaccard_index = cnt_thr_pos_idx / len(union_peak_index) if union_peak_index else float('nan')
        corr_peak_for_motif_pair.append((*preferred_motif_set_name, cnt_thr_pos_idx, len(union_peak_index), jaccard_index))

    # # Save the results
    # cicero_results_df = pd.DataFrame(corr_peak_for_motif_pair, columns=['TF1', 'TF2', 'cnt_thr_pos_idx', 'union_peak_index', 'jaccard_index'])
    # cicero_results_df.to_csv(f"{out_path}/out_files/cicero_correlation_results.csv", index=False)
    # print("Results have been saved")



NameError: name 'cluster_pairs' is not defined

# Plotting Code

In [70]:
cluster_pairs = [('GC-1','PB-2' ),('PB-2', 'ActB-2'),('GC-1','ActB-2')]
cluster_pair = cluster_pairs[0]

# SLIDE score of every enriched TF pair, keyed by (TF1, TF2).
slide_enriched_table = pd.read_csv(f"{wd}/out_data/comb_ctrl/out_files/SLIDE_LF_enriched_double_TFs_{cluster_pair}.csv")
slide_enriched_tfs_scores = slide_enriched_table.set_index(['TF1', 'TF2'])['Score'].to_dict()

# CEseek: keep Max_Score only for combinations with P-value < 0.1.
ceseek_table = pd.read_csv(f"{out_path}/out_files/CEseek_results.csv").set_index(['Combination'])
ceseek_significant = ceseek_table['Max_Score'][ceseek_table['P-value'] < 0.1].to_dict()
# NOTE(review): eval() executes whatever text is stored in the 'Combination' column. If the keys
# are plain tuple literals, ast.literal_eval would be the safer parser; confirm the file format first.
ceseek_scores = {eval(combination): max_score for combination, max_score in ceseek_significant.items()}

# Cicero: thresholded connection count divided by the 'possibilities' column, keyed by (TF1, TF2).
# NOTE(review): the commented-out writer for cicero_correlation_results.csv lists a
# 'union_peak_index' column rather than 'possibilities'; verify the columns of the file on disk.
cicero_table = pd.read_csv(f"{out_path}/out_files/cicero_correlation_results.csv")
cicero_table['ratio'] = cicero_table['cnt_thr_pos_idx'] / cicero_table['possibilities']
cicero_correlation_scores = cicero_table.set_index(['TF1', 'TF2'])['ratio'].to_dict()

In [71]:
def _tf_pairs(table):
    """Return the (TF1, TF2) rows of `table` as a list of tuples."""
    return list(zip(table['TF1'], table['TF2']))

# Control TF-pair tables for the selected cluster pair.
shuffle_enriched_tfs = pd.read_csv(f"{wd}/out_data/comb_ctrl/out_files/shuffled_SLIDE_LF_enriched_double_TFs_{cluster_pair}.csv")
network_score_tfs = pd.read_csv(f"{wd}/out_data/comb_ctrl/out_files/CO_network_score_double_TFs_{cluster_pair}.csv")
random_tfs = pd.read_csv(f"{wd}/out_data/comb_ctrl/out_files/random_double_TFs_{cluster_pair}.csv")

# Pair lists: the actual pairs are the keys of the SLIDE score dictionary.
actual_pairs = [*slide_enriched_tfs_scores]
shuffle_enriched_tfs_pairs = _tf_pairs(shuffle_enriched_tfs)
network_score_tfs_pairs = _tf_pairs(network_score_tfs)
random_tfs_pairs = _tf_pairs(random_tfs)

In [76]:
def get_binding_peaks_for_motif(motif):
    """Load the unique binding regions of one motif and label them as ``chrom:start-end``.

    Reads the ``chrom``, ``start`` and ``end`` columns of the tab-separated file
    ``{out_path}/out_files/txt_files_for_cicero/{motif}.txt``, drops duplicate
    rows and adds a ``concat`` column holding ``"{chrom}:{start}-{end}"``.

    Parameters
    ----------
    motif : str
        Motif name, used as the file stem.

    Returns
    -------
    pandas.DataFrame
        Columns ``chrom``, ``start``, ``end`` and ``concat``. An empty file
        body yields an empty frame with the same columns.
    """
    motif_df = pd.read_csv(f"{out_path}/out_files/txt_files_for_cicero/{motif}.txt", sep='\t', usecols=['chrom', 'start', 'end']).drop_duplicates()
    # Vectorised string concatenation: avoids a Python call per row and also works on an
    # empty table, where a row-wise apply() returns a DataFrame that cannot be assigned to a column.
    motif_df['concat'] = (
        motif_df['chrom'].astype(str) + ":" + motif_df['start'].astype(str) + "-" + motif_df['end'].astype(str)
    )
    return motif_df
def rp_scores_for_motif_pair(preferred_motif_set_name):
    """Mean of the best ``rp_peak_score`` per peak over the peaks bound by two motifs.

    For each of the first two motif names in ``preferred_motif_set_name``, the
    rows of the global ``rp_scores`` whose ``gene_ids`` match one of the motif's
    ``chrom:start-end`` labels are selected, and only the highest
    ``rp_peak_score`` per ``gene_ids`` value is kept. The scores of both motifs
    are pooled and averaged with ``np.mean``.
    """
    motif_names = (preferred_motif_set_name[0], preferred_motif_set_name[1])
    bound_peaks = [get_binding_peaks_for_motif(name) for name in motif_names]

    scores = []
    for peaks_of_motif in bound_peaks:
        matching = rp_scores[rp_scores['gene_ids'].isin(peaks_of_motif['concat'])]
        # Sorting descending before drop_duplicates keeps the top score of each peak.
        best_per_peak = matching.sort_values('rp_peak_score', ascending=False).drop_duplicates('gene_ids')
        scores.extend(best_per_peak['rp_peak_score'].values)
    return np.mean(scores)
def calculate_scores(pairs, ceseek_scores, cicero_correlation_scores):
    """Assemble CEseek, Cicero and RP-peak scores for a list of TF pairs.

    Parameters
    ----------
    pairs : list of tuple
        TF pairs to score; each pair is used as a dictionary key.
    ceseek_scores : dict
        Maps a pair to its CEseek score.
    cicero_correlation_scores : dict
        Maps a pair to its Cicero correlation score.

    Returns
    -------
    pandas.DataFrame
        One row per pair with the columns ``Pair``, ``CEseek_Score``,
        ``Cicero_Correlation_Score`` and ``RP_Peak_Score``. A score that is not
        available for a pair is ``None``/NaN.

    Notes
    -----
    RP peak scores are computed with ``rp_scores_for_motif_pair`` in a pool of
    10 worker processes. If that step fails for any reason, the error is
    printed and the whole ``RP_Peak_Score`` column is filled with ``None``.
    """
    # dict.get returns None for pairs without a score.
    ceseek_scores_list = [ceseek_scores.get(pair) for pair in pairs]
    cicero_correlation_scores_list = [cicero_correlation_scores.get(pair) for pair in pairs]

    try:
        with mp.Pool(10) as pool:
            rp_peak_scores_list = pool.map(rp_scores_for_motif_pair, pairs)
    except Exception as e:
        print(e)
        # One placeholder per pair keeps every column the same length; a single
        # None made the DataFrame constructor fail for more than one pair.
        rp_peak_scores_list = [None] * len(pairs)

    return pd.DataFrame({
        'Pair': pairs,
        'CEseek_Score': ceseek_scores_list,
        'Cicero_Correlation_Score': cicero_correlation_scores_list,
        'RP_Peak_Score': rp_peak_scores_list
    })

# Calculate results for each set of pairs
results_df = calculate_scores(actual_pairs, ceseek_scores, cicero_correlation_scores)
shuffle_results_df = calculate_scores(shuffle_enriched_tfs_pairs, ceseek_scores, cicero_correlation_scores)
network_results_df = calculate_scores(network_score_tfs_pairs, ceseek_scores, cicero_correlation_scores)
random_results_df = calculate_scores(random_tfs_pairs, ceseek_scores, cicero_correlation_scores)
print(len(results_df), len(shuffle_results_df), len(network_results_df), len(random_results_df))

103 103 103 103


In [77]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_all_in_one(results_df, shuffle_results_df, network_results_df, random_results_df):
    """Draw box plots of the three scores for the four TF-pair sets and save them as HTML.

    The figure has one panel per score column (``CEseek_Score``,
    ``Cicero_Correlation_Score``, ``RP_Peak_Score``). Each panel holds one box
    per pair set (Actual, Shuffle, Network, Random) built from the non-missing
    values of that column; the legend label reports how many values were used.
    CEseek scores are plotted as ``log10(score + 1)``, the other two as-is.

    The figure is shown and then written to
    ``{out_path}/figures/CEseek_Cicero_Scores.html``.
    """
    metrics = ("CEseek_Score", "Cicero_Correlation_Score", "RP_Peak_Score")
    groups = (
        ('Actual', results_df, 'blue'),
        ('Shuffle', shuffle_results_df, 'orange'),
        ('Network', network_results_df, 'green'),
        ('Random', random_results_df, 'red'),
    )

    fig = make_subplots(rows=1, cols=3, subplot_titles=metrics)
    for panel, metric in enumerate(metrics, start=1):
        for label, frame, colour in groups:
            observed = frame[metric].dropna()
            # Only the CEseek panel is log-transformed.
            y_values = np.log10(observed + 1) if metric == "CEseek_Score" else observed
            fig.add_trace(
                go.Box(y=y_values, name=f'{label} (n={observed.shape[0]})', boxmean=True, marker_color=colour),
                row=1, col=panel,
            )

    fig.update_layout(height=600, width=1800, title_text="CEseek and Cicero Correlation Scores")
    fig.show()
    fig.write_html(f"{out_path}/figures/CEseek_Cicero_Scores.html")

# Render and save the three-panel box plot for the actual, shuffled, network and random pair sets.
plot_all_in_one(results_df, shuffle_results_df, network_results_df, random_results_df)