In [None]:
# Temporal - Intersection between enhancers

In [18]:
# Cell lines whose enhancer BED files are read by the loading cell
CELLS = [
    'GM12878',
    'HepG2',
    'K562',
    'A549',
    'H1',
]

In [19]:
import pandas as pd

# Load the enhancer BED file of every cell line in CELLS, keyed by cell name
df_enhancers = {}
for origin_cell in CELLS:
    bed_path = f'data/{origin_cell}_enhancers_hg38.bed'
    enh = pd.read_csv(bed_path, sep='\t', header=None)

    # Column 0 is the chromosome name; keep only rows whose name has no underscore
    has_underscore = enh[0].str.contains('_')
    enh = enh[~has_underscore]

    df_enhancers[origin_cell] = enh


In [20]:
# Pair of cell lines selected for comparison.
# NOTE(review): the overlap cell that follows hard-codes 'GM12878' and 'HepG2'
# instead of reading these two names — confirm which pair is intended.
cell1, cell2 = 'GM12878', 'H1'

In [21]:
import numpy as np
from tqdm import tqdm

# Minimum fraction of BOTH intervals that the shared region has to cover
# for a pair to be reported (reciprocal overlap threshold).
MIN_RECIPROCAL_OVERLAP = 0.5

# NOTE(review): the compared pair is hard-coded to GM12878 vs HepG2, while an
# earlier cell defines `cell1`/`cell2` as 'GM12878'/'H1' and is never read here.
# Confirm which pair is intended before relying on the reported count.

# Rename columns for clarity
df_1 = df_enhancers['GM12878'].rename(columns={0: 'chrom', 1: 'start', 2: 'end', 3: 'score'})
df_2 = df_enhancers['HepG2'].rename(columns={0: 'chrom', 1: 'start', 2: 'end', 3: 'score'})

# One dict per accepted (interval from df_1, interval from df_2) pair
overlap_pairs = []

# Only chromosomes present in both tables can contribute pairs
common_chroms = set(df_1['chrom']).intersection(set(df_2['chrom']))

for chrom in tqdm(common_chroms):
    # Select intervals for the current chromosome
    intervals_1 = df_1[df_1['chrom'] == chrom]
    intervals_2 = df_2[df_2['chrom'] == chrom].sort_values('start')

    # Loop-invariant: pull the second table's coordinates into NumPy arrays once
    # per chromosome instead of filtering a DataFrame for every row of df_1.
    chrom_starts_2 = intervals_2['start'].to_numpy()
    chrom_ends_2 = intervals_2['end'].to_numpy()

    # .tolist() yields plain Python scalars, so the values stored in the result
    # dicts have the same type as the ones iterrows() used to hand out.
    coords_1 = zip(intervals_1['start'].tolist(), intervals_1['end'].tolist())

    for start_1, end_1 in coords_1:
        length_1 = end_1 - start_1

        # Candidates: intervals of the second table that intersect [start_1, end_1)
        candidate_mask = (chrom_ends_2 > start_1) & (chrom_starts_2 < end_1)
        if not candidate_mask.any():
            continue

        starts_2 = chrom_starts_2[candidate_mask]
        ends_2 = chrom_ends_2[candidate_mask]
        lengths_2 = ends_2 - starts_2

        # Length of the shared region for every candidate at once
        overlap = np.minimum(end_1, ends_2) - np.maximum(start_1, starts_2)

        # Keep pairs where the shared region covers at least the threshold
        # fraction of the df_1 interval AND of the df_2 interval.
        valid = (
            (overlap > 0)
            & (overlap / length_1 >= MIN_RECIPROCAL_OVERLAP)
            & (overlap / lengths_2 >= MIN_RECIPROCAL_OVERLAP)
        )
        if not np.any(valid):
            continue

        # Add each valid overlap to the result list
        for start_2, end_2, ov in zip(starts_2[valid], ends_2[valid], overlap[valid]):
            overlap_pairs.append({
                'chrom': chrom,
                'start_1': start_1,
                'end_1': end_1,
                'start_2': start_2,
                'end_2': end_2,
                'overlap': ov
            })

total_count = len(overlap_pairs)
# The threshold is formatted as a percentage so the message tracks the constant
print(f"Total overlapping pairs (>={MIN_RECIPROCAL_OVERLAP:.0%} reciprocal overlap):", total_count)


100%|██████████| 23/23 [00:15<00:00,  1.53it/s]

Total overlapping pairs (>=50% reciprocal overlap): 5803





In [22]:
# NOTE(review): `df_gm` is not assigned in any cell visible here — presumably the
# GM12878 enhancer table; confirm it is defined on a fresh Restart & Run All.
df_gm

Unnamed: 0,chrom,start,end,score
0,chr1,837920,838720,7.866088
1,chr1,843600,844070,6.472419
2,chr1,864720,866620,11.010675
3,chr1,890290,891030,6.114487
4,chr1,904090,907210,8.848865
...,...,...,...,...
50496,chrX,155216573,155217383,9.207927
50497,chrX,155229448,155230378,5.836427
50498,chrX,155230838,155231708,8.231231
50499,chrX,155611949,155612789,7.930245


In [23]:
# NOTE(review): `df_hep` is not assigned in any cell visible here — presumably the
# HepG2 enhancer table; confirm it is defined on a fresh Restart & Run All.
df_hep

Unnamed: 0,chrom,start,end,score
0,chr1,24890,29320,11.858691
1,chr1,264429,268369,9.422922
2,chr1,777020,778280,14.141722
3,chr1,787820,792000,9.859360
4,chr1,859520,868070,7.766441
...,...,...,...,...
54172,chrY,56673214,56771509,13.824542
54173,chrY,56824923,56852143,16.504257
54174,chrY,56853803,56858233,9.962317
54175,chrY,56858333,56877703,7.755760


In [24]:
# Echo the overlap records collected by the reciprocal-overlap cell.
# NOTE(review): this displays the entire list; a slice such as overlap_pairs[:5]
# or pd.DataFrame(overlap_pairs).head() would keep the saved output short.
overlap_pairs

[{'chrom': 'chr11',
  'start_1': 289240,
  'end_1': 289800,
  'start_2': 289240,
  'end_2': 289800,
  'overlap': 560},
 {'chrom': 'chr11',
  'start_1': 295730,
  'end_1': 298200,
  'start_2': 295690,
  'end_2': 298200,
  'overlap': 2470},
 {'chrom': 'chr11',
  'start_1': 308440,
  'end_1': 308990,
  'start_2': 308440,
  'end_2': 308990,
  'overlap': 550},
 {'chrom': 'chr11',
  'start_1': 315270,
  'end_1': 319670,
  'start_2': 315270,
  'end_2': 319670,
  'overlap': 4400},
 {'chrom': 'chr11',
  'start_1': 348140,
  'end_1': 351720,
  'start_2': 347420,
  'end_2': 351880,
  'overlap': 3580},
 {'chrom': 'chr11',
  'start_1': 354420,
  'end_1': 358090,
  'start_2': 351980,
  'end_2': 359050,
  'overlap': 3670},
 {'chrom': 'chr11',
  'start_1': 450640,
  'end_1': 451940,
  'start_2': 450640,
  'end_2': 452880,
  'overlap': 1300},
 {'chrom': 'chr11',
  'start_1': 788850,
  'end_1': 790020,
  'start_2': 788660,
  'end_2': 790020,
  'overlap': 1170},
 {'chrom': 'chr11',
  'start_1': 804460,
 