**[iEnhancer-2L](https://academic.oup.com/bioinformatics/article/32/3/362/1744331)** ➡ H1ES, **K562**, **GM12878**, HepG2, **HUVEC**, HSMM, NHLF, **NHEK**, HMEC

**[TargetFinder](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4910881/)** ➡ **GM12878**, **K562**, IMR90, HeLa-S3, **HUVEC**, **NHEK**

**[Xin Zeng (2021)](https://www.frontiersin.org/articles/10.3389/fgene.2021.681259/full#B4)** ➡ **K562** (sequence length=500 in US_UU dataset) and [CAGE](https://academic.oup.com/genetics/article/211/4/1205/5931517) (sequence length=600 in CAGE dataset)

In [1]:
import os
import sys
import random
import re
import numpy as np
import pandas as pd
import pickle as pkl
from IPython.display import display
from Bio import SeqIO

In [37]:
def load_ep_pairs(cell_line):

    ep_pairs = pd.read_csv('https://raw.githubusercontent.com/shwhalen/' \
                                   'targetfinder/master/paper/targetfinder/{}/' \
                                   'output-ep/pairs.csv'.format(cell_line))
    ep_pairs = ep_pairs[ep_pairs['label'] == 1].reset_index(drop=True)
    ep_pairs = ep_pairs[['enhancer_name', 'enhancer_chrom', 'enhancer_start', 'enhancer_end',
                        'promoter_name', 'promoter_chrom', 'promoter_start', 'promoter_end', 'label']]
    return ep_pairs

In [40]:
def load_ep_sequences(cell_line):
    # DOWNLOAD HUMAN GENOME v37 (3.2 Gb)
    # Older version but compatible with genomic coordinates of TargetFinder dataset
    # https://www.ncbi.nlm.nih.gov/projects/genome/guide/human/index.shtml
    # https://github.com/shwhalen/targetfinder/tree/master/paper/targetfinder

    print('Parsing GRCh37 genome...')
    hg37 = SeqIO.to_dict(SeqIO.parse('data/GRCh37_latest_genomic.fna', 'fasta'))

    RefSeqIDs = []

    for k in hg37.keys():
        if k.startswith('NC_0000'):
            RefSeqIDs.append(hg37[k].id)

    chromosomes = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', \
               'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', \
               'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY']

    RefSeqDict = {chromosomes[i]: RefSeqIDs[i] for i in range(len(chromosomes))}

    enhancer_sequences = []
    promoter_sequences = []
    
    print('Loading EP pairs for {} dataset...'.format(cell_line))
    ep_pairs = load_ep_pairs(cell_line)
    
    n = len(ep_pairs)

    print('Getting DNA sequences for {} EP pairs...'.format(n))

    for i in range(n):
        enhancer_seq_id = ep_pairs['enhancer_chrom'][i]
        enhancer_seq_start = ep_pairs['enhancer_start'][i] - 1
        enhancer_seq_end = ep_pairs['enhancer_end'][i]

        promoter_seq_id = ep_pairs['promoter_chrom'][i]
        promoter_seq_start = ep_pairs['promoter_start'][i] - 1
        promoter_seq_end = ep_pairs['promoter_end'][i]
        
        enhancer_sequences.append(str(hg37[RefSeqDict[enhancer_seq_id]]
                                    .seq[enhancer_seq_start:enhancer_seq_end]).upper())

        promoter_sequences.append(str(hg37[RefSeqDict[promoter_seq_id]]
                                    .seq[promoter_seq_start:promoter_seq_end]).upper())

    ep_sequences = ep_pairs
    ep_sequences['enhancer_seq'] = enhancer_sequences
    ep_sequences['promoter_seq'] = promoter_sequences
    
    return ep_sequences

In [52]:
ep_sequences_K562 = load_ep_sequences('K562')
print("{} unique enhancers".format(len(set(ep_sequences_K562['enhancer_name']))))

Parsing GRCh37 genome...
Loading EP pairs for K562 dataset...
Getting DNA sequences for 1977 EP pairs...
1742 unique enhancers


In [53]:
ep_sequences_combined = load_ep_sequences('combined')
print("{} unique enhancers".format(len(set(ep_sequences_combined['enhancer_name']))))

Parsing GRCh37 genome...
Loading EP pairs for combined dataset...
Getting DNA sequences for 7084 EP pairs...
6493 unique enhancers


In [45]:
def compare_sequences(ep_sequences, seq_type, fasta_file):

    data = re.split(r'(^>.*)', ''.join(open(fasta_file).readlines()), flags=re.M)
    fids = []
    seqs = []
    for i in range(2, len(data), 2):
        fids.append(data[i-1][1:])
        seqs.append(data[i].replace('\n', '').replace(' ', ''))

    n_matches = 0

    for fid in fids:
        chrom = fid.split('_')[0]
        subseq = seqs[fids.index(fid)]

        match = ep_sequences.loc[ep_sequences[seq_type + '_seq'].str.contains(subseq)][seq_type + '_name'].values
        if len(match) > 0:
            n_matches = n_matches + len(match)
            for enh in match:
                print(fid, "->", enh)

    print("\n{} out of {} -> {:.2f}%\n".format(n_matches,len(fids), n_matches*100/len(fids)))

# US_UU DATASET (1,884 US=Promoters + 4,978 UU=Enhancers)

In [46]:
compare_sequences(ep_sequences_K562, 'enhancer', 'data/K562_UU.fa')

seq212 -> K562|chr1:47010272-47011240
seq313 -> K562|chr1:110305489-110306266
seq318 -> K562|chr1:110521957-110522800
seq366 -> K562|chr1:150585520-150586977
seq462 -> K562|chr1:181110472-181111800
seq549 -> K562|chr1:226189495-226191833
seq585 -> K562|chr1:235009928-235012400
seq926 -> K562|chr11:34392917-34393788
seq1008 -> K562|chr11:64765604-64766888
seq1218 -> K562|chr12:48569800-48570449
seq1963 -> K562|chr17:25928915-25930713
seq1963 -> K562|chr17:25928915-25930713
seq2107 -> K562|chr17:61510000-61511400
seq2408 -> K562|chr19:41713200-41713861
seq2442 -> K562|chr19:49164006-49164777
seq2526 -> K562|chr2:28806513-28807854
seq2527 -> K562|chr2:28806513-28807854
seq3155 -> K562|chr3:33832761-33834227
seq3300 -> K562|chr3:152768617-152770531
seq3374 -> K562|chr3:194838472-194839392
seq3459 -> K562|chr4:56595589-56597697
seq3875 -> K562|chr6:16004915-16006629
seq3892 -> K562|chr6:26024477-26026400
seq3904 -> K562|chr6:26318581-26321600
seq3904 -> K562|chr6:26318581-26321600
seq3939 -

In [47]:
compare_sequences(ep_sequences_K562, 'promoter', 'data/K562_US.fa')

seq28 -> K562|chr1:27113600-27115400
seq28 -> K562|chr1:27113600-27115400
seq28 -> K562|chr1:27113600-27115400
seq29 -> K562|chr1:27113600-27115400
seq29 -> K562|chr1:27113600-27115400
seq29 -> K562|chr1:27113600-27115400
seq62 -> K562|chr1:44678735-44679850
seq63 -> K562|chr1:45265274-45266912
seq63 -> K562|chr1:45265274-45266912
seq64 -> K562|chr1:45265274-45266912
seq64 -> K562|chr1:45265274-45266912
seq101 -> K562|chr1:110036332-110037374
seq103 -> K562|chr1:110527058-110528650
seq103 -> K562|chr1:110527058-110528650
seq103 -> K562|chr1:110527058-110528650
seq103 -> K562|chr1:110527058-110528650
seq103 -> K562|chr1:110527058-110528650
seq103 -> K562|chr1:110527058-110528650
seq103 -> K562|chr1:110527058-110528650
seq103 -> K562|chr1:110527058-110528650
seq103 -> K562|chr1:110527058-110528650
seq103 -> K562|chr1:110527058-110528650
seq149 -> K562|chr1:174967666-174971619
seq149 -> K562|chr1:174967666-174971619
seq157 -> K562|chr1:181057130-181060638
seq157 -> K562|chr1:181057130-181

seq1250 -> K562|chr3:49208518-49210008
seq1250 -> K562|chr3:49208518-49210008
seq1250 -> K562|chr3:49208518-49210008
seq1262 -> K562|chr3:100053290-100054600
seq1309 -> K562|chr3:194304127-194305518
seq1309 -> K562|chr3:194304127-194305518
seq1377 -> K562|chr5:36874561-36879260
seq1377 -> K562|chr5:36874561-36879260
seq1377 -> K562|chr5:36874561-36879260
seq1378 -> K562|chr5:36874561-36879260
seq1378 -> K562|chr5:36874561-36879260
seq1378 -> K562|chr5:36874561-36879260
seq1382 -> K562|chr5:43039456-43043725
seq1382 -> K562|chr5:43039456-43043725
seq1387 -> K562|chr5:56203878-56206884
seq1419 -> K562|chr5:128429756-128431573
seq1438 -> K562|chr5:139943113-139945570
seq1438 -> K562|chr5:139943113-139945570
seq1474 -> K562|chr6:12007723-12011731
seq1492 -> K562|chr6:28109166-28110933
seq1492 -> K562|chr6:28109166-28110933
seq1493 -> K562|chr6:28109166-28110933
seq1493 -> K562|chr6:28109166-28110933
seq1506 -> K562|chr6:32934895-32942999
seq1506 -> K562|chr6:32934895-32942999
seq1506 -> K5

# CAGE DATASET (38,538 enhancers + 27,227 promoters)

In [49]:
compare_sequences(ep_sequences_combined, 'enhancer', 'data/fantom_enhancers_600.fa')

chr1:26616473-26617073 -> GM12878|chr1:26612919-26617752
chr1:26616955-26617555 -> GM12878|chr1:26612919-26617752
chr1:33564802-33565402 -> IMR90|chr1:33563000-33566200
chr1:85794588-85795188 -> HeLa-S3|chr1:85793617-85795292
chr1:85795773-85796373 -> HeLa-S3|chr1:85795384-85796995
chr1:181110567-181111167 -> K562|chr1:181110472-181111800
chr1:181110567-181111167 -> HeLa-S3|chr1:181110053-181111400
chr1:181110567-181111167 -> HeLa-S3|chr1:181110053-181111400
chr1:226189700-226190300 -> K562|chr1:226189495-226191833
chr1:226190332-226190932 -> K562|chr1:226189495-226191833
chr1:234630922-234631522 -> HeLa-S3|chr1:234630678-234631762
chr1:234630922-234631522 -> HeLa-S3|chr1:234630678-234631762
chr1:235106741-235107341 -> GM12878|chr1:235104658-235109008
chr10:3930139-3930739 -> IMR90|chr10:3929800-3930800
chr10:63697723-63698323 -> GM12878|chr10:63696838-63700670
chr10:63805635-63806235 -> GM12878|chr10:63802860-63807000
chr10:73649698-73650298 -> GM12878|chr10:73649600-73650345
chr10:89

In [50]:
compare_sequences(ep_sequences_combined, 'promoter', 'data/fantom_promoters_600.fa')

chr1:1711368-1711968 -> GM12878|chr1:1708321-1712200
chr1:2344868-2345468 -> GM12878|chr1:2343115-2345902
chr1:10003032-10003632 -> K562|chr1:10002482-10004387
chr1:23857112-23857712 -> K562|chr1:23855840-23858290
chr1:26146169-26146769 -> K562|chr1:26145209-26148000
chr1:26146169-26146769 -> K562|chr1:26145209-26148000
chr1:26146169-26146769 -> K562|chr1:26145209-26148000
chr1:26146169-26146769 -> K562|chr1:26145209-26148000
chr1:26146169-26146769 -> K562|chr1:26145209-26148000
chr1:26146169-26146769 -> K562|chr1:26145209-26148000
chr1:26146169-26146769 -> K562|chr1:26145209-26148000
chr1:26146169-26146769 -> K562|chr1:26145209-26148000
chr1:26146169-26146769 -> K562|chr1:26145209-26148000
chr1:26146169-26146769 -> K562|chr1:26145209-26148000
chr1:27718750-27719350 -> K562|chr1:27718492-27719934
chr1:27718750-27719350 -> K562|chr1:27718492-27719934
chr1:28585614-28586214 -> HeLa-S3|chr1:28585604-28587012
chr1:28585614-28586214 -> HeLa-S3|chr1:28585604-28587012
chr1:28585614-28586214 -

chr1:153949685-153950285 -> K562|chr1:153948521-153951375
chr1:153949685-153950285 -> K562|chr1:153948521-153951375
chr1:153949685-153950285 -> K562|chr1:153948521-153951375
chr1:153949685-153950285 -> K562|chr1:153948521-153951375
chr1:153949685-153950285 -> K562|chr1:153948521-153951375
chr1:153949685-153950285 -> K562|chr1:153948521-153951375
chr1:153949685-153950285 -> K562|chr1:153948521-153951375
chr1:153949685-153950285 -> GM12878|chr1:153948386-153951068
chr1:153949685-153950285 -> GM12878|chr1:153948386-153951068
chr1:153949685-153950285 -> HeLa-S3|chr1:153949235-153950624
chr1:153958397-153958997 -> K562|chr1:153957804-153959600
chr1:153958397-153958997 -> K562|chr1:153957804-153959600
chr1:153958397-153958997 -> K562|chr1:153957804-153959600
chr1:153958397-153958997 -> K562|chr1:153957804-153959600
chr1:153958397-153958997 -> K562|chr1:153957804-153959600
chr1:153958397-153958997 -> K562|chr1:153957804-153959600
chr1:153958397-153958997 -> K562|chr1:153957804-153959600
chr1:

chr1:221915517-221916117 -> K562|chr1:221914045-221917076
chr1:221915517-221916117 -> K562|chr1:221914045-221917076
chr1:221915517-221916117 -> K562|chr1:221914045-221917076
chr1:221915517-221916117 -> K562|chr1:221914045-221917076
chr1:221915517-221916117 -> K562|chr1:221914045-221917076
chr1:224517464-224518064 -> K562|chr1:224516901-224518911
chr1:224517464-224518064 -> GM12878|chr1:224517175-224518774
chr1:224517464-224518064 -> GM12878|chr1:224517175-224518774
chr1:224517464-224518064 -> GM12878|chr1:224517175-224518774
chr1:225652639-225653239 -> HeLa-S3|chr1:225652634-225653496
chr1:225652639-225653239 -> HeLa-S3|chr1:225652634-225653496
chr1:225652639-225653239 -> HeLa-S3|chr1:225652634-225653496
chr1:225652639-225653239 -> HeLa-S3|chr1:225652634-225653496
chr1:225652639-225653239 -> HeLa-S3|chr1:225652634-225653496
chr1:226069661-226070261 -> HeLa-S3|chr1:226069491-226071175
chr1:226186769-226187369 -> K562|chr1:226185098-226188200
chr1:226186769-226187369 -> K562|chr1:2261850

chr11:20408750-20409350 -> GM12878|chr11:20408308-20410769
chr11:33182688-33183288 -> GM12878|chr11:33181678-33183846
chr11:33182688-33183288 -> GM12878|chr11:33181678-33183846
chr11:35160317-35160917 -> GM12878|chr11:35160280-35162541
chr11:35160317-35160917 -> GM12878|chr11:35160280-35162541
chr11:47198025-47198625 -> K562|chr11:47197242-47199399
chr11:47198025-47198625 -> HeLa-S3|chr11:47197448-47199416
chr11:57435055-57435655 -> GM12878|chr11:57434081-57437270
chr11:57435055-57435655 -> GM12878|chr11:57434081-57437270
chr11:57435055-57435655 -> GM12878|chr11:57434081-57437270
chr11:57508780-57509380 -> GM12878|chr11:57508350-57510480
chr11:57508780-57509380 -> GM12878|chr11:57508350-57510480
chr11:57508780-57509380 -> GM12878|chr11:57508350-57510480
chr11:58345757-58346357 -> GM12878|chr11:58344172-58347690
chr11:58345757-58346357 -> GM12878|chr11:58344172-58347690
chr11:58910074-58910674 -> GM12878|chr11:58909580-58911279
chr11:58910074-58910674 -> GM12878|chr11:58909580-58911279


chr12:9885350-9885950 -> GM12878|chr12:9885147-9886050
chr12:9885350-9885950 -> GM12878|chr12:9885147-9886050
chr12:13349195-13349795 -> HeLa-S3|chr12:13349189-13350673
chr12:13349195-13349795 -> HeLa-S3|chr12:13349189-13350673
chr12:13349195-13349795 -> HeLa-S3|chr12:13349189-13350673
chr12:25101712-25102312 -> GM12878|chr12:25100540-25102891
chr12:25101712-25102312 -> GM12878|chr12:25100540-25102891
chr12:32908075-32908675 -> GM12878|chr12:32907100-32909319
chr12:32908075-32908675 -> GM12878|chr12:32907100-32909319
chr12:32908075-32908675 -> HeLa-S3|chr12:32907908-32909440
chr12:32908075-32908675 -> HeLa-S3|chr12:32907908-32909440
chr12:32908075-32908675 -> HeLa-S3|chr12:32907908-32909440
chr12:32908075-32908675 -> HeLa-S3|chr12:32907908-32909440
chr12:32908075-32908675 -> HeLa-S3|chr12:32907908-32909440
chr12:32908075-32908675 -> HeLa-S3|chr12:32907908-32909440
chr12:32908075-32908675 -> HeLa-S3|chr12:32907908-32909440
chr12:32908075-32908675 -> HeLa-S3|chr12:32907908-32909440
chr12

chr14:24898413-24899013 -> GM12878|chr14:24898401-24901080
chr14:24898413-24899013 -> GM12878|chr14:24898401-24901080
chr14:24898413-24899013 -> GM12878|chr14:24898401-24901080
chr14:24898413-24899013 -> GM12878|chr14:24898401-24901080
chr14:24898413-24899013 -> GM12878|chr14:24898401-24901080
chr14:35872204-35872804 -> GM12878|chr14:35869528-35875314
chr14:35872204-35872804 -> GM12878|chr14:35869528-35875314
chr14:35872204-35872804 -> GM12878|chr14:35869528-35875314
chr14:35872204-35872804 -> GM12878|chr14:35869528-35875314
chr14:36295188-36295788 -> K562|chr14:36295129-36296456
chr14:50086885-50087485 -> K562|chr14:50085982-50089077
chr14:50086885-50087485 -> K562|chr14:50085982-50089077
chr14:50086885-50087485 -> K562|chr14:50085982-50089077
chr14:67826386-67826986 -> GM12878|chr14:67825673-67827866
chr14:68086234-68086834 -> GM12878|chr14:68086151-68086847
chr14:68086234-68086834 -> GM12878|chr14:68086151-68086847
chr14:68162066-68162666 -> HeLa-S3|chr14:68161466-68162680
chr14:692

chr16:53088520-53089120 -> GM12878|chr16:53088284-53090593
chr16:53088520-53089120 -> GM12878|chr16:53088284-53090593
chr16:53088520-53089120 -> GM12878|chr16:53088284-53090593
chr16:53088520-53089120 -> GM12878|chr16:53088284-53090593
chr16:53088520-53089120 -> GM12878|chr16:53088284-53090593
chr16:53088520-53089120 -> GM12878|chr16:53088284-53090593
chr16:53088520-53089120 -> GM12878|chr16:53088284-53090593
chr16:58034968-58035568 -> K562|chr16:58033763-58036044
chr16:58034968-58035568 -> K562|chr16:58033763-58036044
chr16:67184480-67185080 -> GM12878|chr16:67184232-67185336
chr16:67184480-67185080 -> GM12878|chr16:67184232-67185336
chr16:67184480-67185080 -> GM12878|chr16:67184232-67185336
chr16:67969422-67970022 -> HeLa-S3|chr16:67969390-67971033
chr16:67969422-67970022 -> HeLa-S3|chr16:67969390-67971033
chr16:67969422-67970022 -> HeLa-S3|chr16:67969390-67971033
chr16:67969422-67970022 -> HeLa-S3|chr16:67969390-67971033
chr16:67970340-67970940 -> HeLa-S3|chr16:67969390-67971033
chr

chr17:55927111-55927711 -> HeLa-S3|chr17:55927061-55928114
chr17:58212621-58213221 -> K562|chr17:58211786-58213261
chr17:58212621-58213221 -> K562|chr17:58211786-58213261
chr17:61926262-61926862 -> K562|chr17:61926122-61927673
chr17:61926262-61926862 -> K562|chr17:61926122-61927673
chr17:61926262-61926862 -> K562|chr17:61926122-61927673
chr17:61926262-61926862 -> K562|chr17:61926122-61927673
chr17:61926262-61926862 -> K562|chr17:61926122-61927673
chr17:65713596-65714196 -> HeLa-S3|chr17:65713378-65714794
chr17:65713596-65714196 -> HeLa-S3|chr17:65713378-65714794
chr17:65713596-65714196 -> HeLa-S3|chr17:65713378-65714794
chr17:65713596-65714196 -> HeLa-S3|chr17:65713378-65714794
chr17:65713596-65714196 -> HeLa-S3|chr17:65713378-65714794
chr17:70026441-70027041 -> GM12878|chr17:70025784-70027188
chr17:70026441-70027041 -> GM12878|chr17:70025784-70027188
chr17:70026441-70027041 -> GM12878|chr17:70025784-70027188
chr17:70026441-70027041 -> GM12878|chr17:70025784-70027188
chr17:70026441-700

chr19:38397485-38398085 -> K562|chr19:38397041-38398551
chr19:38397485-38398085 -> K562|chr19:38397041-38398551
chr19:38397485-38398085 -> K562|chr19:38397041-38398551
chr19:38397485-38398085 -> K562|chr19:38397041-38398551
chr19:39108251-39108851 -> GM12878|chr19:39106621-39110908
chr19:39108251-39108851 -> GM12878|chr19:39106621-39110908
chr19:39108251-39108851 -> GM12878|chr19:39106621-39110908
chr19:39108251-39108851 -> GM12878|chr19:39106621-39110908
chr19:39108251-39108851 -> GM12878|chr19:39106621-39110908
chr19:39108251-39108851 -> GM12878|chr19:39106621-39110908
chr19:39109449-39110049 -> GM12878|chr19:39106621-39110908
chr19:39109449-39110049 -> GM12878|chr19:39106621-39110908
chr19:39109449-39110049 -> GM12878|chr19:39106621-39110908
chr19:39109449-39110049 -> GM12878|chr19:39106621-39110908
chr19:39109449-39110049 -> GM12878|chr19:39106621-39110908
chr19:39109449-39110049 -> GM12878|chr19:39106621-39110908
chr19:39421104-39421704 -> K562|chr19:39420267-39422707
chr19:394211

chr2:70520399-70520999 -> K562|chr2:70518239-70522297
chr2:70520399-70520999 -> K562|chr2:70518239-70522297
chr2:70520399-70520999 -> GM12878|chr2:70519527-70521716
chr2:70520399-70520999 -> HeLa-S3|chr2:70520217-70521424
chr2:70520399-70520999 -> HeLa-S3|chr2:70520217-70521424
chr2:70520399-70520999 -> HeLa-S3|chr2:70520217-70521424
chr2:70528135-70528735 -> K562|chr2:70528017-70529331
chr2:70528135-70528735 -> K562|chr2:70528017-70529331
chr2:70528135-70528735 -> GM12878|chr2:70527906-70529253
chr2:74708922-74709522 -> GM12878|chr2:74708756-74710921
chr2:74708922-74709522 -> GM12878|chr2:74708756-74710921
chr2:74708922-74709522 -> GM12878|chr2:74708756-74710921
chr2:74708922-74709522 -> GM12878|chr2:74708756-74710921
chr2:74708922-74709522 -> GM12878|chr2:74708756-74710921
chr2:74709622-74710222 -> K562|chr2:74708967-74710896
chr2:74709622-74710222 -> K562|chr2:74708967-74710896
chr2:74709622-74710222 -> GM12878|chr2:74708756-74710921
chr2:74709622-74710222 -> GM12878|chr2:74708756-7

chr21:15755191-15755791 -> GM12878|chr21:15754679-15756638
chr21:18983671-18984271 -> GM12878|chr21:18983233-18985631
chr21:18984799-18985399 -> GM12878|chr21:18983233-18985631
chr21:26933841-26934441 -> GM12878|chr21:26932577-26941272
chr21:26933841-26934441 -> GM12878|chr21:26932577-26941272
chr21:26933841-26934441 -> GM12878|chr21:26932577-26941272
chr21:26933841-26934441 -> GM12878|chr21:26932577-26941272
chr21:26933841-26934441 -> GM12878|chr21:26932577-26941272
chr21:26933841-26934441 -> GM12878|chr21:26932577-26941272
chr21:26933841-26934441 -> GM12878|chr21:26932577-26941272
chr21:26933841-26934441 -> GM12878|chr21:26932577-26941272
chr21:26933841-26934441 -> GM12878|chr21:26932577-26941272
chr21:27107212-27107812 -> GM12878|chr21:27105875-27108935
chr21:27107212-27107812 -> GM12878|chr21:27105875-27108935
chr21:28216303-28216903 -> HeLa-S3|chr21:28215273-28217854
chr21:35287697-35288297 -> K562|chr21:35286712-35288672
chr21:35287697-35288297 -> K562|chr21:35286712-35288672
chr

chr3:152879580-152880180 -> K562|chr3:152878188-152882249
chr3:152879580-152880180 -> K562|chr3:152878188-152882249
chr3:152879580-152880180 -> HeLa-S3|chr3:152878167-152881369
chr3:152879580-152880180 -> HeLa-S3|chr3:152878167-152881369
chr3:152879580-152880180 -> HeLa-S3|chr3:152878167-152881369
chr3:152879580-152880180 -> HeLa-S3|chr3:152878167-152881369
chr3:152879580-152880180 -> HeLa-S3|chr3:152878167-152881369
chr3:156391882-156392482 -> HeLa-S3|chr3:156391711-156394087
chr3:156391882-156392482 -> HeLa-S3|chr3:156391711-156394087
chr3:156392956-156393556 -> HeLa-S3|chr3:156391711-156394087
chr3:156392956-156393556 -> HeLa-S3|chr3:156391711-156394087
chr3:171177711-171178311 -> GM12878|chr3:171174937-171179839
chr3:171177711-171178311 -> GM12878|chr3:171174937-171179839
chr3:171177711-171178311 -> GM12878|chr3:171174937-171179839
chr3:171177711-171178311 -> GM12878|chr3:171174937-171179839
chr3:171177711-171178311 -> GM12878|chr3:171174937-171179839
chr3:171177711-171178311 -> GM

chr5:90676482-90677082 -> K562|chr5:90675176-90680499
chr5:90676482-90677082 -> GM12878|chr5:90675256-90680173
chr5:90676482-90677082 -> GM12878|chr5:90675256-90680173
chr5:93953770-93954370 -> HeLa-S3|chr5:93953510-93955100
chr5:93953770-93954370 -> HeLa-S3|chr5:93953510-93955100
chr5:93953770-93954370 -> HeLa-S3|chr5:93953510-93955100
chr5:93953770-93954370 -> HeLa-S3|chr5:93953510-93955100
chr5:93953770-93954370 -> HeLa-S3|chr5:93953510-93955100
chr5:93953770-93954370 -> HeLa-S3|chr5:93953510-93955100
chr5:93953770-93954370 -> HeLa-S3|chr5:93953510-93955100
chr5:93953770-93954370 -> HeLa-S3|chr5:93953510-93955100
chr5:93953770-93954370 -> HeLa-S3|chr5:93953510-93955100
chr5:93953770-93954370 -> HeLa-S3|chr5:93953510-93955100
chr5:102455554-102456154 -> GM12878|chr5:102454623-102456958
chr5:108084007-108084607 -> GM12878|chr5:108083352-108085623
chr5:108084007-108084607 -> GM12878|chr5:108083352-108085623
chr5:108084007-108084607 -> GM12878|chr5:108083352-108085623
chr5:108744944-108

chr6:27100431-27101031 -> K562|chr6:27095563-27104310
chr6:27100431-27101031 -> K562|chr6:27095563-27104310
chr6:27100431-27101031 -> K562|chr6:27095563-27104310
chr6:27100431-27101031 -> K562|chr6:27095563-27104310
chr6:27100431-27101031 -> K562|chr6:27095563-27104310
chr6:27100431-27101031 -> K562|chr6:27095563-27104310
chr6:27100431-27101031 -> K562|chr6:27095563-27104310
chr6:27100431-27101031 -> K562|chr6:27095563-27104310
chr6:27100431-27101031 -> K562|chr6:27095563-27104310
chr6:27100431-27101031 -> K562|chr6:27095563-27104310
chr6:27100431-27101031 -> GM12878|chr6:27097712-27102463
chr6:27100431-27101031 -> GM12878|chr6:27097712-27102463
chr6:27100431-27101031 -> GM12878|chr6:27097712-27102463
chr6:27113394-27113994 -> K562|chr6:27108613-27116400
chr6:27113394-27113994 -> K562|chr6:27108613-27116400
chr6:27113394-27113994 -> K562|chr6:27108613-27116400
chr6:27113394-27113994 -> K562|chr6:27108613-27116400
chr6:27113394-27113994 -> K562|chr6:27108613-27116400
chr6:27113394-27113

chr6:52528725-52529325 -> K562|chr6:52528581-52530355
chr6:52528725-52529325 -> K562|chr6:52528581-52530355
chr6:52926172-52926772 -> K562|chr6:52925975-52927012
chr6:52926172-52926772 -> K562|chr6:52925975-52927012
chr6:52926172-52926772 -> K562|chr6:52925975-52927012
chr6:52926172-52926772 -> K562|chr6:52925975-52927012
chr6:52926172-52926772 -> GM12878|chr6:52926000-52926983
chr6:52926172-52926772 -> GM12878|chr6:52926000-52926983
chr6:52926172-52926772 -> GM12878|chr6:52926000-52926983
chr6:52926172-52926772 -> HeLa-S3|chr6:52926023-52926932
chr6:52926172-52926772 -> HeLa-S3|chr6:52926023-52926932
chr6:52926172-52926772 -> HeLa-S3|chr6:52926023-52926932
chr6:52926172-52926772 -> HeLa-S3|chr6:52926023-52926932
chr6:69345436-69346036 -> K562|chr6:69344241-69346229
chr6:69345436-69346036 -> K562|chr6:69344241-69346229
chr6:69345436-69346036 -> K562|chr6:69344241-69346229
chr6:82462013-82462613 -> HeLa-S3|chr6:82461009-82463801
chr6:82462013-82462613 -> HeLa-S3|chr6:82461009-82463801
c

chr7:20370008-20370608 -> HeLa-S3|chr7:20369225-20371897
chr7:20370008-20370608 -> HeLa-S3|chr7:20369225-20371897
chr7:20370008-20370608 -> HeLa-S3|chr7:20369225-20371897
chr7:20370008-20370608 -> HeLa-S3|chr7:20369225-20371897
chr7:20370008-20370608 -> HeLa-S3|chr7:20369225-20371897
chr7:22766764-22767364 -> HeLa-S3|chr7:22765989-22767720
chr7:22766764-22767364 -> HeLa-S3|chr7:22765989-22767720
chr7:22766764-22767364 -> HeLa-S3|chr7:22765989-22767720
chr7:22766764-22767364 -> HeLa-S3|chr7:22765989-22767720
chr7:22766764-22767364 -> HeLa-S3|chr7:22765989-22767720
chr7:22766764-22767364 -> HeLa-S3|chr7:22765989-22767720
chr7:25019311-25019911 -> GM12878|chr7:25018124-25021781
chr7:25019311-25019911 -> GM12878|chr7:25018124-25021781
chr7:25021091-25021691 -> GM12878|chr7:25018124-25021781
chr7:25021091-25021691 -> GM12878|chr7:25018124-25021781
chr7:26240441-26241041 -> GM12878|chr7:26237952-26243008
chr7:26240441-26241041 -> GM12878|chr7:26237952-26243008
chr7:26240441-26241041 -> GM128

chr8:128807233-128807833 -> GM12878|chr8:128806385-128811141
chr8:128807233-128807833 -> GM12878|chr8:128806385-128811141
chr8:128807233-128807833 -> GM12878|chr8:128806385-128811141
chr8:128807233-128807833 -> HeLa-S3|chr8:128806281-128807858
chr8:128807233-128807833 -> HeLa-S3|chr8:128806281-128807858
chr8:142139609-142140209 -> GM12878|chr8:142138654-142140620
chr8:142139609-142140209 -> GM12878|chr8:142138654-142140620
chr8:142139609-142140209 -> GM12878|chr8:142138654-142140620
chr8:142139609-142140209 -> GM12878|chr8:142138654-142140620
chr8:142139609-142140209 -> GM12878|chr8:142138654-142140620
chr9:2014892-2015492 -> GM12878|chr9:2014104-2019470
chr9:2016830-2017430 -> GM12878|chr9:2014104-2019470
chr9:4984706-4985306 -> K562|chr9:4983982-4987609
chr9:4984706-4985306 -> K562|chr9:4983982-4987609
chr9:4984706-4985306 -> K562|chr9:4983982-4987609
chr9:4984706-4985306 -> K562|chr9:4983982-4987609
chr9:4984706-4985306 -> K562|chr9:4983982-4987609
chr9:4984706-4985306 -> GM12878|ch

chrX:30595476-30596076 -> GM12878|chrX:30594785-30596239
chrX:30595476-30596076 -> GM12878|chrX:30594785-30596239
chrX:30595476-30596076 -> GM12878|chrX:30594785-30596239
chrX:40506358-40506958 -> K562|chrX:40505805-40507076
chrX:40506358-40506958 -> GM12878|chrX:40505471-40507396
chrX:40506358-40506958 -> GM12878|chrX:40505471-40507396
chrX:40506358-40506958 -> GM12878|chrX:40505471-40507396
chrX:41193879-41194479 -> GM12878|chrX:41191657-41196030
chrX:41193879-41194479 -> GM12878|chrX:41191657-41196030
chrX:41193879-41194479 -> GM12878|chrX:41191657-41196030
chrX:47050555-47051155 -> GM12878|chrX:47050000-47051779
chrX:47050555-47051155 -> GM12878|chrX:47050000-47051779
chrX:47050555-47051155 -> GM12878|chrX:47050000-47051779
chrX:77150553-77151153 -> HeLa-S3|chrX:77150449-77151809
chrX:77150553-77151153 -> HeLa-S3|chrX:77150449-77151809
chrX:77150553-77151153 -> HeLa-S3|chrX:77150449-77151809
chrX:77150553-77151153 -> HeLa-S3|chrX:77150449-77151809
chrX:78200502-78201102 -> GM12878|

## iEnhancer-2L DATASET (STRONG, WEAK, NON-ENHANCERS)

In [54]:
def compare_enhancers(ep_sequences, enh_class):

    data = re.split(r'(^>.*)', ''.join(open('data/{}_enhancers.txt'.format(enh_class)).readlines()), flags=re.M)
    fids = []
    seqs = []
    for i in range(2, len(data), 2):
        fids.append(data[i-1][1:])
        seqs.append(data[i].replace('\n', '').replace(' ', ''))

    n_matches = 0

    for fid in fids:
        chrom = fid.split('_')[0]
        subseq = seqs[fids.index(fid)]

        match = ep_sequences.loc[ep_sequences['enhancer_seq'].str.contains(subseq)]['enhancer_name'].values
        if len(match) > 0:
            n_matches = n_matches + len(match)
            for enh in match:
                print(fid, "->", enh)

    print("\n{} enhancer\n{} out of {} -> {:.2f}%\n".format(enh_class, n_matches,
                                                            len(fids), n_matches*100/len(fids)))

In [55]:
compare_enhancers(ep_sequences_combined, 'strong')

CHR16_87812299_87812499 -> K562|chr16:87812132-87813140
CHR6_26025821_26026021 -> K562|chr6:26024477-26026400
CHR1_45274813_45275013 -> HeLa-S3|chr1:45274200-45276600
CHR1_150540176_150540376 -> K562|chr1:150539800-150541416
CHR1_150540376_150540576 -> K562|chr1:150539800-150541416
CHR1_150540576_150540776 -> K562|chr1:150539800-150541416
CHR16_87812499_87812699 -> K562|chr16:87812132-87813140
CHR16_87812899_87813099 -> K562|chr16:87812132-87813140
CHR6_26026021_26026221 -> K562|chr6:26024477-26026400
CHR1_150541176_150541376 -> K562|chr1:150539800-150541416
CHR12_14410733_14410933 -> IMR90|chr12:14409200-14411800
CHR6_26025621_26025821 -> K562|chr6:26024477-26026400
CHR6_33558022_33558222 -> K562|chr6:33557769-33558288
CHR1_150540976_150541176 -> K562|chr1:150539800-150541416
CHR17_62224468_62224668 -> K562|chr17:62223600-62225407
CHR17_62224668_62224868 -> K562|chr17:62223600-62225407
CHR16_87812699_87812899 -> K562|chr16:87812132-87813140
CHR1_150539976_150540176 -> K562|chr1:150539

In [56]:
compare_enhancers(ep_sequences_combined, 'weak')


weak enhancer
0 out of 742 -> 0.00%



In [57]:
compare_enhancers(ep_sequences_combined, 'non')


non enhancer
0 out of 1484 -> 0.00%

