**[iEnhancer-2L](https://academic.oup.com/bioinformatics/article/32/3/362/1744331)** ➡ H1ES, **K562**, **GM12878**, HepG2, **HUVEC**, HSMM, NHLF, **NHEK**, HMEC

**[TargetFinder](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4910881/)** ➡ **GM12878**, **K562**, IMR90, HeLa-S3, **HUVEC**, **NHEK**

In [25]:
import os
import sys
import random
import re
import numpy as np
import pandas as pd
import pickle as pkl
from IPython.display import display
from Bio import SeqIO

# Name of the TargetFinder dataset directory to load; it is interpolated into the
# pairs.csv URL in the loading cell below.
CELL_LINE = 'combined'

In [2]:
# Download the TargetFinder enhancer-promoter pair table for the configured
# CELL_LINE directly from the project's GitHub repository.
ep_pairs = pd.read_csv(
    'https://raw.githubusercontent.com/shwhalen/targetfinder/master/paper/'
    'targetfinder/{}/output-ep/pairs.csv'.format(CELL_LINE)
)

In [85]:
# Keep only the positive pairs (label == 1) and the coordinate/name columns
# needed downstream, then renumber the rows from 0.
ep_pairs = ep_pairs.loc[
    ep_pairs['label'] == 1,
    [
        'enhancer_name', 'enhancer_chrom', 'enhancer_start', 'enhancer_end',
        'promoter_name', 'promoter_chrom', 'promoter_start', 'promoter_end',
        'label',
    ],
].reset_index(drop=True)

In [23]:
def _refseqChromosomeMap(accessions):
    """Map 'chr1'..'chr22', 'chrX', 'chrY' to the matching RefSeq accession.

    The chromosome is read from the accession number itself
    (NC_000001 -> chr1, ..., NC_000023 -> chrX, NC_000024 -> chrY), so the
    result does not depend on the order in which the accessions are supplied.
    Accessions that do not start with 'NC_0000' or whose number falls outside
    1..24 are ignored.
    """
    names = [str(i) for i in range(1, 23)] + ['X', 'Y']
    mapping = {}
    for accession in accessions:
        if not accession.startswith('NC_0000'):
            continue
        # 'NC_000001.10' -> '000001'
        digits = accession[len('NC_'):].split('.')[0]
        if not digits.isdigit():
            continue
        number = int(digits)
        if 1 <= number <= len(names):
            mapping['chr' + names[number - 1]] = accession
    return mapping


def getSequences(ep_pairs, genome_path='data/GRCh37_latest_genomic.fna', genome=None):
    """Return a copy of ``ep_pairs`` with enhancer and promoter DNA sequences added.

    The genome must be human GRCh37 (about 3.2 Gb); it is an older assembly but,
    per the original notes, compatible with the TargetFinder coordinates:
    https://www.ncbi.nlm.nih.gov/projects/genome/guide/human/index.shtml
    https://github.com/shwhalen/targetfinder/tree/master/paper/targetfinder

    Parameters
    ----------
    ep_pairs : pandas.DataFrame
        Must contain the columns ``enhancer_chrom``, ``enhancer_start``,
        ``enhancer_end``, ``promoter_chrom``, ``promoter_start`` and
        ``promoter_end``. Chromosomes are named 'chr1'..'chr22', 'chrX', 'chrY'.
        Any index is accepted; the frame is not modified.
    genome_path : str, optional
        FASTA file parsed with ``SeqIO.parse`` when ``genome`` is not given.
    genome : dict, optional
        Already-parsed genome mapping RefSeq accession (e.g. 'NC_000001.10') to
        an object exposing a sliceable ``.seq``. Passing it avoids re-parsing
        the FASTA file on every call.

    Returns
    -------
    pandas.DataFrame
        Copy of ``ep_pairs`` with two extra columns, ``enhancer_seq`` and
        ``promoter_seq``, holding upper-case sequence strings.

    Raises
    ------
    KeyError
        If a pair refers to a chromosome that has no NC_0000xx record in the
        genome.
    """
    if genome is None:
        print('Parsing GRCh37 genome...')
        # Only the assembled chromosomes (NC_0000xx) are ever looked up, so the
        # remaining records are not kept in memory.
        genome = {record.id: record
                  for record in SeqIO.parse(genome_path, 'fasta')
                  if record.id.startswith('NC_0000')}

    chrom_to_refseq = _refseqChromosomeMap(genome.keys())

    def extract(prefix):
        # Iterate positionally so the result is correct for any DataFrame index.
        sequences = []
        for chrom, start, end in zip(ep_pairs[prefix + '_chrom'],
                                     ep_pairs[prefix + '_start'],
                                     ep_pairs[prefix + '_end']):
            if chrom not in chrom_to_refseq:
                raise KeyError('No RefSeq chromosome record found for {!r}'.format(chrom))
            record = genome[chrom_to_refseq[chrom]]
            # The start is shifted down by one before slicing, i.e. coordinates
            # are treated as 1-based inclusive.
            # NOTE(review): confirm this matches TargetFinder's coordinate convention.
            # Clamp at 0 so a start of 0 cannot turn into a negative (wrap-around) index.
            first = max(int(start) - 1, 0)
            sequences.append(str(record.seq[first:int(end)]).upper())
        return sequences

    n = len(ep_pairs)
    print('Getting DNA sequences for {} EP pairs...'.format(n))

    # Work on a copy: the caller's frame must not silently gain columns.
    ep_sequences = ep_pairs.copy()
    ep_sequences['enhancer_seq'] = extract('enhancer')
    ep_sequences['promoter_seq'] = extract('promoter')

    return ep_sequences

In [26]:
# Look up the enhancer and promoter DNA sequences for every positive pair.
# getSequences parses the GRCh37 FASTA file under data/ on each call.
ep_sequences = getSequences(ep_pairs)

Parsing GRCh37 genome...
Getting DNA sequences for 7084 EP pairs...


In [27]:
# Show the pair table together with the retrieved sequences.
display(ep_sequences)

# Several pairs can share one enhancer, so count distinct enhancer names separately.
print("{} unique enhancers".format(len(set(ep_sequences['enhancer_name']))))

Unnamed: 0,enhancer_name,enhancer_chrom,enhancer_start,enhancer_end,promoter_name,promoter_chrom,promoter_start,promoter_end,label,enhancer_seq,promoter_seq
0,K562|chr1:6454864-6455189,chr1,6454864,6455189,K562|chr1:6613082-6615021,chr1,6613082,6615021,1,AGGATGGTCTTGATCTCCTGACCTTGTGATCTGCCCGCCTCTGCCT...,GGCTGGTCTCAAACTCCTGACCTCAGATGATCTGCCAGCTTCGGCT...
1,K562|chr1:9935400-9935544,chr1,9935400,9935544,K562|chr1:10002482-10004387,chr1,10002482,10004387,1,GGTGCTTCCCAGAGGAGCGTCAGTTCCATATCAAAGACTGCTTCCA...,TCTGAATACATTTCAGTAAAAATGGGTGCATTTTAAAAGCCTTAAC...
2,K562|chr1:9935400-9935544,chr1,9935400,9935544,K562|chr1:10092884-10095822,chr1,10092884,10095822,1,GGTGCTTCCCAGAGGAGCGTCAGTTCCATATCAAAGACTGCTTCCA...,GCGCGGGACGGGGTGGGACGGAGGCGGGGAGAGGACGCAGGCGAGA...
3,K562|chr1:16399567-16400081,chr1,16399567,16400081,K562|chr1:16677563-16679703,chr1,16677563,16679703,1,CGCAGTTCCTCCGGGAAGCCAGGAAGAACCAGAGCTGGTTCCCAGG...,ATGGTACTTCCCCCAAATTTTCCTTGCCTCTTTAAATCCATGATGT...
4,K562|chr1:16400449-16400658,chr1,16400449,16400658,K562|chr1:16677563-16679703,chr1,16677563,16679703,1,TGGGCCTATTTTTAGAGGATGAAAGTGAGGCTCGCAGAAGTTAGAG...,ATGGTACTTCCCCCAAATTTTCCTTGCCTCTTTAAATCCATGATGT...
...,...,...,...,...,...,...,...,...,...,...,...
7079,IMR90|chrX:17609600-17610400,chrX,17609600,17610400,IMR90|chrX:17393000-17393600,chrX,17393000,17393600,1,TCTCTATTATTTCCATGGTATTTGGTATGAAGACAGAATCCTCCAC...,AAAACTTGCTCCAACACGCCAAATCTGCCGCATCCTCAATACCCCA...
7080,IMR90|chrX:23919400-23919600,chrX,23919400,23919600,IMR90|chrX:23801000-23801800,chrX,23801000,23801800,1,CACCTTCTTGCCCTGGAGGCTTCAGTTAGCAGCTGATGATTTCCCA...,ACCTTCTTTCATTTGCAAATGTTACTGGGGGACACACCGGCTCCCA...
7081,IMR90|chrX:47383600-47383800,chrX,47383600,47383800,IMR90|chrX:47050000-47050600,chrX,47050000,47050600,1,CTGCCGGGCATCAATACCCCAACTCACCCAGCACCCCCTCTTCAGA...,CTCAGGCTTACATTTTCCCATATTCTGGACGCTGGTGCCCACCACG...
7082,IMR90|chrX:109083200-109084400,chrX,109083200,109084400,IMR90|chrX:108976400-108976800,chrX,108976400,108976800,1,AAGAGCCACATTTAAGTTGTCCTACACTTTGAAGTCATGAGGAGAG...,GCCGGCGCCTGGCACTCGGAAAGCTCGCAAAAAGGAACCGCGTGCC...


6493 unique enhancers


In [81]:
def _readEnhancerRecords(path):
    """Read a FASTA-style text file and return a list of (identifier, sequence) tuples.

    A record starts at a line beginning with '>'; the identifier is the rest of
    that line with surrounding whitespace removed. The sequence is everything up
    to the next header with all whitespace removed, converted to upper case.
    """
    with open(path) as handle:
        pieces = re.split(r'(^>.*)', handle.read(), flags=re.M)
    # pieces == [text before the first header, header1, body1, header2, body2, ...]
    return [(header[1:].strip(), ''.join(body.split()).upper())
            for header, body in zip(pieces[1::2], pieces[2::2])]


def compareEnhancers(ep_sequences, enh_class):
    """Print which fragments of an enhancer class occur inside the given enhancers.

    Reads ``data/<enh_class>_enhancers.txt`` (FASTA-style; identifiers are
    expected to begin with the chromosome followed by '_', e.g.
    'CHR16_87812299_87812499'). For every fragment, all rows of
    ``ep_sequences`` on the same chromosome whose ``enhancer_seq`` contains the
    fragment as a literal substring are printed as ``<fragment id> -> <enhancer_name>``.
    A summary line with the number of matches follows.

    Parameters
    ----------
    ep_sequences : pandas.DataFrame
        Needs the columns ``enhancer_chrom``, ``enhancer_seq`` and
        ``enhancer_name``. ``enhancer_seq`` is expected to be upper case (as
        produced by ``getSequences``).
    enh_class : str
        Class label used to build the input file name, e.g. 'strong'.

    Returns
    -------
    None
        Results are only printed.

    Notes
    -----
    The reported count is the number of (fragment, enhancer row) matches, not
    the number of distinct fragments, so the percentage can exceed 100 when a
    fragment matches several rows.
    """
    records = _readEnhancerRecords('data/{}_enhancers.txt'.format(enh_class))

    # Loop-invariant: normalise the chromosome column once. Both sides of the
    # comparison are upper-cased so 'chr1' and 'CHR1' identifiers are equivalent.
    chrom_upper = ep_sequences['enhancer_chrom'].str.upper()

    n_matches = 0

    for fid, subseq in records:
        if not subseq:
            # An empty fragment is a substring of everything; never report it.
            continue

        chrom = fid.split('_')[0].upper()
        candidates = ep_sequences[chrom_upper == chrom]
        # regex=False: the fragment is plain sequence text, not a pattern.
        # na=False: rows with a missing sequence simply do not match.
        contains = candidates['enhancer_seq'].str.contains(subseq, regex=False, na=False)
        match = candidates.loc[contains, 'enhancer_name'].values

        n_matches = n_matches + len(match)
        for enh in match:
            print(fid, "->", enh)

    n_records = len(records)
    # Avoid ZeroDivisionError when the file holds no records.
    percent = n_matches * 100 / n_records if n_records else 0.0
    print("\n{} enhancer\n{} out of {} -> {:.2f}%\n".format(enh_class, n_matches,
                                                            n_records, percent))

In [82]:
# Check the fragments in data/strong_enhancers.txt against the TargetFinder enhancer sequences.
compareEnhancers(ep_sequences, 'strong')

CHR16_87812299_87812499 -> K562|chr16:87812132-87813140
CHR6_26025821_26026021 -> K562|chr6:26024477-26026400
CHR1_45274813_45275013 -> HeLa-S3|chr1:45274200-45276600
CHR1_150540176_150540376 -> K562|chr1:150539800-150541416
CHR1_150540376_150540576 -> K562|chr1:150539800-150541416
CHR1_150540576_150540776 -> K562|chr1:150539800-150541416
CHR16_87812499_87812699 -> K562|chr16:87812132-87813140
CHR16_87812899_87813099 -> K562|chr16:87812132-87813140
CHR6_26026021_26026221 -> K562|chr6:26024477-26026400
CHR1_150541176_150541376 -> K562|chr1:150539800-150541416
CHR12_14410733_14410933 -> IMR90|chr12:14409200-14411800
CHR6_26025621_26025821 -> K562|chr6:26024477-26026400
CHR6_33558022_33558222 -> K562|chr6:33557769-33558288
CHR1_150540976_150541176 -> K562|chr1:150539800-150541416
CHR17_62224468_62224668 -> K562|chr17:62223600-62225407
CHR17_62224668_62224868 -> K562|chr17:62223600-62225407
CHR16_87812699_87812899 -> K562|chr16:87812132-87813140
CHR1_150539976_150540176 -> K562|chr1:150539

In [83]:
# Check the fragments in data/weak_enhancers.txt against the TargetFinder enhancer sequences.
compareEnhancers(ep_sequences, 'weak')


weak enhancer
0 out of 742 -> 0.00%



In [84]:
# Check the fragments in data/non_enhancers.txt against the TargetFinder enhancer sequences.
compareEnhancers(ep_sequences, 'non')


non enhancer
0 out of 1484 -> 0.00%

