**[iEnhancer-2L](https://academic.oup.com/bioinformatics/article/32/3/362/1744331)** ➡ H1ES, **K562**, **GM12878**, HepG2, **HUVEC**, HSMM, NHLF, **NHEK**, HMEC

**[TargetFinder](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4910881/)** ➡ **GM12878**, **K562**, IMR90, HeLa-S3, **HUVEC**, **NHEK**

**[Xin Zeng (2021)](https://www.frontiersin.org/articles/10.3389/fgene.2021.681259/full#B4)** ➡ **K562** (sequence length=500 in US_UU dataset) and [CAGE](https://academic.oup.com/genetics/article/211/4/1205/5931517) (sequence length=600 in CAGE dataset)

In [95]:
import os
import sys
import random
import re
import numpy as np
import pandas as pd
import pickle as pkl
from IPython.display import display
from Bio import SeqIO
from tqdm.notebook import tqdm

In [37]:
def load_ep_pairs(cell_line):
    """Fetch the positive enhancer-promoter pairs of one TargetFinder dataset.

    Reads ``pairs.csv`` for ``cell_line`` from the TargetFinder GitHub
    repository, keeps only the rows whose ``label`` equals 1, and returns the
    name / chromosome / start / end columns of the enhancer and the promoter
    together with ``label``. The returned frame has a fresh 0..n-1 index.
    """
    url_template = ('https://raw.githubusercontent.com/shwhalen/'
                    'targetfinder/master/paper/targetfinder/{}/'
                    'output-ep/pairs.csv')
    kept_columns = [
        'enhancer_name', 'enhancer_chrom', 'enhancer_start', 'enhancer_end',
        'promoter_name', 'promoter_chrom', 'promoter_start', 'promoter_end',
        'label',
    ]

    all_pairs = pd.read_csv(url_template.format(cell_line))
    is_positive = all_pairs['label'] == 1
    positive_pairs = all_pairs.loc[is_positive].reset_index(drop=True)
    return positive_pairs[kept_columns]

In [40]:
def _refseq_ids_by_chromosome(record_ids):
    """Map 'chr1'..'chr22', 'chrX', 'chrY' to the matching RefSeq record ids.

    Only ids starting with 'NC_0000' are considered. The chromosome is read
    from the numeric part of the accession (NC_000001 -> chr1, ...,
    NC_000023 -> chrX, NC_000024 -> chrY), so the result does not depend on
    the order of the records in the FASTA file.
    """
    sex_chromosomes = {23: 'chrX', 24: 'chrY'}
    mapping = {}
    for record_id in record_ids:
        if not record_id.startswith('NC_0000'):
            continue
        digits = record_id[len('NC_'):].split('.', 1)[0]
        if not digits.isdigit():
            continue
        number = int(digits)
        if number in range(1, 23):
            mapping['chr{}'.format(number)] = record_id
        elif number in sex_chromosomes:
            mapping[sex_chromosomes[number]] = record_id
    return mapping


def load_ep_sequences(cell_line, genome_fasta='data/GRCh37_latest_genomic.fna'):
    """Attach enhancer and promoter DNA sequences to the EP pairs of a dataset.

    Parameters
    ----------
    cell_line : str
        Dataset name forwarded to ``load_ep_pairs`` (e.g. 'K562', 'combined').
    genome_fasta : str, optional
        Path of the GRCh37 genome FASTA. Defaults to the location used
        throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``load_ep_pairs`` with two extra columns,
        ``enhancer_seq`` and ``promoter_seq``, holding upper-case sequences.

    Raises
    ------
    KeyError
        If a chromosome named in the pairs has no matching record in the
        genome FASTA.

    Notes
    -----
    The whole genome is parsed into memory on every call, which is slow.
    """
    # DOWNLOAD HUMAN GENOME v37 (3.2 Gb)
    # Older version but compatible with genomic coordinates of TargetFinder dataset
    # https://www.ncbi.nlm.nih.gov/projects/genome/guide/human/index.shtml
    # https://github.com/shwhalen/targetfinder/tree/master/paper/targetfinder

    print('Parsing GRCh37 genome...')
    hg37 = SeqIO.to_dict(SeqIO.parse(genome_fasta, 'fasta'))

    # Derived from accession numbers rather than from record order, so a
    # reordered or incomplete FASTA cannot silently shift the mapping.
    refseq_by_chrom = _refseq_ids_by_chromosome(hg37.keys())

    print('Loading EP pairs for {} dataset...'.format(cell_line))
    ep_pairs = load_ep_pairs(cell_line)

    # Fail early, and by name, if the genome lacks a chromosome we need.
    needed = set(ep_pairs['enhancer_chrom']) | set(ep_pairs['promoter_chrom'])
    missing = sorted(str(chrom) for chrom in needed - set(refseq_by_chrom))
    if missing:
        raise KeyError('No RefSeq record in {} for chromosome(s): {}'
                       .format(genome_fasta, ', '.join(missing)))

    print('Getting DNA sequences for {} EP pairs...'.format(len(ep_pairs)))

    def fetch(prefix):
        # Coordinates are treated as 1-based inclusive: start - 1 converts to a
        # 0-based slice start.
        # NOTE(review): confirm the coordinate convention of pairs.csv; if the
        # starts are already 0-based this includes one extra leading base.
        return [
            str(hg37[refseq_by_chrom[chrom]].seq[start - 1:end]).upper()
            for chrom, start, end in zip(ep_pairs[prefix + '_chrom'],
                                         ep_pairs[prefix + '_start'],
                                         ep_pairs[prefix + '_end'])
        ]

    # assign() builds a new frame instead of writing columns onto a frame
    # that was produced by column selection.
    return ep_pairs.assign(enhancer_seq=fetch('enhancer'),
                           promoter_seq=fetch('promoter'))

In [52]:
# Enhancer/promoter sequences for the K562 pairs, then the number of
# distinct enhancer names among them.
ep_sequences_K562 = load_ep_sequences(cell_line='K562')
print(
    "{} unique enhancers".format(
        len(set(ep_sequences_K562['enhancer_name']))
    )
)

Parsing GRCh37 genome...
Loading EP pairs for K562 dataset...
Getting DNA sequences for 1977 EP pairs...
1742 unique enhancers


In [53]:
# Same as above for the 'combined' dataset. Note that this call parses the
# genome FASTA again.
ep_sequences_combined = load_ep_sequences(cell_line='combined')
print(
    "{} unique enhancers".format(
        len(set(ep_sequences_combined['enhancer_name']))
    )
)

Parsing GRCh37 genome...
Loading EP pairs for combined dataset...
Getting DNA sequences for 7084 EP pairs...
6493 unique enhancers


In [132]:
def compare_sequences(ep_sequences, seq_type, fasta_file):
    """Report how many FASTA records occur inside the sequences of ``ep_sequences``.

    Parameters
    ----------
    ep_sequences : pandas.DataFrame
        Must contain the columns ``<seq_type>_name`` and ``<seq_type>_seq``.
        Rows are de-duplicated on the name column before searching.
    seq_type : str
        Column prefix to search, i.e. 'enhancer' or 'promoter'.
    fasta_file : str
        Path of the FASTA file whose records are used as queries.

    Prints ``"<matches> out of <records> -> <percent>%"`` and returns None.
    A record counts once if its (upper-cased) sequence is a literal substring
    of at least one target sequence.
    """
    with open(fasta_file) as handle:
        chunks = re.split(r'(^>.*)', handle.read(), flags=re.M)

    # chunks alternates as [preamble, header, body, header, body, ...];
    # newlines and spaces inside a body are removed so multi-line records
    # become one string.
    queries = [body.replace('\n', '').replace(' ', '').upper()
               for body in chunks[2::2]]

    targets = ep_sequences.drop_duplicates(subset=seq_type + '_name',
                                           keep='first')[seq_type + '_seq']

    n_matches = 0
    # Iterate over the sequences themselves: looking a sequence up by its id
    # would return the first record for every repeated id.
    for subseq in tqdm(queries):
        # regex=False: the query is plain text, not a pattern. An empty
        # record is skipped because it would trivially match everything.
        if subseq and targets.str.contains(subseq, regex=False).any():
            n_matches += 1

    total = len(queries)
    percent = n_matches * 100 / total if total else 0.0
    print("{} out of {} -> {:.2f}%".format(n_matches, total, percent))

# US_UU DATASET (1,884 US=Promoters + 4,978 UU=Enhancers)

In [128]:
# K562 enhancers vs. the UU records of the US_UU dataset.
compare_sequences(
    ep_sequences=ep_sequences_K562,
    seq_type='enhancer',
    fasta_file='data/compare_seq/K562_UU.fa',
)

HBox(children=(FloatProgress(value=0.0, max=4978.0), HTML(value='')))


37 out of 4978 -> 0.74%


In [129]:
# K562 promoters vs. the US records of the US_UU dataset.
compare_sequences(
    ep_sequences=ep_sequences_K562,
    seq_type='promoter',
    fasta_file='data/compare_seq/K562_US.fa',
)

HBox(children=(FloatProgress(value=0.0, max=1884.0), HTML(value='')))


113 out of 1884 -> 6.00%


# CAGE dataset (38,538 enhancers + 27,227 promoters)

In [130]:
# Combined-dataset enhancers vs. the CAGE enhancer records.
compare_sequences(
    ep_sequences=ep_sequences_combined,
    seq_type='enhancer',
    fasta_file='data/compare_seq/fantom_enhancers_600.fa',
)

HBox(children=(FloatProgress(value=0.0, max=38537.0), HTML(value='')))


237 out of 38537 -> 0.61%


In [133]:
# Combined-dataset promoters vs. the CAGE promoter records.
compare_sequences(
    ep_sequences=ep_sequences_combined,
    seq_type='promoter',
    fasta_file='data/compare_seq/fantom_promoters_600.fa',
)

HBox(children=(FloatProgress(value=0.0, max=27227.0), HTML(value='')))


1357 out of 27227 -> 4.98%


## iEnhancer-2L dataset (strong, weak, and non-enhancers)

In [131]:
def compare_enhancers(ep_sequences, enh_class):
    """Report how many records of one enhancer class occur in ``ep_sequences``.

    Parameters
    ----------
    ep_sequences : pandas.DataFrame
        Must contain the columns ``enhancer_name`` and ``enhancer_seq``.
        Rows are de-duplicated on ``enhancer_name`` before searching.
    enh_class : str
        Class label used to build the input path
        ``data/compare_seq/<enh_class>_enhancers.txt`` (the notebook uses
        'strong', 'weak' and 'non'). The file is read as FASTA.

    Prints the class name followed by
    ``"<matches> out of <records> -> <percent>%"`` and returns None.
    A record counts once if its (upper-cased) sequence is a literal substring
    of at least one enhancer sequence.
    """
    with open('data/compare_seq/{}_enhancers.txt'.format(enh_class)) as handle:
        chunks = re.split(r'(^>.*)', handle.read(), flags=re.M)

    # chunks alternates as [preamble, header, body, header, body, ...];
    # newlines and spaces inside a body are removed so multi-line records
    # become one string.
    queries = [body.replace('\n', '').replace(' ', '').upper()
               for body in chunks[2::2]]

    targets = ep_sequences.drop_duplicates(subset='enhancer_name',
                                           keep='first')['enhancer_seq']

    n_matches = 0
    # Iterate over the sequences themselves: looking a sequence up by its id
    # would return the first record for every repeated id.
    for subseq in tqdm(queries):
        # regex=False: the query is plain text, not a pattern. An empty
        # record is skipped because it would trivially match everything.
        if subseq and targets.str.contains(subseq, regex=False).any():
            n_matches += 1

    total = len(queries)
    percent = n_matches * 100 / total if total else 0.0
    print("{} enhancer\n{} out of {} -> {:.2f}%".format(enh_class, n_matches,
                                                        total, percent))

In [122]:
# Strong enhancers vs. the combined dataset.
compare_enhancers(
    ep_sequences=ep_sequences_combined,
    enh_class='strong',
)

HBox(children=(FloatProgress(value=0.0, max=742.0), HTML(value='')))


strong enhancer
18 out of 742 -> 2.43%


In [123]:
# Weak enhancers vs. the combined dataset.
compare_enhancers(
    ep_sequences=ep_sequences_combined,
    enh_class='weak',
)

HBox(children=(FloatProgress(value=0.0, max=742.0), HTML(value='')))


weak enhancer
0 out of 742 -> 0.00%


In [124]:
# Non-enhancers vs. the combined dataset.
compare_enhancers(
    ep_sequences=ep_sequences_combined,
    enh_class='non',
)

HBox(children=(FloatProgress(value=0.0, max=1484.0), HTML(value='')))


non enhancer
0 out of 1484 -> 0.00%
