In [30]:
from Bio import Entrez, SeqIO
from IPython.display import clear_output
import pandas as pd

# Cell line to process; it selects the data/<CELL_LINE>/ directory that is read and written below.
CELL_LINE = 'GM12878'
# k-mer length; it is also part of the file name of the saved sentences CSV.
K_MER = 5
TEST_EID = 981 # arbitrary enhancer id to be used in tests

# Get enhancer-promoter pairs

Pair table source (TargetFinder repository, GM12878 `pairs.csv`): https://github.com/shwhalen/targetfinder/blob/master/paper/targetfinder/GM12878/output-ep/pairs.csv

In [31]:
# Read the enhancer-promoter pair table of the configured cell line,
# report its row count and preview the first rows.
pairs_csv = 'data/{}/ep_pairs.csv'.format(CELL_LINE)
ep_pairs = pd.read_csv(pairs_csv)
print(ep_pairs.shape[0])
ep_pairs.head()

44313


Unnamed: 0,bin,enhancer_chrom,enhancer_distance_to_promoter,enhancer_end,enhancer_name,enhancer_start,label,promoter_chrom,promoter_end,promoter_name,promoter_start,window_end,window_start,window_chrom,window_name,interactions_in_window,active_promoters_in_window
0,"[22271, 83570.8]",chr1,60682,9686400,GM12878|chr1:9685722-9686400,9685722,1,chr1,9749721,GM12878|chr1:9747084-9749721,9747084,9747083,9686401,chr1,GM12878|chr1:9686401-9747083,1,1
1,"[22271, 83570.8]",chr1,56866,24136600,GM12878|chr1:24136556-24136600,24136556,1,chr1,24194871,GM12878|chr1:24193468-24194871,24193468,24193467,24136601,chr1,GM12878|chr1:24136601-24193467,4,1
2,"[22271, 83570.8]",chr1,56534,24136932,GM12878|chr1:24136600-24136932,24136600,1,chr1,24194871,GM12878|chr1:24193468-24194871,24193468,24193467,24136933,chr1,GM12878|chr1:24136933-24193467,3,1
3,"[22271, 83570.8]",chr1,55591,24137875,GM12878|chr1:24137625-24137875,24137625,1,chr1,24194871,GM12878|chr1:24193468-24194871,24193468,24193467,24137876,chr1,GM12878|chr1:24137876-24193467,2,1
4,"[22271, 83570.8]",chr1,54052,24139414,GM12878|chr1:24139145-24139414,24139145,1,chr1,24194871,GM12878|chr1:24193468-24194871,24193468,24193467,24139415,chr1,GM12878|chr1:24139415-24193467,1,1


## Remove distance-matched non-interacting enhancer-promoter pairs (where label=0)

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4910881/#S14title

 ![S1](figs/S1.png)

In [19]:
# Keep only the rows labelled 1; rows with label 0 are the distance-matched
# non-interacting pairs and are dropped here.
ep_pairs = ep_pairs.loc[ep_pairs['label'].eq(1)]
print(ep_pairs.shape[0])

2113


# Fetch genome sequences

https://www.ncbi.nlm.nih.gov/projects/genome/guide/human/index.shtml

https://www.ncbi.nlm.nih.gov/books/NBK50679/#RefSeqFAQ.what_causes_the_version_number

https://www.ncbi.nlm.nih.gov/nuccore/NC_000001?report=docsum

https://www.ncbi.nlm.nih.gov/refseq/

In [20]:
# DOWNLOAD HUMAN GENOME v37 (3.2 Gb) before running this cell.
# It is an older assembly, but it is compatible with the genomic coordinates
# of the TargetFinder dataset:
# https://www.ncbi.nlm.nih.gov/projects/genome/guide/human/index.shtml
# https://github.com/shwhalen/targetfinder/tree/master/paper/targetfinder
GENOME_FASTA = 'data/GRCh37_latest_genomic.fna'

# Parse the FASTA file and build a dictionary of its records.
genome_records = SeqIO.parse(GENOME_FASTA, 'fasta')
hg37 = SeqIO.to_dict(genome_records)

In [32]:
# Collect the ids of all genome records whose key starts with 'NC_0000',
# printing each record and its sequence length along the way.
RefSeqIDs = []

for accession, record in hg37.items():
    if not accession.startswith('NC_0000'):
        continue
    print('{}\nSequence length: {}\n'.format(record, len(record.seq)))
    RefSeqIDs.append(record.id)

ID: NC_000001.10
Name: NC_000001.10
Description: NC_000001.10 Homo sapiens chromosome 1, GRCh37.p13 Primary Assembly
Number of features: 0
Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN')
Sequence length: 249250621

ID: NC_000002.11
Name: NC_000002.11
Description: NC_000002.11 Homo sapiens chromosome 2, GRCh37.p13 Primary Assembly
Number of features: 0
Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN')
Sequence length: 243199373

ID: NC_000003.11
Name: NC_000003.11
Description: NC_000003.11 Homo sapiens chromosome 3, GRCh37.p13 Primary Assembly
Number of features: 0
Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN')
Sequence length: 198022430

ID: NC_000004.11
Name: NC_000004.11
Description: NC_000004.11 Homo sapiens chromosome 4, GRCh37.p13 Primary Assembly
Number of features: 0
Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN')
Sequence length: 191154276

ID: NC_000005.9
Name: NC_000005.9
Description: NC_000005.9 H

In [22]:
chromosomes = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', \
               'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', \
               'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY']

# Map every chromosome name to its RefSeq accession.
# The chromosome is derived from the accession number itself
# (NC_000001.x -> chr1, ..., NC_000023.x -> chrX, NC_000024.x -> chrY)
# instead of pairing the two lists by position, so the result does not
# depend on the order in which the records appear in the FASTA file.
chrom_by_number = dict(enumerate(chromosomes, start=1))

refseq_by_chrom = {}
for refseq_id in RefSeqIDs:
    # 'NC_000001.10' -> 'NC_000001' -> '000001' -> 1
    accession_number = int(refseq_id.split('.')[0].split('_')[1])
    if accession_number in chrom_by_number:
        refseq_by_chrom[chrom_by_number[accession_number]] = refseq_id

# Fail loudly rather than build a partial or shifted mapping.
missing_chromosomes = [chrom for chrom in chromosomes if chrom not in refseq_by_chrom]
if missing_chromosomes:
    raise ValueError('No RefSeq accession found for: {}'.format(', '.join(missing_chromosomes)))

# Keep the chr1..chrY ordering of `chromosomes` in the final dictionary.
RefSeqDict = {chrom: refseq_by_chrom[chrom] for chrom in chromosomes}

RefSeqDict

{'chr1': 'NC_000001.10',
 'chr2': 'NC_000002.11',
 'chr3': 'NC_000003.11',
 'chr4': 'NC_000004.11',
 'chr5': 'NC_000005.9',
 'chr6': 'NC_000006.11',
 'chr7': 'NC_000007.13',
 'chr8': 'NC_000008.10',
 'chr9': 'NC_000009.11',
 'chr10': 'NC_000010.10',
 'chr11': 'NC_000011.9',
 'chr12': 'NC_000012.11',
 'chr13': 'NC_000013.10',
 'chr14': 'NC_000014.8',
 'chr15': 'NC_000015.9',
 'chr16': 'NC_000016.9',
 'chr17': 'NC_000017.10',
 'chr18': 'NC_000018.9',
 'chr19': 'NC_000019.9',
 'chr20': 'NC_000020.10',
 'chr21': 'NC_000021.8',
 'chr22': 'NC_000022.10',
 'chrX': 'NC_000023.10',
 'chrY': 'NC_000024.9'}

# Get DNA sequences for all enhancer-promoter pairs

In [23]:
def _fetch_sequence(chrom, start, end):
    """Return the upper-cased genome sequence of one region as a plain string.

    `chrom` is a chromosome name used as a key of `RefSeqDict` (e.g. 'chr1');
    the region is taken as `seq[start - 1:end]` from that chromosome's record.
    """
    # NOTE(review): `start - 1` treats `start` as 1-based inclusive. The recorded
    # output for 24136556-24136600 is 45 characters long; if the TargetFinder
    # coordinates are 0-based half-open, this includes one extra leading base.
    # Confirm against the data source before changing it.
    return str(hg37[RefSeqDict[chrom]].seq[start - 1:end]).upper()


enhancer_sequences = []
promoter_sequences = []
n = len(ep_pairs)

# Walk the rows by position. `ep_pairs` was filtered by label without its
# index being reset, so label-based lookups such as ep_pairs[col][i] are only
# correct when the kept rows happen to be labelled 0..n-1.
regions = zip(ep_pairs['enhancer_chrom'], ep_pairs['enhancer_start'], ep_pairs['enhancer_end'],
              ep_pairs['promoter_chrom'], ep_pairs['promoter_start'], ep_pairs['promoter_end'])

for i, (enh_chrom, enh_start, enh_end, prom_chrom, prom_start, prom_end) in enumerate(regions):
    enhancer_sequences.append(_fetch_sequence(enh_chrom, enh_start, enh_end))
    promoter_sequences.append(_fetch_sequence(prom_chrom, prom_start, prom_end))

    # Refresh the progress line every 10 rows and on the last row.
    if i % 10 == 0 or i == n-1:
        clear_output(wait=True)
        print('{} / {}'.format(i+1, n))

# The name columns keep the index of `ep_pairs`; the sequence lists are in the
# same row order, so they line up one-to-one.
df_ep_seq = pd.DataFrame({'enhancer_name': ep_pairs['enhancer_name'],
                          'promoter_name': ep_pairs['promoter_name'],
                          'enhancer_seq': enhancer_sequences,
                          'promoter_seq': promoter_sequences})

df_ep_seq.to_csv('data/{}/ep_sequences.csv'.format(CELL_LINE), index=False)
print('EP sequences are written!')

2113 / 2113
EP sequences are written!


In [34]:
# Reload the saved enhancer/promoter sequences from disk and preview them.
sequences_csv = 'data/{}/ep_sequences.csv'.format(CELL_LINE)
df_ep_seq = pd.read_csv(sequences_csv)
df_ep_seq.head()

Unnamed: 0,enhancer_name,promoter_name,enhancer_seq,promoter_seq
0,GM12878|chr1:9685722-9686400,GM12878|chr1:9747084-9749721,TGACAGGCATGAGCCACCACGCCCGGCAGATTTTTCAAGATATAAT...,TTTTGCCATTTCAAAGAATCTTGGATTTTTCTCTGGGCTCCAGAGA...
1,GM12878|chr1:24136556-24136600,GM12878|chr1:24193468-24194871,GTGGCAACTGAGGCTAAGACCTGGAGCAGGGCAGCTGCTCTCAAG,TGAATTCAAAGTTCAAGAGAAACGAAAACCCGGAAGATGGCTGAGG...
2,GM12878|chr1:24136600-24136932,GM12878|chr1:24193468-24194871,GAAACAGTTGCTACTGTTACCATTCCACCTATCTGGATGCCACAAA...,TGAATTCAAAGTTCAAGAGAAACGAAAACCCGGAAGATGGCTGAGG...
3,GM12878|chr1:24137625-24137875,GM12878|chr1:24193468-24194871,GTGCCAGAGGAGCTGGGGCCAGTACTCCAAAAGGAGACCAAAGACT...,TGAATTCAAAGTTCAAGAGAAACGAAAACCCGGAAGATGGCTGAGG...
4,GM12878|chr1:24139145-24139414,GM12878|chr1:24193468-24194871,GCCCAGAGGCAAGAGTGGAGGCATGTGACAAACAGAAAGAAGTTCC...,TGAATTCAAAGTTCAAGAGAAACGAAAACCCGGAAGATGGCTGAGG...


In [33]:
# Report how many distinct enhancer and promoter names occur among the pairs.
for message, column in (('{} unique enhancers', 'enhancer_name'),
                        ('{}  unique promoters', 'promoter_name')):
    distinct_names = set(df_ep_seq[column])
    print(message.format(len(distinct_names)))

1932 unique enhancers
736  unique promoters


# Convert DNA sequences to sentences of k-mers

https://github.com/ma-compbio/PEP/blob/30cbe77d8bddf88309d70c8f4f0e04974da4e02b/processSeq.py

In [26]:
def DNA2Sentence(dna, K, clean=False):
    """Convert a DNA string into a sentence of overlapping k-mers.

    A window of length ``K`` slides over ``dna`` one character at a time and
    the resulting k-mers are joined with single spaces.

    Parameters
    ----------
    dna : str
        The sequence to split.
    K : int
        Length of each k-mer; must be at least 1.
    clean : bool, optional
        If True, every upper-case "N" is removed from ``dna`` before the
        k-mers are extracted.

    Returns
    -------
    str
        The k-mers separated by single spaces, or an empty string when the
        (cleaned) sequence is shorter than ``K``.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 1.
    """
    if K < 1:
        raise ValueError('K must be a positive integer, got {}'.format(K))

    if clean:
        dna = dna.replace("N", "")

    # Join once instead of growing a string in the loop and trimming the
    # trailing separator afterwards.
    return " ".join(dna[i: i + K] for i in range(len(dna) - K + 1))

# TEST: with K_MER = 5 the "N" is dropped and four overlapping 5-mers remain.
# The k-mer length comes from the K_MER constant of the configuration cell.
DNA2Sentence("AAGGTTNCC", K_MER, clean=True)

'AAGGT AGGTT GGTTC GTTCC'

In [27]:
# Turn every enhancer/promoter sequence into a sentence of K_MER-mers and
# save the result. All sizes are taken from `df_ep_seq` itself, so the cell
# does not rely on variables left over from earlier cells.
enhancer_sentences = []
promoter_sentences = []
n_pairs = len(df_ep_seq)

sequence_pairs = zip(df_ep_seq['enhancer_seq'], df_ep_seq['promoter_seq'])

for i, (enhancer_seq, promoter_seq) in enumerate(sequence_pairs):
    enhancer_sentences.append(DNA2Sentence(enhancer_seq, K_MER))
    promoter_sentences.append(DNA2Sentence(promoter_seq, K_MER))

    # Refresh the progress line every 10 rows and on the last row.
    if i % 10 == 0 or i == n_pairs-1:
        clear_output(wait=True)
        print('{} / {}'.format(i+1, n_pairs))

# The name columns keep the index of `df_ep_seq`; the sentence lists are in
# the same row order, so they line up one-to-one.
df_ep_sentences = pd.DataFrame({'enhancer_name': df_ep_seq['enhancer_name'],
                                'promoter_name': df_ep_seq['promoter_name'],
                                'enhancer_sentence': enhancer_sentences,
                                'promoter_sentence': promoter_sentences})

df_ep_sentences.to_csv('data/{}/ep_sentences_{}mer.csv'.format(CELL_LINE, K_MER), index=False)
print('EP sentences are written!')

2113 / 2113
EP sentences are written!


In [35]:
# Reload the saved k-mer sentences from disk and preview them.
sentences_csv = 'data/{}/ep_sentences_{}mer.csv'.format(CELL_LINE, K_MER)
df_ep_sentences = pd.read_csv(sentences_csv)
df_ep_sentences.head()

Unnamed: 0,enhancer_name,promoter_name,enhancer_sentence,promoter_sentence
0,GM12878|chr1:9685722-9686400,GM12878|chr1:9747084-9749721,TGACA GACAG ACAGG CAGGC AGGCA GGCAT GCATG CATG...,TTTTG TTTGC TTGCC TGCCA GCCAT CCATT CATTT ATTT...
1,GM12878|chr1:24136556-24136600,GM12878|chr1:24193468-24194871,GTGGC TGGCA GGCAA GCAAC CAACT AACTG ACTGA CTGA...,TGAAT GAATT AATTC ATTCA TTCAA TCAAA CAAAG AAAG...
2,GM12878|chr1:24136600-24136932,GM12878|chr1:24193468-24194871,GAAAC AAACA AACAG ACAGT CAGTT AGTTG GTTGC TTGC...,TGAAT GAATT AATTC ATTCA TTCAA TCAAA CAAAG AAAG...
3,GM12878|chr1:24137625-24137875,GM12878|chr1:24193468-24194871,GTGCC TGCCA GCCAG CCAGA CAGAG AGAGG GAGGA AGGA...,TGAAT GAATT AATTC ATTCA TTCAA TCAAA CAAAG AAAG...
4,GM12878|chr1:24139145-24139414,GM12878|chr1:24193468-24194871,GCCCA CCCAG CCAGA CAGAG AGAGG GAGGC AGGCA GGCA...,TGAAT GAATT AATTC ATTCA TTCAA TCAAA CAAAG AAAG...


In [29]:
# prepare_gcn_data.ipynb