In [1]:
import os
import sys
import random
import numpy as np
import pandas as pd
import pickle as pkl
import scipy.sparse as sp
import pcdhit
from Bio import SeqIO
from IPython.display import display
import statistics as st

CELL_LINE = 'GM12878'

In [2]:
def getPairs(cell_line):
    """
        If your cell line is not available in TargetFinder repo,
        Place your ep_pairs.csv file manually under your cell line directory.
    """
    available_cell_lines = ['GM12878', 'HUVEC', 'HeLa-S3', 'IMR90', 'K562', 'NHEK', 'combined']

    if cell_line not in available_cell_lines:
        print('{} cell line is not in available.\nSelect one of {}\n' \
              'Or manually create gcn/data/{}/ep_pairs.csv'.format(cell_line, available_cell_lines, cell_line))
        return None

    if os.path.isfile('gcn/data/{}/ep_pairs.csv'.format(cell_line)):
        print('Reading pairs from local file...')
        ep_pairs = pd.read_csv('gcn/data/{}/ep_pairs.csv'.format(cell_line))
    else:
        print('Reading pairs from remote github repo...')
        ep_pairs = pd.read_csv('https://raw.githubusercontent.com/shwhalen/' \
                               'targetfinder/master/paper/targetfinder/{}/' \
                               'output-ep/pairs.csv'.format(cell_line))
        if not os.path.isdir('gcn/data/{}'.format(cell_line)):
            print('Creating directory for {} cell line...'.format(cell_line))
            os.makedirs('gcn/data/{}'.format(cell_line))
        print('Writing pairs to data/{}/ep_pairs.csv'.format(cell_line))
        ep_pairs.to_csv('gcn/data/{}/ep_pairs.csv'.format(cell_line), index=False)
    return ep_pairs

In [3]:
def getSequences(ep_pairs, hg):
    RefSeqIDs = []

    for k in hg37.keys():
        if k.startswith('NC_0000'):
            RefSeqIDs.append(hg37[k].id)

    chromosomes = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', \
               'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', \
               'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY']

    RefSeqDict = {chromosomes[i]: RefSeqIDs[i] for i in range(len(chromosomes))}

    enhancer_sequences = []
    promoter_sequences = []
    n = len(ep_pairs)

    print('Getting DNA sequences for {} EP pairs...'.format(n))

    for i in range(n):
        enhancer_seq_id = ep_pairs['enhancer_chrom'][i]
        enhancer_seq_start = ep_pairs['enhancer_start'][i] - 1
        enhancer_seq_end = ep_pairs['enhancer_end'][i]

        promoter_seq_id = ep_pairs['promoter_chrom'][i]
        promoter_seq_start = ep_pairs['promoter_start'][i] - 1
        promoter_seq_end = ep_pairs['promoter_end'][i]
        
        enhancer_sequences.append(str(hg37[RefSeqDict[enhancer_seq_id]]
                                    .seq[enhancer_seq_start:enhancer_seq_end]).upper())

        promoter_sequences.append(str(hg37[RefSeqDict[promoter_seq_id]]
                                    .seq[promoter_seq_start:promoter_seq_end]).upper())

    ep_sequences = pd.DataFrame({'enhancer_name': ep_pairs['enhancer_name'][0:n],
                                 'promoter_name': ep_pairs['promoter_name'][0:n],
                                 'enhancer_seq': enhancer_sequences,
                                 'promoter_seq': promoter_sequences})
    return ep_sequences

In [4]:
# DOWNLOAD HUMAN GENOME v37 (3.2 Gb)
# Older version but compatible with genomic coordinates of TargetFinder dataset
# https://www.ncbi.nlm.nih.gov/projects/genome/guide/human/index.shtml
# https://github.com/shwhalen/targetfinder/tree/master/paper/targetfinder

try:
    hg37
except NameError:
    print('Parsing GRCh37 genome...')
    hg37 = SeqIO.to_dict(SeqIO.parse('gcn/data/GRCh37_latest_genomic.fna', 'fasta'))

Parsing GRCh37 genome...


In [5]:
ep_pairs = getPairs(CELL_LINE)
print('{} EP pairs have been read.'.format(len(ep_pairs)))

ep_pairs = ep_pairs[ep_pairs['label'] == 1].reset_index() # Keep only the interacting pairs
print('{} EP pairs are labeled as 1.'.format(len(ep_pairs)))

ep_sequences = getSequences(ep_pairs, hg37)

Reading pairs from local file...
44313 EP pairs have been read.
2113 EP pairs are labeled as 1.
Getting DNA sequences for 2113 EP pairs...


In [6]:
display(ep_sequences.head())

Unnamed: 0,enhancer_name,promoter_name,enhancer_seq,promoter_seq
0,GM12878|chr1:9685722-9686400,GM12878|chr1:9747084-9749721,TGACAGGCATGAGCCACCACGCCCGGCAGATTTTTCAAGATATAAT...,TTTTGCCATTTCAAAGAATCTTGGATTTTTCTCTGGGCTCCAGAGA...
1,GM12878|chr1:24136556-24136600,GM12878|chr1:24193468-24194871,GTGGCAACTGAGGCTAAGACCTGGAGCAGGGCAGCTGCTCTCAAG,TGAATTCAAAGTTCAAGAGAAACGAAAACCCGGAAGATGGCTGAGG...
2,GM12878|chr1:24136600-24136932,GM12878|chr1:24193468-24194871,GAAACAGTTGCTACTGTTACCATTCCACCTATCTGGATGCCACAAA...,TGAATTCAAAGTTCAAGAGAAACGAAAACCCGGAAGATGGCTGAGG...
3,GM12878|chr1:24137625-24137875,GM12878|chr1:24193468-24194871,GTGCCAGAGGAGCTGGGGCCAGTACTCCAAAAGGAGACCAAAGACT...,TGAATTCAAAGTTCAAGAGAAACGAAAACCCGGAAGATGGCTGAGG...
4,GM12878|chr1:24139145-24139414,GM12878|chr1:24193468-24194871,GCCCAGAGGCAAGAGTGGAGGCATGTGACAAACAGAAAGAAGTTCC...,TGAATTCAAAGTTCAAGAGAAACGAAAACCCGGAAGATGGCTGAGG...


In [223]:
ep_sequences = ep_sequences[
    ep_sequences['enhancer_seq'].apply(lambda x: len(x)>=200) &
    ep_sequences['promoter_seq'].apply(lambda x: len(x)>=200)].reset_index(drop=True)
display(ep_sequences)
print(len(set(ep_sequences['enhancer_name'])), "enhancers with length >= 200")
print(len(set(ep_sequences['promoter_name'])), "enhancers with length >= 200")

Unnamed: 0,enhancer_name,promoter_name,enhancer_seq,promoter_seq
0,GM12878|chr1:9685722-9686400,GM12878|chr1:9747084-9749721,TGACAGGCATGAGCCACCACGCCCGGCAGATTTTTCAAGATATAAT...,TTTTGCCATTTCAAAGAATCTTGGATTTTTCTCTGGGCTCCAGAGA...
1,GM12878|chr1:24136600-24136932,GM12878|chr1:24193468-24194871,GAAACAGTTGCTACTGTTACCATTCCACCTATCTGGATGCCACAAA...,TGAATTCAAAGTTCAAGAGAAACGAAAACCCGGAAGATGGCTGAGG...
2,GM12878|chr1:24137625-24137875,GM12878|chr1:24193468-24194871,GTGCCAGAGGAGCTGGGGCCAGTACTCCAAAAGGAGACCAAAGACT...,TGAATTCAAAGTTCAAGAGAAACGAAAACCCGGAAGATGGCTGAGG...
3,GM12878|chr1:24139145-24139414,GM12878|chr1:24193468-24194871,GCCCAGAGGCAAGAGTGGAGGCATGTGACAAACAGAAAGAAGTTCC...,TGAATTCAAAGTTCAAGAGAAACGAAAACCCGGAAGATGGCTGAGG...
4,GM12878|chr1:26611045-26611600,GM12878|chr1:26643652-26646530,TTGCCTAAACACAGGCCAAGGGTATGGTCTAATGCAACCCCCATTT...,AACCCAGGAGGTGGAGGTTGCAGTAAGCCGAGATCATGCCAGTGCA...
...,...,...,...,...
1333,GM12878|chrX:56757000-56757200,GM12878|chrX:55743852-55745220,GGCCTTAAAGGTTGAGTGACATCTGCCATGTGGAGGGGCAGGGAAT...,TAAAAAGCAGGAAGGCAAAGACAATGCTCGGTCTTCCCAGTCAAAA...
1334,GM12878|chrX:78514149-78515721,GM12878|chrX:78200389-78203116,GGACCTATTACTGTAACTATCTCATAGATCAAAAAATTGAGGTAAA...,GGGATAATAAATATACAAAAATAATGAAACTATGTGTTAAATTGTG...
1335,GM12878|chrX:78518965-78519600,GM12878|chrX:78200389-78203116,TCCCAGGCAATTATTGTTCTTCTCGTGCTGCCTATTCACACTAGGC...,GGGATAATAAATATACAAAAATAATGAAACTATGTGTTAAATTGTG...
1336,GM12878|chrX:153601435-153601725,GM12878|chrX:153199645-153200493,GCGCCTGGGCGTCTGGGAGCTGGGGTGGAGGTGGGAGGGAGGGCTG...,CTATTCTCTGCTCCCTTCGGGCGCCTTCTTTTCCTTTGCCCCTGTC...


1224 enhancers with length >= 200
618 enhancers with length >= 200


### FIX THE SEQUENCE LENGTHS

See iEnhancer-2L paper [ https://academic.oup.com/bioinformatics/article/32/3/362/1744331 ]

In [205]:
# Divide enhancers and promoters into 200bp DNA fragments
# Remove fragments with length < 200bp
# Remove fragments with high sequence similarity (cutoff threshold at 80%)

In [251]:
enh_names = []
enh_frag_names = []
enh_frag_seqs = []
for i in range(len(ep_sequences)):
    seq = ep_sequences['enhancer_seq'][i]
    name = ep_sequences['enhancer_name'][i]
    coordinates = name.split(':')[1]
    coor_start = int(coordinates.split('-')[0])
    coor_end = coor_start + 200
    while len(seq) >= 200:
        fragment = str(coor_start) + '-' + str(coor_end)
        enh_names.append(name)
        enh_frag_names.append(name.split(':')[0] + ':' + fragment)
        enh_frag_seqs.append(seq[:200])
        seq = seq[200:]
        coor_start = coor_end
        coor_end = coor_start + 200

pro_names = []
pro_frag_names = []
pro_frag_seqs = []
for i in range(len(ep_sequences)):
    seq = ep_sequences['promoter_seq'][i]
    name = ep_sequences['promoter_name'][i]
    coordinates = name.split(':')[1]
    coor_start = int(coordinates.split('-')[0])
    coor_end = coor_start + 200
    while len(seq) >= 200:
        fragment = str(coor_start) + '-' + str(coor_end)
        pro_names.append(name)
        pro_frag_names.append(name.split(':')[0] + ':' + fragment)
        pro_frag_seqs.append(seq[:200])
        seq = seq[200:]
        coor_start = coor_end
        coor_end = coor_start + 200

df_enh_fragments = pd.DataFrame({'enhancer_name': enh_names, 'enhancer_frag_name': enh_frag_names, 'enhancer_frag_seq': enh_frag_seqs})
df_pro_fragments = pd.DataFrame({'promoter_name': pro_names, 'promoter_frag_name': pro_frag_names, 'promoter_frag_seq': pro_frag_seqs})

df_enh_fragments = df_enh_fragments.drop_duplicates(subset=['enhancer_frag_name']).reset_index(drop=True)
df_pro_fragments = df_pro_fragments.drop_duplicates(subset=['promoter_frag_name']).reset_index(drop=True)

display(df_enh_fragments.head())
display(df_pro_fragments.head())
print(len(df_enh_fragments), 'fragments from', len(set(df_enh_fragments['enhancer_name'])), 'enhancers')
print(len(df_pro_fragments), 'fragments from', len(set(df_pro_fragments['promoter_name'])), 'promoters')

Unnamed: 0,enhancer_name,enhancer_frag_name,enhancer_frag_seq
0,GM12878|chr1:9685722-9686400,GM12878|chr1:9685722-9685922,TGACAGGCATGAGCCACCACGCCCGGCAGATTTTTCAAGATATAAT...
1,GM12878|chr1:9685722-9686400,GM12878|chr1:9685922-9686122,TGGAAGTCACCCAGGAGGGCGACCTGCTGGGAGGTATCAGAGCCTG...
2,GM12878|chr1:9685722-9686400,GM12878|chr1:9686122-9686322,TTACCACTGTGCAAGATGTTAGGAATGTACTTGTTTGTTTCTGTAT...
3,GM12878|chr1:24136600-24136932,GM12878|chr1:24136600-24136800,GAAACAGTTGCTACTGTTACCATTCCACCTATCTGGATGCCACAAA...
4,GM12878|chr1:24137625-24137875,GM12878|chr1:24137625-24137825,GTGCCAGAGGAGCTGGGGCCAGTACTCCAAAAGGAGACCAAAGACT...


Unnamed: 0,promoter_name,promoter_frag_name,promoter_frag_seq
0,GM12878|chr1:9747084-9749721,GM12878|chr1:9747084-9747284,TTTTGCCATTTCAAAGAATCTTGGATTTTTCTCTGGGCTCCAGAGA...
1,GM12878|chr1:9747084-9749721,GM12878|chr1:9747284-9747484,AGGGAGGAGGGCTCCCACACGTGGGGAATGGGGGTGGAGGAACCGC...
2,GM12878|chr1:9747084-9749721,GM12878|chr1:9747484-9747684,TGGGGAAATCCCCGCCCTGGGGGGGTCCGGTGTGGCCTGCAGGGAG...
3,GM12878|chr1:9747084-9749721,GM12878|chr1:9747684-9747884,CTCCGCCCTCTCCCGGGATCTGTGAAAGCACAGCCAGGGTGCGGGC...
4,GM12878|chr1:9747084-9749721,GM12878|chr1:9747884-9748084,CTTCTAGGGACGGCCAGGGAAGAGGAGTGTCCCCTCCCACCCTGGC...


4453 fragments from 1224 enhancers
8175 fragments from 618 promoters


### Run CD-HIT to remove highly similar sequences to get rid of redundancy and avoid bias

Download and install both required libraries given below:

https://github.com/weizhongli/cdhit

https://github.com/simomarsili/pcdhit

In [259]:
filtered_enh_fragments = list(pcdhit.filter(list(zip(df_enh_fragments['enhancer_frag_name'],
                                                     df_enh_fragments['enhancer_frag_seq'])), threshold=0.8))

Program: CD-HIT, V4.8.1, Aug 31 2021, 12:28:44
Command: /usr/local/bin/cd-hit -i
         /var/folders/r5/7x1ckhy12931jpqx_kpngkzc0000gn/T/tmpzh9gioat
         -o
         /var/folders/r5/7x1ckhy12931jpqx_kpngkzc0000gn/T/tmp150z6jyv
         -c 0.8

Started: Sun Sep  5 13:21:14 2021
                            Output                              
----------------------------------------------------------------
total seq: 4453
longest and shortest : 200 and 200
Total letters: 890600
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 1M
Buffer          : 1 X 10M = 10M
Table           : 1 X 65M = 65M
Miscellaneous   : 0M
Total           : 77M

Table limit with the given memory limit:
Max number of representatives: 2406577
Max number of word counting entries: 90306813

comparing sequences from          0  to       4453
....
     4453  finished       4385  clusters

Approximated maximum memory consumption: 83M
writing new database
writing clustering infor

In [242]:
filtered_pro_fragments = list(pcdhit.filter(list(zip(df_pro_fragments['promoter_frag_name'],
                                                     df_pro_fragments['promoter_frag_seq'])), threshold=0.8))

Program: CD-HIT, V4.8.1, Aug 31 2021, 12:28:44
Command: /usr/local/bin/cd-hit -i
         /var/folders/r5/7x1ckhy12931jpqx_kpngkzc0000gn/T/tmp37vbl9w4
         -o
         /var/folders/r5/7x1ckhy12931jpqx_kpngkzc0000gn/T/tmpov7s9bmi
         -c 0.8

Started: Sun Sep  5 12:27:11 2021
                            Output                              
----------------------------------------------------------------
total seq: 8175
longest and shortest : 200 and 200
Total letters: 1635000
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 2M
Buffer          : 1 X 10M = 10M
Table           : 1 X 65M = 65M
Miscellaneous   : 0M
Total           : 78M

Table limit with the given memory limit:
Max number of representatives: 2401946
Max number of word counting entries: 90133060

comparing sequences from          0  to       8175
........
     8175  finished       8089  clusters

Approximated maximum memory consumption: 89M
writing new database
writing clustering 

In [277]:
df_fef = df_enh_fragments[df_enh_fragments.iloc[:,2].isin([e[1] for e in filtered_enh_fragments])].reset_index(drop=True)
df_fpf = df_pro_fragments[df_pro_fragments.iloc[:,2].isin([p[1] for p in filtered_pro_fragments])].reset_index(drop=True)

display(df_fef.head())
display(df_fpf.head())

print(len(set(df_fef['enhancer_frag_name'])), 'enhancer fragments with low similarity')
print(len(set(df_fpf['promoter_frag_name'])), 'promoter fragments with low similarity')

Unnamed: 0,enhancer_name,enhancer_frag_name,enhancer_frag_seq
0,GM12878|chr1:9685722-9686400,GM12878|chr1:9685722-9685922,TGACAGGCATGAGCCACCACGCCCGGCAGATTTTTCAAGATATAAT...
1,GM12878|chr1:9685722-9686400,GM12878|chr1:9685922-9686122,TGGAAGTCACCCAGGAGGGCGACCTGCTGGGAGGTATCAGAGCCTG...
2,GM12878|chr1:9685722-9686400,GM12878|chr1:9686122-9686322,TTACCACTGTGCAAGATGTTAGGAATGTACTTGTTTGTTTCTGTAT...
3,GM12878|chr1:24136600-24136932,GM12878|chr1:24136600-24136800,GAAACAGTTGCTACTGTTACCATTCCACCTATCTGGATGCCACAAA...
4,GM12878|chr1:24137625-24137875,GM12878|chr1:24137625-24137825,GTGCCAGAGGAGCTGGGGCCAGTACTCCAAAAGGAGACCAAAGACT...


Unnamed: 0,promoter_name,promoter_frag_name,promoter_frag_seq
0,GM12878|chr1:9747084-9749721,GM12878|chr1:9747084-9747284,TTTTGCCATTTCAAAGAATCTTGGATTTTTCTCTGGGCTCCAGAGA...
1,GM12878|chr1:9747084-9749721,GM12878|chr1:9747284-9747484,AGGGAGGAGGGCTCCCACACGTGGGGAATGGGGGTGGAGGAACCGC...
2,GM12878|chr1:9747084-9749721,GM12878|chr1:9747484-9747684,TGGGGAAATCCCCGCCCTGGGGGGGTCCGGTGTGGCCTGCAGGGAG...
3,GM12878|chr1:9747084-9749721,GM12878|chr1:9747684-9747884,CTCCGCCCTCTCCCGGGATCTGTGAAAGCACAGCCAGGGTGCGGGC...
4,GM12878|chr1:9747084-9749721,GM12878|chr1:9747884-9748084,CTTCTAGGGACGGCCAGGGAAGAGGAGTGTCCCCTCCCACCCTGGC...


4385 enhancer fragments with low similarity
8089 promoter fragments with low similarity


# Append interaction information

In [264]:
from IPython.display import clear_output

col_names =  ['enhancer_name', 'enhancer_frag_name', 'enhancer_frag_seq',
              'promoter_name', 'promoter_frag_name', 'promoter_frag_seq']

merged_df = pd.DataFrame(columns = col_names)

for i in range(len(ep_sequences)):
    enh_frags = df_fef[df_fef['enhancer_name'] == ep_sequences['enhancer_name'][i]]
    pro_frags = df_fpf[df_fpf['promoter_name'] == ep_sequences['promoter_name'][i]]
    
    clear_output(wait=True)
    print(i+1, "of", len(ep_sequences))
    
    for e in range(len(enh_frags)):
        for p in range(len(pro_frags)):
            e_row = enh_frags[e:e+1].reset_index(drop=True)
            p_row = pro_frags[p:p+1].reset_index(drop=True)
            merged_row = pd.concat([e_row, p_row], axis=1)
            merged_df = pd.concat([merged_df, merged_row])

merged_df = merged_df.reset_index(drop=True)

1338 of 1338


In [275]:
display(merged_df.head())


print(len(set(merged_df['enhancer_name'])), " enhancers")
print(len(set(merged_df['promoter_name'])), "  promoters")
print(len(merged_df), "interactions between EP fragments")

Unnamed: 0,enhancer_name,enhancer_frag_name,enhancer_frag_seq,promoter_name,promoter_frag_name,promoter_frag_seq
0,GM12878|chr1:9685722-9686400,GM12878|chr1:9685722-9685922,TGACAGGCATGAGCCACCACGCCCGGCAGATTTTTCAAGATATAAT...,GM12878|chr1:9747084-9749721,GM12878|chr1:9747084-9747284,TTTTGCCATTTCAAAGAATCTTGGATTTTTCTCTGGGCTCCAGAGA...
1,GM12878|chr1:9685722-9686400,GM12878|chr1:9685722-9685922,TGACAGGCATGAGCCACCACGCCCGGCAGATTTTTCAAGATATAAT...,GM12878|chr1:9747084-9749721,GM12878|chr1:9747284-9747484,AGGGAGGAGGGCTCCCACACGTGGGGAATGGGGGTGGAGGAACCGC...
2,GM12878|chr1:9685722-9686400,GM12878|chr1:9685722-9685922,TGACAGGCATGAGCCACCACGCCCGGCAGATTTTTCAAGATATAAT...,GM12878|chr1:9747084-9749721,GM12878|chr1:9747484-9747684,TGGGGAAATCCCCGCCCTGGGGGGGTCCGGTGTGGCCTGCAGGGAG...
3,GM12878|chr1:9685722-9686400,GM12878|chr1:9685722-9685922,TGACAGGCATGAGCCACCACGCCCGGCAGATTTTTCAAGATATAAT...,GM12878|chr1:9747084-9749721,GM12878|chr1:9747684-9747884,CTCCGCCCTCTCCCGGGATCTGTGAAAGCACAGCCAGGGTGCGGGC...
4,GM12878|chr1:9685722-9686400,GM12878|chr1:9685722-9685922,TGACAGGCATGAGCCACCACGCCCGGCAGATTTTTCAAGATATAAT...,GM12878|chr1:9747084-9749721,GM12878|chr1:9747884-9748084,CTTCTAGGGACGGCCAGGGAAGAGGAGTGTCCCCTCCCACCCTGGC...


1217  enhancers
614   promoters
78216 interactions between EP fragments


In [324]:
df_temp = merged_df[['enhancer_frag_name', 'enhancer_frag_seq', 'promoter_frag_name', 'promoter_frag_seq']]
df_temp.columns = ['enhancer_name', 'enhancer_seq', 'promoter_name', 'promoter_seq']
display(df_temp)

print(len(set(df_temp['enhancer_name'])), 'enhancer fragments')
print(len(set(df_temp['promoter_name'])), 'promoter fragments')

Unnamed: 0,enhancer_name,enhancer_seq,promoter_name,promoter_seq
0,GM12878|chr1:9685722-9685922,TGACAGGCATGAGCCACCACGCCCGGCAGATTTTTCAAGATATAAT...,GM12878|chr1:9747084-9747284,TTTTGCCATTTCAAAGAATCTTGGATTTTTCTCTGGGCTCCAGAGA...
1,GM12878|chr1:9685722-9685922,TGACAGGCATGAGCCACCACGCCCGGCAGATTTTTCAAGATATAAT...,GM12878|chr1:9747284-9747484,AGGGAGGAGGGCTCCCACACGTGGGGAATGGGGGTGGAGGAACCGC...
2,GM12878|chr1:9685722-9685922,TGACAGGCATGAGCCACCACGCCCGGCAGATTTTTCAAGATATAAT...,GM12878|chr1:9747484-9747684,TGGGGAAATCCCCGCCCTGGGGGGGTCCGGTGTGGCCTGCAGGGAG...
3,GM12878|chr1:9685722-9685922,TGACAGGCATGAGCCACCACGCCCGGCAGATTTTTCAAGATATAAT...,GM12878|chr1:9747684-9747884,CTCCGCCCTCTCCCGGGATCTGTGAAAGCACAGCCAGGGTGCGGGC...
4,GM12878|chr1:9685722-9685922,TGACAGGCATGAGCCACCACGCCCGGCAGATTTTTCAAGATATAAT...,GM12878|chr1:9747884-9748084,CTTCTAGGGACGGCCAGGGAAGAGGAGTGTCCCCTCCCACCCTGGC...
...,...,...,...,...
78211,GM12878|chrX:153601435-153601635,GCGCCTGGGCGTCTGGGAGCTGGGGTGGAGGTGGGAGGGAGGGCTG...,GM12878|chrX:153200245-153200445,CCGGGCTCGGCCGGGGCCCTCGGGAGCATGCGCGGCAGCCACCCGG...
78212,GM12878|chrX:153602144-153602344,GGTCGCCCATTCCCAAGCTCCCACCTTGACGGTGCACTCGGAGCAC...,GM12878|chrX:153199645-153199845,CTATTCTCTGCTCCCTTCGGGCGCCTTCTTTTCCTTTGCCCCTGTC...
78213,GM12878|chrX:153602144-153602344,GGTCGCCCATTCCCAAGCTCCCACCTTGACGGTGCACTCGGAGCAC...,GM12878|chrX:153199845-153200045,GCCATGGTAGAAGTAGTATTTCATCTGGTAGTTCTCGGGCAGGCAG...
78214,GM12878|chrX:153602144-153602344,GGTCGCCCATTCCCAAGCTCCCACCTTGACGGTGCACTCGGAGCAC...,GM12878|chrX:153200045-153200245,GTGACGATCTGACTTTGCACAACGCCCGGAGCACAGGAACAGGAGT...


4385 enhancer fragments
8051 promoter fragments


# BALANCE ENH & PRO FRAGMENTS

In [336]:
from collections import Counter
 
most_freq_promoters = [p[0] for p in Counter(df_temp['promoter_name']).most_common(3189)]
df_temp_balanced = df_temp[df_temp['promoter_name'].isin(most_freq_promoters)].reset_index(drop=True)

print(len(set(df_temp_balanced['enhancer_name'])))
print(len(set(df_temp_balanced['promoter_name'])))
print(len(df_temp_balanced))

df_temp_balanced.to_csv('gcn/data/{}/frag_pairs_balanced.csv'.format(CELL_LINE), index=False)

3189
3189
62161


# TRAIN (cv) and TEST (ind) SPLIT

In [326]:
from sklearn.model_selection import train_test_split

df_enh_frags = df_temp_balanced.drop_duplicates(subset=['enhancer_name'])[['enhancer_name', 'enhancer_seq']].reset_index(drop=True)
df_pro_frags = df_temp_balanced.drop_duplicates(subset=['promoter_name'])[['promoter_name', 'promoter_seq']].reset_index(drop=True)

df_enh_cv, df_enh_ind = train_test_split(df_enh_frags, test_size=0.1, random_state=42)
df_pro_cv, df_pro_ind = train_test_split(df_pro_frags, test_size=0.1, random_state=42)

df_enh_cv = df_enh_cv.reset_index(drop=True)
df_enh_ind = df_enh_ind.reset_index(drop=True)
df_pro_cv = df_pro_cv.reset_index(drop=True)
df_pro_ind = df_pro_ind.reset_index(drop=True)

print(len(df_enh_cv), 'train fragments and', len(df_enh_ind), 'test fragments')

2870 train fragments and 319 test fragments


# WRITE AS FASTA FILES (for bert-enhancer)

In [337]:
lines = []
for i in range(len(df_enh_cv)):
    lines.append('>' + df_enh_cv['enhancer_name'][i])
    lines.append(df_enh_cv['enhancer_seq'][i])

open('bert/data/enhancer.cv.txt', 'w').close()

with open('bert/data/enhancer.cv.txt', 'w') as f:
    f.write('\n'.join(lines))

In [338]:
lines = []
for i in range(len(df_pro_cv)):
    lines.append('>' + df_pro_cv['promoter_name'][i])
    lines.append(df_pro_cv['promoter_seq'][i])

open('bert/data/promoter.cv.txt', 'w').close()

with open('bert/data/promoter.cv.txt', 'w') as f:
    f.write('\n'.join(lines))

In [339]:
lines = []
for i in range(len(df_enh_ind)):
    lines.append('>' + df_enh_ind['enhancer_name'][i])
    lines.append(df_enh_ind['enhancer_seq'][i])

open('bert/data/enhancer.ind.txt', 'w').close()

with open('bert/data/enhancer.ind.txt', 'w') as f:
    f.write('\n'.join(lines))

In [340]:
lines = []
for i in range(len(df_pro_ind)):
    lines.append('>' + df_pro_ind['promoter_name'][i])
    lines.append(df_pro_ind['promoter_seq'][i])

open('bert/data/promoter.ind.txt', 'w').close()

with open('bert/data/promoter.ind.txt', 'w') as f:
    f.write('\n'.join(lines))

In [380]:
lines = []
for i in range(len(df_enh_frags)):
    lines.append('>' + df_enh_frags['enhancer_name'][i])
    lines.append(df_enh_frags['enhancer_seq'][i])

open('bert/data/enhancer.all.txt', 'w').close()

with open('bert/data/enhancer.all.txt', 'w') as f:
    f.write('\n'.join(lines))

In [381]:
lines = []
for i in range(len(df_pro_frags)):
    lines.append('>' + df_pro_frags['promoter_name'][i])
    lines.append(df_pro_frags['promoter_seq'][i])

open('bert/data/promoter.all.txt', 'w').close()

with open('bert/data/promoter.all.txt', 'w') as f:
    f.write('\n'.join(lines))