In [1]:
import os
import sys
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from IPython.display import display
import statistics as st

In [77]:
# Dataset selector: substituted into the input path `gcn/data/<CELL_LINE>/...`
# and the output directory `bert/data/<CELL_LINE>/` used by the cells below.
# Options noted by the author: GM12878, HUVEC, HeLa-S3, IMR90, K562, NHEK, combined
CELL_LINE = 'combined'

In [78]:
# Read the enhancer/promoter fragment-pair table for the selected cell line
# and show it (pandas truncates the rendering of large frames on its own).
frag_pairs_csv = 'gcn/data/{}/frag_pairs_balanced.csv'.format(CELL_LINE)
ep_sequences = pd.read_csv(frag_pairs_csv)
display(ep_sequences)

Unnamed: 0,enhancer_name,enhancer_frag_name,enhancer_frag_seq,promoter_name,promoter_frag_name,promoter_frag_seq
0,K562|chr1:16399567-16400081,K562|chr1:16399567-16399767,CGCAGTTCCTCCGGGAAGCCAGGAAGAACCAGAGCTGGTTCCCAGG...,K562|chr1:16677563-16679703,K562|chr1:16677563-16677763,ATGGTACTTCCCCCAAATTTTCCTTGCCTCTTTAAATCCATGATGT...
1,K562|chr1:16399567-16400081,K562|chr1:16399567-16399767,CGCAGTTCCTCCGGGAAGCCAGGAAGAACCAGAGCTGGTTCCCAGG...,K562|chr1:16677563-16679703,K562|chr1:16677763-16677963,TACATGACAGAAACACAATCGAAGTCTCCCAATTAGCATTTAAACA...
2,K562|chr1:16399567-16400081,K562|chr1:16399567-16399767,CGCAGTTCCTCCGGGAAGCCAGGAAGAACCAGAGCTGGTTCCCAGG...,K562|chr1:16677563-16679703,K562|chr1:16677963-16678163,GCTTTTATTTTCCTAACAGAAATCGGGAGATTTAACAGACAAAGCA...
3,K562|chr1:16399567-16400081,K562|chr1:16399567-16399767,CGCAGTTCCTCCGGGAAGCCAGGAAGAACCAGAGCTGGTTCCCAGG...,K562|chr1:16677563-16679703,K562|chr1:16678163-16678363,CAAGAGAAGCCTTCCACAGTCACAGCATGGATTGCTGTCTCAAGCA...
4,K562|chr1:16399567-16400081,K562|chr1:16399567-16399767,CGCAGTTCCTCCGGGAAGCCAGGAAGAACCAGAGCTGGTTCCCAGG...,K562|chr1:16677563-16679703,K562|chr1:16678363-16678563,TTGATGGGCACCCGACCCCCGACTCCTCTCTCGCAAAGGAGTGAAA...
...,...,...,...,...,...,...
121104,IMR90|chrX:109084400-109085400,IMR90|chrX:109084800-109085000,CAATCTTTTCCTTCCACAGCAGACATTCTGACATGTAGCTCTCAGT...,IMR90|chrX:108976400-108976800,IMR90|chrX:108976600-108976800,GTGGCAGAGCCAGCCGAATCGCTGGCTGCAGACGGCTCGGCCCACC...
121105,IMR90|chrX:109084400-109085400,IMR90|chrX:109085000-109085200,TTTCAAAAGCTCCTTGGAAAAAAAATGCTATCTGTGGCTTGGATTT...,IMR90|chrX:108976400-108976800,IMR90|chrX:108976400-108976600,GCCGGCGCCTGGCACTCGGAAAGCTCGCAAAAAGGAACCGCGTGCC...
121106,IMR90|chrX:109084400-109085400,IMR90|chrX:109085000-109085200,TTTCAAAAGCTCCTTGGAAAAAAAATGCTATCTGTGGCTTGGATTT...,IMR90|chrX:108976400-108976800,IMR90|chrX:108976600-108976800,GTGGCAGAGCCAGCCGAATCGCTGGCTGCAGACGGCTCGGCCCACC...
121107,IMR90|chrX:109084400-109085400,IMR90|chrX:109085200-109085400,AGGGACAGTATACAAATTTTGAACTTAAAGAAAAGTTTTTGGTAAG...,IMR90|chrX:108976400-108976800,IMR90|chrX:108976400-108976600,GCCGGCGCCTGGCACTCGGAAAGCTCGCAAAAAGGAACCGCGTGCC...


In [79]:
# Report how many distinct values each identifier column holds.
for column_name, description in (
    ('enhancer_name', 'unique enhancers'),
    ('enhancer_frag_name', 'unique enhancer frags'),
    ('promoter_name', 'unique promoters'),
    ('promoter_frag_name', 'unique promoter frags'),
):
    print(len(set(ep_sequences[column_name])), description)

3079 unique enhancers
9903 unique enhancer frags
1010 unique promoters
9903 unique promoter frags


In [80]:
# Keep only the fragment-level columns and give them the shorter names the
# rest of the notebook uses (fragment name/sequence become enhancer_name,
# enhancer_seq, promoter_name, promoter_seq).
frag_column_names = {
    'enhancer_frag_name': 'enhancer_name',
    'enhancer_frag_seq': 'enhancer_seq',
    'promoter_frag_name': 'promoter_name',
    'promoter_frag_seq': 'promoter_seq',
}

# This cell overwrites `ep_sequences`, so a second run would otherwise fail with
# a KeyError (the *_frag_* columns no longer exist). Skip the step when the frame
# is already in its renamed form; any other unexpected layout still raises.
if list(ep_sequences.columns) != list(frag_column_names.values()):
    ep_sequences = ep_sequences[list(frag_column_names)].rename(columns=frag_column_names)

# TRAIN (cv) and TEST (ind) SPLIT

In [81]:
# One row per distinct fragment (first occurrence kept), with its sequence.
df_enh_frags = (
    ep_sequences[['enhancer_name', 'enhancer_seq']]
    .drop_duplicates(subset=['enhancer_name'])
    .reset_index(drop=True)
)
df_pro_frags = (
    ep_sequences[['promoter_name', 'promoter_seq']]
    .drop_duplicates(subset=['promoter_name'])
    .reset_index(drop=True)
)

# Hold out 10% of each fragment table as the independent ("ind") test set; the
# remaining 90% is the "cv" training set. Enhancers and promoters are split
# separately with the same fixed seed, and every part gets a fresh 0..n-1 index.
# NOTE(review): the split is per fragment, so fragments cut from the same parent
# region may land on both sides - confirm this is intended.
df_enh_cv, df_enh_ind = (
    part.reset_index(drop=True)
    for part in train_test_split(df_enh_frags, test_size=0.1, random_state=42)
)
df_pro_cv, df_pro_ind = (
    part.reset_index(drop=True)
    for part in train_test_split(df_pro_frags, test_size=0.1, random_state=42)
)

print(len(df_enh_cv), 'train fragments and', len(df_enh_ind), 'test fragments')

8912 train fragments and 991 test fragments


# WRITE AS FASTA FILES (for bert-enhancer)

In [82]:
# Create the per-cell-line output directory (and any missing parents).
# exist_ok=True makes this a no-op when the directory is already there, without
# the check-then-create race of a separate os.path.exists() test.
os.makedirs('bert/data/{}'.format(CELL_LINE), exist_ok=True)

In [83]:
# FASTA records for the training (cv) enhancer fragments: a '>name' header line
# followed by the sequence line, in table order.
lines = [
    text
    for frag_name, frag_seq in zip(df_enh_cv['enhancer_name'], df_enh_cv['enhancer_seq'])
    for text in ('>' + frag_name, frag_seq)
]

# Mode 'w' replaces any previous file; no newline is written after the last record.
fasta_path = 'bert/data/{}/enhancer.cv.txt'.format(CELL_LINE)
with open(fasta_path, 'w') as fasta_file:
    fasta_file.write('\n'.join(lines))

In [84]:
# FASTA records for the training (cv) promoter fragments: a '>name' header line
# followed by the sequence line, in table order.
lines = [
    text
    for frag_name, frag_seq in zip(df_pro_cv['promoter_name'], df_pro_cv['promoter_seq'])
    for text in ('>' + frag_name, frag_seq)
]

# Mode 'w' replaces any previous file; no newline is written after the last record.
fasta_path = 'bert/data/{}/promoter.cv.txt'.format(CELL_LINE)
with open(fasta_path, 'w') as fasta_file:
    fasta_file.write('\n'.join(lines))

In [85]:
# FASTA records for the held-out test (ind) enhancer fragments: a '>name' header
# line followed by the sequence line, in table order.
lines = [
    text
    for frag_name, frag_seq in zip(df_enh_ind['enhancer_name'], df_enh_ind['enhancer_seq'])
    for text in ('>' + frag_name, frag_seq)
]

# Mode 'w' replaces any previous file; no newline is written after the last record.
fasta_path = 'bert/data/{}/enhancer.ind.txt'.format(CELL_LINE)
with open(fasta_path, 'w') as fasta_file:
    fasta_file.write('\n'.join(lines))

In [86]:
# FASTA records for the held-out test (ind) promoter fragments: a '>name' header
# line followed by the sequence line, in table order.
lines = [
    text
    for frag_name, frag_seq in zip(df_pro_ind['promoter_name'], df_pro_ind['promoter_seq'])
    for text in ('>' + frag_name, frag_seq)
]

# Mode 'w' replaces any previous file; no newline is written after the last record.
fasta_path = 'bert/data/{}/promoter.ind.txt'.format(CELL_LINE)
with open(fasta_path, 'w') as fasta_file:
    fasta_file.write('\n'.join(lines))

In [87]:
# FASTA records for every distinct enhancer fragment (before the train/test
# split): a '>name' header line followed by the sequence line, in table order.
lines = [
    text
    for frag_name, frag_seq in zip(df_enh_frags['enhancer_name'], df_enh_frags['enhancer_seq'])
    for text in ('>' + frag_name, frag_seq)
]

# Mode 'w' replaces any previous file; no newline is written after the last record.
fasta_path = 'bert/data/{}/enhancer.all.txt'.format(CELL_LINE)
with open(fasta_path, 'w') as fasta_file:
    fasta_file.write('\n'.join(lines))

In [88]:
# FASTA records for every distinct promoter fragment (before the train/test
# split): a '>name' header line followed by the sequence line, in table order.
lines = [
    text
    for frag_name, frag_seq in zip(df_pro_frags['promoter_name'], df_pro_frags['promoter_seq'])
    for text in ('>' + frag_name, frag_seq)
]

# Mode 'w' replaces any previous file; no newline is written after the last record.
fasta_path = 'bert/data/{}/promoter.all.txt'.format(CELL_LINE)
with open(fasta_path, 'w') as fasta_file:
    fasta_file.write('\n'.join(lines))