# Create DNA dataset out of RNA dataset 

To train 3'UTR DNA models, we create a DNA dataset from the RNA dataset by reverse-complementing the sequences whose 3'UTR lies on the negative strand; sequences on the positive strand are copied unchanged.

In [1]:
import pandas as pd
from textwrap import wrap

In [2]:
# Strand lookup for the 3'UTR regions: Series indexed by sequence name, values are the strand symbols

data_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/'

# Only columns 3 and 5 of the header-less BED file are read and given names
utr_bed = data_dir + 'UTR_coords/GRCh38_3_prime_UTR_clean-sorted.bed'
utr_table = pd.read_csv(utr_bed, sep='\t', header=None,
                        usecols=[3, 5], names=['seq_name', 'strand'])
strand_info = utr_table.set_index('seq_name')['strand']

In [3]:
# fasta_in: FASTA file read by the conversion loop
# fasta_out: FASTA file the conversion loop writes (later indexed with samtools faidx)
fasta_in =  data_dir + 'fasta/241_mammals.shuffled.fa'
fasta_out =  data_dir + 'fasta/241_mammals_DNA.shuffled.fa'

In [14]:
# State of the FASTA pass: the sequence accumulated so far and the header line it belongs to.
# seq_header stays None until the first '>' line is seen, so the first dump_seq call writes nothing.
seq = ''
seq_header = None

# Translation table for DNA complementation, built once instead of on every call.
# Covers A/C/G/T plus the IUPAC ambiguity codes that have a distinct complement
# (R<->Y, K<->M, B<->V, D<->H), in both cases. Self-complementary codes (S, W, N)
# and any other character (e.g. gaps '-') are absent from the table and pass through unchanged.
_COMPLEMENT_TABLE = str.maketrans('ACGTRYKMBVDHacgtrykmbvdh',
                                  'TGCAYRMKVBHDtgcayrmkvbhd')

def reverse_complement(seq):
    '''
    Take sequence reverse complement

    Every base is replaced by its complement (case is preserved) and the
    result is reversed. IUPAC ambiguity codes are complemented as well;
    characters without a defined complement are kept as they are.

    seq: DNA sequence string
    returns: reverse-complemented sequence string of the same length
    '''
    return seq.translate(_COMPLEMENT_TABLE)[::-1]
    
def dump_seq(fout,seq_header,seq):
    '''
    Write one FASTA record to fout, reverse-complementing negative-strand sequences

    Nothing is written when seq_header or seq is empty/None (e.g. before the
    first header of the input has been read).

    fout: writable text file object
    seq_header: full FASTA header line starting with '>', without trailing newline;
                the text between '>' and the first ':' is used as the key into
                the module-level strand_info Series
    seq: sequence as a single string without line breaks

    Raises KeyError if the sequence name is not present in strand_info.
    '''
    if not (seq_header and seq):
        return
    seq_name = seq_header.split(':')[0][1:]
    # Look the strand up before writing anything, so a failed lookup
    # does not leave a header without a sequence in the output file
    if strand_info.loc[seq_name]=='-':
        seq = reverse_complement(seq)
    fout.write(seq_header+'\n')
    # Fixed-width slicing instead of textwrap.wrap: wrap() is meant for prose and
    # breaks at hyphens/whitespace, which yields lines of unequal length for
    # sequences containing '-' and makes the file unusable for samtools faidx
    line_width = 80
    for start in range(0, len(seq), line_width):
        fout.write(seq[start:start+line_width]+'\n')

# Stream the input FASTA record by record: sequence lines are glued together
# until the next header appears, at which point the finished record is written out.
seq_idx = 0
with open(fasta_out,'w') as fout, open(fasta_in,'r') as fin:
    for line in fin:
        if not line.startswith('>'):
            # continuation of the current record
            seq += line.rstrip()
            continue
        # a new header closes the previous record (no-op before the first header)
        dump_seq(fout,seq_header,seq)
        seq_header, seq = line.rstrip(), ''
        seq_idx += 1
        if seq_idx%100_000==0:
            print(f'{seq_idx} sequences processed')

    # the last record has no following header, so it is flushed here
    dump_seq(fout,seq_header,seq)

100000 sequences processed
200000 sequences processed
300000 sequences processed
400000 sequences processed
500000 sequences processed
600000 sequences processed
700000 sequences processed
800000 sequences processed
900000 sequences processed
1000000 sequences processed
1100000 sequences processed
1200000 sequences processed
1300000 sequences processed
1400000 sequences processed
1500000 sequences processed
1600000 sequences processed
1700000 sequences processed
1800000 sequences processed
1900000 sequences processed
2000000 sequences processed
2100000 sequences processed
2200000 sequences processed
2300000 sequences processed
2400000 sequences processed
2500000 sequences processed
2600000 sequences processed
2700000 sequences processed
2800000 sequences processed
2900000 sequences processed
3000000 sequences processed
3100000 sequences processed
3200000 sequences processed
3300000 sequences processed
3400000 sequences processed
3500000 sequences processed
3600000 sequences processed
3

In [15]:
# Index the newly written FASTA with samtools (IPython shell escape; {fasta_out} is interpolated from Python)
!samtools faidx {fasta_out}