In [19]:
import os
import csv
import os.path as osp
import torch
from Bio import pairwise2
import shutil
import pandas as pd

In [20]:
# Current working directory of the kernel at the moment this cell runs;
# every other path in the notebook is derived from it.
BASE_DIR = os.getcwd()

# '<BASE_DIR>/Data': the processing functions below read their inputs from
# here and create their output folders under it.
DATA_DIR = osp.join(BASE_DIR, 'Data')

In [21]:
# Full path of 'miRAW_Train_Validation.txt' inside DATA_DIR.
# NOTE(review): no cell visible here reads this constant (the processing call
# passes the bare file name instead) - confirm it is still needed.
FILENAME = osp.join(DATA_DIR, 'miRAW_Train_Validation.txt')

In [22]:
# Full path of 'processed_file.csv' inside DATA_DIR.
# NOTE(review): not read or written by any cell visible here - confirm it is
# still needed.
PROCESSED_FILE = osp.join(DATA_DIR, 'processed_file.csv')

In [6]:
def reverse(seq):
    """Return ``seq`` with its elements in reverse order.

    This is a plain reversal (no complementing of bases). The previous
    implementation rebuilt the same reversed slice once per character, which
    was quadratic in the sequence length; a single slice gives the identical
    result for strings, including ``""`` for the empty string.

    Args:
        seq: Sequence to reverse; the callers in this notebook pass ``str``.

    Returns:
        The reversed sequence, of the same type as ``seq``.
    """
    return seq[::-1]

In [7]:
# Substitution scores used by the seed alignment, keyed by (base, base) over
# the RNA alphabet 'ACGU': 1 for a pairing, 0 for everything else.
# The Watson-Crick pairs (A-U, G-C) and the G-U wobble pair all score 1.
_PAIRING_BASES = {
    ('A', 'U'), ('U', 'A'), ('G', 'C'), ('C', 'G'),  # Watson-Crick
    ('U', 'G'), ('G', 'U'),                          # G-U wobble
}
score_matrix = {
    (base_a, base_b): (1 if (base_a, base_b) in _PAIRING_BASES else 0)
    for base_a in 'ACGU'
    for base_b in 'ACGU'
}


def extended_seed_alignment(mi_seq, cts_r_seq):
    """Align the miRNA 5' end against a fixed slice of a reversed target site.

    The first 10 characters of ``mi_seq`` are globally aligned with characters
    5..14 of ``cts_r_seq`` using ``pairwise2.align.globaldx`` and the
    module-level ``score_matrix`` as the substitution table. Only the first
    alignment returned is used.

    Args:
        mi_seq: miRNA sequence.
        cts_r_seq: Reversed candidate-target-site sequence.

    Returns:
        Tuple ``(mi_esa, cts_r_esa, esa_score)``: the two aligned strings
        followed by the alignment score.
    """
    seed_region = mi_seq[:10]
    site_region = cts_r_seq[5:15]
    hits = pairwise2.align.globaldx(
        seed_region, site_region, score_matrix, one_alignment_only=True
    )
    # Indexing (rather than attribute access) mirrors the original code and
    # works whether the result is a plain tuple or a named tuple.
    best = hits[0]
    aligned_mi, aligned_cts, score = best[0], best[1], best[2]
    return str(aligned_mi), str(aligned_cts), score

In [8]:
def process_Train_val_file(filename, split_idx=None):
    """Write the candidate-target-site CSV for one train/validation file.

    Reads ``DATA_DIR/<filename>`` as tab-separated text (the first line is
    treated as a header and skipped). For every data row the mRNA sequence is
    upper-cased, ``T`` is replaced by ``U``, and the sequence is reversed; a
    40-character window is then slid over it one position at a time. Each
    window whose extended-seed alignment score against the miRNA is at least
    6.0 becomes one row of ``DATA_DIR/<stem>/<stem>.csv``, where ``<stem>`` is
    ``filename`` without its extension. The output folder is deleted and
    re-created on every call.

    Args:
        filename: Input file name, relative to ``DATA_DIR``.
        split_idx: ``"train"`` or ``"val"`` keeps only rows whose sixth column
            equals that value (rows without a sixth column are skipped); any
            other value, including the default ``None``, keeps every row.

    Returns:
        None. The result is the CSV written to disk.

    Raises:
        OSError: If the input cannot be read or the output folder cannot be
            reset.
        ValueError: If a label column is present but is not a number.
    """
    window_len = 40        # length of each candidate target site
    min_esa_score = 6.0    # windows scoring below this are discarded
    header = ["miRNA", "miRNA_esa", "rev_mRNA", "mRNA_CTS_esa", "esa_score", "Label", "Split"]

    # splitext only removes the final extension. The old split(".")[0] cut at
    # the FIRST dot, so a name such as ".txt" produced an empty stem and the
    # rmtree below would have targeted DATA_DIR itself.
    stem = osp.splitext(filename)[0]
    folderpath = osp.join(DATA_DIR, stem, '')
    csv_path = osp.join(folderpath, f'{osp.basename(stem)}.csv')

    # Read the input before touching the output folder, so a missing or
    # unreadable input does not destroy a previous result.
    with open(osp.join(DATA_DIR, filename), "r") as f:
        lines = f.readlines()

    # Reset the output folder; a real rmtree failure now surfaces instead of
    # being printed and followed by a confusing mkdir error.
    if osp.isdir(folderpath):
        shutil.rmtree(folderpath)
    os.makedirs(folderpath)

    # newline='' is required for csv.writer, otherwise extra blank lines
    # appear between rows on platforms that translate '\n'.
    with open(csv_path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)

        for line in lines[1:]:
            tokens = line.strip().split("\t")
            if len(tokens) < 4:
                # Blank or truncated line: nothing to align.
                continue

            mirna_seq, mrna_seq = tokens[1], tokens[3]
            label = float(tokens[4]) if len(tokens) > 4 else 0
            # The split tag is the sixth column when present. The old
            # expression fell back to tokens[-1], which for a 4-column row
            # was the mRNA sequence.
            sp = tokens[5] if len(tokens) > 5 else None
            if split_idx in ("train", "val") and sp != split_idx:
                continue

            mirna_seq = mirna_seq.upper().replace("T", "U")
            mrna_rev_seq = reverse(mrna_seq.upper().replace("T", "U"))

            for pos in range(len(mrna_rev_seq) - window_len + 1):
                mr_slid = mrna_rev_seq[pos: pos + window_len]
                mi_esa, r_cts_esa, esa_score = extended_seed_alignment(mirna_seq, mr_slid)
                if esa_score >= min_esa_score:
                    writer.writerow([mirna_seq, mi_esa, mr_slid, r_cts_esa, esa_score, label, sp])
    return None

In [9]:
# Build the candidate-site CSV for the train/validation file. The bare file
# name is resolved against DATA_DIR inside the function; no split filter is
# applied because split_idx is left at its default.
process_Train_val_file('miRAW_Train_Validation.txt')

In [10]:
def process_file(filename, split_idx=None):
    """Write one candidate-target-site CSV per data row of an input file.

    ``filename`` is opened exactly as given (it is not joined with
    ``DATA_DIR``) and read as tab-separated text; the first line is treated as
    a header and skipped. Data row ``i`` (0-based, counted after the header)
    is written to ``seq{i}.csv`` inside the output folder
    ``osp.join(DATA_DIR, <filename without extension>)``. When ``filename`` is
    an absolute path, ``osp.join`` discards ``DATA_DIR`` and the folder sits
    next to the input file. The output folder is deleted and re-created on
    every call.

    For each row the mRNA sequence is upper-cased, ``T`` is replaced by ``U``,
    and the sequence is reversed; a 40-character window is slid over it one
    position at a time. Windows whose extended-seed alignment score against
    the miRNA is at least 6.0 become CSV rows.

    Args:
        filename: Path of the input file.
        split_idx: ``"train"`` or ``"val"`` keeps only rows whose sixth column
            equals that value; rows that do not match still get a header-only
            ``seq{i}.csv``. Any other value, including the default ``None``,
            keeps every row.

    Returns:
        None. The results are the CSV files written to disk.

    Raises:
        OSError: If the input cannot be read or the output folder cannot be
            reset.
        ValueError: If a label column is present but is not a number.
    """
    window_len = 40        # length of each candidate target site
    min_esa_score = 6.0    # windows scoring below this are discarded
    header = ["miRNA", "miRNA_esa", "rev_mRNA", "mRNA_CTS_esa", "esa_score", "Label", "Split"]

    # splitext only removes the final extension. The old split(".")[0] cut at
    # the FIRST dot anywhere in the path, so an input under e.g.
    # "/home/user/.conda/..." made the rmtree below target "/home/user/".
    folderpath = osp.join(DATA_DIR, osp.splitext(filename)[0])

    # Read everything first so the input handle is closed before any output
    # is produced.
    with open(filename, "r") as f:
        lines = f.readlines()

    # Reset the output folder; a real rmtree failure now surfaces instead of
    # being printed and followed by a confusing mkdir error.
    if osp.isdir(folderpath):
        shutil.rmtree(folderpath)
    os.makedirs(folderpath)

    for i, line in enumerate(lines[1:]):
        tokens = line.strip().split("\t")
        if len(tokens) < 4:
            # Blank or truncated line: nothing to align, no file is written.
            continue

        mirna_seq, mrna_seq = tokens[1], tokens[3]
        label = float(tokens[4]) if len(tokens) > 4 else 0
        # The split tag is the sixth column when present. The old expression
        # fell back to tokens[-1], which for a 4-column row was the mRNA
        # sequence.
        sp = tokens[5] if len(tokens) > 5 else None
        keep_row = split_idx not in ("train", "val") or sp == split_idx

        # newline='' is required for csv.writer, otherwise extra blank lines
        # appear between rows on platforms that translate '\n'.
        with open(osp.join(folderpath, f'seq{i}.csv'), 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(header)
            if not keep_row:
                # Filtered-out rows keep their header-only file, as before,
                # so seq{i}.csv numbering stays aligned with the input rows.
                continue

            mirna_seq = mirna_seq.upper().replace("T", "U")
            mrna_rev_seq = reverse(mrna_seq.upper().replace("T", "U"))

            for pos in range(len(mrna_rev_seq) - window_len + 1):
                mr_slid = mrna_rev_seq[pos: pos + window_len]
                mi_esa, r_cts_esa, esa_score = extended_seed_alignment(mirna_seq, mr_slid)
                if esa_score >= min_esa_score:
                    writer.writerow([mirna_seq, mi_esa, mr_slid, r_cts_esa, esa_score, label, sp])
    return None


In [11]:
# Run process_file on miRAW_Test0.txt ... miRAW_Test9.txt from DATA_DIR, one
# call per file, with no split filter.
# NOTE(review): the commented-out INPUT_FILES_LIST lines look like leftover
# scaffolding that nothing visible here uses - consider deleting them.
# INPUT_FILES_LIST = [osp.join(DATA_DIR,'miRAW_Train_Validation.txt')]
for i in range(0, 10):
    input_file = osp.join(DATA_DIR, f'miRAW_Test{i}.txt')
    process_file(input_file)
#     INPUT_FILES_LIST.append(input_file)