In [12]:
from Bio import SeqIO
import pandas as pd
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

In [13]:
# Parse the DeepLoc FASTA file, collect (ID, label, sequence) tuples into
# train/test lists, and write a length-filtered copy of the dataset.
train_list = []
test_list = []

# Only sequences of at most this many residues are copied to the filtered
# FASTA file. The train/test lists themselves are NOT length-filtered here.
max_seq_length = 1024

# `with` guarantees the output file is flushed and closed even if parsing
# fails part-way through (the handle was previously left open).
with open("deeploc_data_1024.fasta", 'w') as output_full_handle:
    for record in SeqIO.parse("data/deeploc_data.fasta", "fasta"):
        seq = record.seq
        desc_split = record.description.split(" ")

        ID = desc_split[0]
        # Drop the trailing two characters of the second header token
        # (presumably a "-M"/"-S"/"-U" style suffix -- TODO confirm against
        # the dataset documentation).
        label = desc_split[1][:-2]

        # A header with three space-separated tokens is routed to the test
        # list; every other record goes to the training list. Previously both
        # branches appended to train_list, so test_list stayed empty.
        # NOTE(review): assumes the third token is the dataset's test marker
        # -- confirm against the source FASTA.
        if len(desc_split) == 3:
            test_list.append((ID, label, seq))
        else:
            train_list.append((ID, label, seq))

        if len(seq) <= max_seq_length:
            SeqIO.write(record, output_full_handle, "fasta")

# Columns are positional: 0 = ID, 1 = label, 2 = sequence.
train_df = pd.DataFrame(train_list)
test_df = pd.DataFrame(test_list)




In [14]:
# Map each distinct label in column 1 of train_df to an integer id, numbered
# in order of first appearance (the order returned by Series.unique()).
id_map = {name: position for position, name in enumerate(train_df[1].unique())}
print(id_map)

{'Cell.membrane': 0, 'Cytoplasm-Nucleus': 1, 'Cytoplasm': 2, 'Endoplasmic.reticulum': 3, 'Golgi.apparatus': 4, 'Lysosome/Vacuole': 5, 'Mitochondrion': 6, 'Nucleus': 7, 'Peroxisome': 8, 'Plastid': 9, 'Extracellular': 10}


In [15]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows of train_df as a validation set; the fixed
# random_state makes the shuffle reproducible across runs.
train, validation = train_test_split(
    train_df,
    test_size=0.1,
    random_state=42,
)


In [16]:
def _write_labelled_records(frame, handle, label_ids, max_len):
    """Write the rows of `frame` to `handle` as FASTA records.

    Each row is read positionally: column 0 is the record ID, column 1 the
    label, column 2 the sequence. The record description is the integer id
    of the label (as a string), looked up in `label_ids`. Rows whose
    sequence is longer than `max_len` are skipped.

    Raises:
        KeyError: if a row's label is not a key of `label_ids` (the lookup
            happens before the length check, so this applies to skipped
            rows as well).
    """
    for _, row in frame.iterrows():
        seq = row[2]
        rec = SeqRecord(seq, id=row[0], description=str(label_ids[row[1]]))
        if len(seq) <= max_len:
            SeqIO.write(rec, handle, "fasta")


# Each output file is opened in a `with` block so it is flushed and closed
# as soon as its split has been written (the handles were previously leaked).

# NOTE(review): only rows from position 6142 onward of `train` are written;
# the first 6142 rows are skipped. This looks like a leftover from an
# interrupted/debugging run -- confirm whether the full split is intended.
with open("data/deeploc_train.fasta", 'w') as output_train_handle:
    _write_labelled_records(train.iloc[6142:], output_train_handle,
                            id_map, max_seq_length)

with open("data/deeploc_valid.fasta", 'w') as output_valid_handle:
    _write_labelled_records(validation, output_valid_handle,
                            id_map, max_seq_length)

with open("data/deeploc_test.fasta", 'w') as output_test_handle:
    _write_labelled_records(test_df, output_test_handle,
                            id_map, max_seq_length)
    
    

    