In [23]:
from Bio import SeqIO
import pandas as pd
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

In [50]:
# Single pass over the DeepLoc FASTA file that (a) sorts every record into the
# train or test list as an (ID, label, sequence) tuple and (b) copies every
# record whose sequence is at most `max_seq_length` residues into a
# length-filtered FASTA file.
train_list = []
test_list = []

max_seq_length = 1024

# The context manager guarantees the filtered file is flushed and closed, even
# if parsing raises part-way through (the handle used to be left open).
with open("deeploc_data_1024.fasta", 'w') as output_full_handle:
    for record in SeqIO.parse("data/deeploc_data.fasta", "fasta"):
        seq = record.seq
        desc_split = record.description.split(" ")

        ID = desc_split[0]
        # Drop the last two characters of the label token.
        # NOTE(review): presumably a "-M"/"-S"/"-U" style suffix -- confirm
        # against the data file.
        label = desc_split[1][:-2]

        if len(desc_split) == 3:
            # A three-token header marks a test record.
            # NOTE(review): presumably the third token is a "test" flag --
            # confirm against the data file.
            test_list.append((ID, label, seq))
        else:
            # Only training records have the dual-location class folded
            # into "Cytoplasm".
            if label == "Cytoplasm-Nucleus":
                label = "Cytoplasm"
            train_list.append((ID, label, seq))

        # The length filter applies to the FASTA copy only; the in-memory
        # lists keep every record regardless of length.
        if len(seq) <= max_seq_length:
            SeqIO.write(record, output_full_handle, "fasta")

# Columns are positional: 0 = ID, 1 = label, 2 = sequence.
train_df = pd.DataFrame(train_list)
test_df = pd.DataFrame(test_list)




In [53]:
# Give every label an integer class index, numbered in the order the labels
# first appear in the test set (column 1 holds the label).
id_map = dict(
    (label_name, class_idx)
    for class_idx, label_name in enumerate(test_df[1].unique())
)
print(id_map)

{'Cell.membrane': 0, 'Cytoplasm': 1, 'Endoplasmic.reticulum': 2, 'Golgi.apparatus': 3, 'Lysosome/Vacuole': 4, 'Mitochondrion': 5, 'Nucleus': 6, 'Peroxisome': 7, 'Plastid': 8, 'Extracellular': 9}


In [57]:
# Print, for the train split and then the test split, how many rows carry each
# label known to `id_map` (column 0 = ID, column 1 = label).
for header, frame in (
    ("LABEL COUNTS FOR TRAIN SET", train_df),
    ("\n\nLABEL COUNTS FOR TEST SET", test_df),
):
    print(header)
    for label_name in id_map:
        n_rows = frame.loc[frame[1] == label_name, 0].count()
        print("{}: {}".format(label_name, n_rows))

LABEL COUNTS FOR TRAIN SET
Cell.membrane: 1067
Cytoplasm: 2180
Endoplasmic.reticulum: 689
Golgi.apparatus: 286
Lysosome/Vacuole: 257
Mitochondrion: 1208
Nucleus: 3235
Peroxisome: 124
Plastid: 605
Extracellular: 1580


LABEL COUNTS FOR test SET
Cell.membrane: 273
Cytoplasm: 508
Endoplasmic.reticulum: 173
Golgi.apparatus: 70
Lysosome/Vacuole: 64
Mitochondrion: 302
Nucleus: 808
Peroxisome: 30
Plastid: 152
Extracellular: 393


In [58]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows as a validation set. The fixed
# random_state keeps the shuffle, and therefore the split, reproducible.
train, validation = train_test_split(train_df, test_size=0.1, random_state=42)


In [59]:
def _write_labeled_fasta(frame, handle):
    """Write the rows of ``frame`` to ``handle`` as FASTA records.

    Each row is expected to hold (ID, label, sequence) in columns 0, 1 and 2.
    The record description is the integer class index from ``id_map``, written
    as a string. Rows whose sequence is longer than ``max_seq_length`` are
    skipped.

    Raises:
        KeyError: if a row's label is missing from ``id_map``.
    """
    for _, row in frame.iterrows():
        seq = row[2]
        # Build the record before the length check so an unknown label is
        # reported even for sequences that would be filtered out.
        rec = SeqRecord(seq, id=row[0], description=str(id_map[row[1]]))
        if len(seq) <= max_seq_length:
            SeqIO.write(rec, handle, "fasta")


# Each split goes to its own file. The `with` blocks guarantee every file is
# flushed and closed (the handles used to be left open).
with open("data/deeploc_train.fasta", 'w') as output_train_handle:
    _write_labeled_fasta(train, output_train_handle)

with open("data/deeploc_valid.fasta", 'w') as output_valid_handle:
    _write_labeled_fasta(validation, output_valid_handle)

with open("data/deeploc_test.fasta", 'w') as output_test_handle:
    _write_labeled_fasta(test_df, output_test_handle)
    
    

    