# Generating DeepLoc Train/Valid/Test Data

In this notebook, we parse the original DeepLoc dataset to generate the train/test splits, and we hold out 10% of the training set (stratified by class) as a validation set. You can specify the maximum sequence length for the output data; in our case we used 6000 because of memory issues on EC2 when training on a GPU.

Additionally, we remove the ambiguous `Cytoplasm-Nucleus` class, as was done in the original DeepLoc paper, to mimic their data-cleaning process.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from Bio import SeqIO
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

In [2]:
# Fix the global NumPy seed once, up front, so a fresh top-to-bottom run is repeatable.
SEED = 42
np.random.seed(SEED)

In [3]:
# Longest sequence (inclusive) that is copied into the length-filtered FASTA.
max_seq_length = 6000

train_list = []
test_list = []

filtered_path = "../data/deeploc_data_{}.fasta".format(max_seq_length)
with open(filtered_path, 'w') as output_full_handle:
    for record in SeqIO.parse("../data/deeploc_data.fasta", "fasta"):
        # The header is split on single spaces: field 0 is the ID and
        # field 1 is the label; some records carry a third field.
        fields = record.description.split(" ")
        ID = fields[0]
        # Drop the trailing two characters of the label field.
        label = fields[1][:-2]
        seq = record.seq

        # A header with exactly three fields is routed to the test list;
        # every other record goes to the training list.
        bucket = test_list if len(fields) == 3 else train_list
        bucket.append((ID, label, seq))

        # Only this combined FASTA is length-filtered here; the two lists
        # above keep every record regardless of length.
        if len(seq) <= max_seq_length:
            SeqIO.write(record, output_full_handle, "fasta")

# Columns are positional: 0 = ID, 1 = label, 2 = sequence.
train_df = pd.DataFrame(train_list)
test_df = pd.DataFrame(test_list)

In [4]:
# Report (rows, columns) for the training table, then the test table.
for frame in (train_df, test_df):
    print(frame.shape)

(11231, 3)
(2773, 3)


In [5]:
# Preview the first rows of the training table (column 0 = ID, 1 = label, 2 = sequence).
train_df.head()

Unnamed: 0,0,1,2
0,Q5I0E9,Cell.membrane,"(M, E, V, L, E, E, P, A, P, G, P, G, G, A, D, ..."
1,P63033,Cell.membrane,"(M, M, K, T, L, S, S, G, N, C, T, L, N, V, P, ..."
2,Q9NR71,Cell.membrane,"(M, A, K, R, T, F, S, N, L, E, T, F, L, I, F, ..."
3,Q86XT9,Cell.membrane,"(M, G, N, C, Q, A, G, H, N, L, H, L, C, L, A, ..."
4,A2CI98,Cell.membrane,"(M, D, P, S, K, Q, G, T, L, N, R, V, E, N, S, ..."


In [6]:
# Per-label record counts in the training table (column 1 holds the label).
train_df[1].value_counts()

Nucleus                  3235
Cytoplasm                2034
Extracellular            1580
Mitochondrion            1208
Cell.membrane            1067
Endoplasmic.reticulum     689
Plastid                   605
Golgi.apparatus           286
Lysosome/Vacuole          257
Cytoplasm-Nucleus         146
Peroxisome                124
Name: 1, dtype: int64

In [7]:
# Per-label record counts in the test table (column 1 holds the label).
test_df[1].value_counts()

Nucleus                  808
Cytoplasm                508
Extracellular            393
Mitochondrion            302
Cell.membrane            273
Endoplasmic.reticulum    173
Plastid                  152
Golgi.apparatus           70
Lysosome/Vacuole          64
Peroxisome                30
Name: 1, dtype: int64

In [8]:
# Drop every training row labelled with the ambiguous dual-location class.
is_ambiguous = train_df[1] == 'Cytoplasm-Nucleus'
train_df = train_df[~is_ambiguous]

In [9]:
# Re-check the training table after filtering: overall shape, then per-label counts.
print(train_df.shape)
train_df[1].value_counts()

(11085, 3)


Nucleus                  3235
Cytoplasm                2034
Extracellular            1580
Mitochondrion            1208
Cell.membrane            1067
Endoplasmic.reticulum     689
Plastid                   605
Golgi.apparatus           286
Lysosome/Vacuole          257
Peroxisome                124
Name: 1, dtype: int64

In [10]:
# Map each distinct training label to an integer id, numbered in the order
# in which `unique()` returns the labels.
id_map = {name: idx for idx, name in enumerate(train_df[1].unique())}
print(id_map)

{'Cell.membrane': 0, 'Cytoplasm': 1, 'Endoplasmic.reticulum': 2, 'Golgi.apparatus': 3, 'Lysosome/Vacuole': 4, 'Mitochondrion': 5, 'Nucleus': 6, 'Peroxisome': 7, 'Plastid': 8, 'Extracellular': 9}


In [11]:
# Integer class id for every training row, in row order.
train_labels = [id_map[name] for name in train_df[1]]

In [12]:
# Peek at the first and last ten encoded labels.
print(train_labels[:10])
print(train_labels[-10:])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[9, 9, 9, 9, 9, 9, 9, 9, 9, 9]


In [13]:
# Hold out 10% of the training table as a validation set, stratified on the
# integer labels in `train_labels`. The seed is passed explicitly so that
# re-running this cell on its own reproduces the same split instead of
# depending on the current state of the global NumPy RNG.
train, validation = train_test_split(
    train_df,
    test_size=0.1,
    stratify=train_labels,
    random_state=42,
)

In [14]:
# Report (rows, columns) for the training part, then the validation part.
for part in (train, validation):
    print(part.shape)

(9976, 3)
(1109, 3)


In [15]:
# Per-label record counts in the training part of the split.
train[1].value_counts()

Nucleus                  2911
Cytoplasm                1831
Extracellular            1422
Mitochondrion            1087
Cell.membrane             960
Endoplasmic.reticulum     620
Plastid                   545
Golgi.apparatus           257
Lysosome/Vacuole          231
Peroxisome                112
Name: 1, dtype: int64

In [16]:
# Per-label record counts in the validation part of the split.
validation[1].value_counts()

Nucleus                  324
Cytoplasm                203
Extracellular            158
Mitochondrion            121
Cell.membrane            107
Endoplasmic.reticulum     69
Plastid                   60
Golgi.apparatus           29
Lysosome/Vacuole          26
Peroxisome                12
Name: 1, dtype: int64

In [20]:
def write_labeled_fasta(frame, path, label_ids, max_len):
    """Write the rows of ``frame`` to ``path`` as FASTA records.

    Each row supplies an ID (column 0), a label (column 1) and a sequence
    (column 2). The record id is the row's ID and the record description is
    the integer class id of the row's label, rendered as a string. Rows whose
    sequence is longer than ``max_len`` are not written.

    Args:
        frame: DataFrame with columns 0 (ID), 1 (label) and 2 (sequence).
        path: Output file path; opened in write mode, so existing content
            is replaced.
        label_ids: Mapping from label string to integer class id.
        max_len: Maximum sequence length (inclusive) that is written.

    Raises:
        KeyError: If a row's label is not a key of ``label_ids``. The lookup
            happens before the length check, so it also applies to rows that
            would have been skipped for length.
    """
    with open(path, 'w') as handle:
        for ID, label, seq in zip(frame[0], frame[1], frame[2]):
            rec = SeqRecord(seq, id=ID, description=str(label_ids[label]))
            if len(seq) <= max_len:
                SeqIO.write(rec, handle, "fasta")


# Export the three splits with the same label encoding and length cap.
for split_frame, split_path in (
    (train, "../data/deeploc_train.fasta"),
    (validation, "../data/deeploc_valid.fasta"),
    (test_df, "../data/deeploc_test.fasta"),
):
    write_labeled_fasta(split_frame, split_path, id_map, max_seq_length)