## This notebook contains the code that creates the train and test sets for proteins, both with and without alignment.

In [None]:
import pandas as pd
import ast

In [None]:
originalDf = pd.read_csv("dataset.csv")

In [None]:
# Reading in csv file containing aligned sequences
alignDf = pd.read_csv("AlignmentFiles.csv", names = ['id', 'Sequence Length', 'alignment'])

In [None]:
alignDf.head()

In [None]:
del originalDf['Sequence Length']

In [None]:
# Renaming column
originalDf.rename(columns = {'Unnamed: 0':'id'}, inplace = True)

In [None]:
# Updating id index to match correct index
originalDf['id'] += 1

In [None]:
# Renaming column
originalDf.rename(columns = {'#_AA_refereneced':'Sequence Length'}, inplace = True)

In [None]:
# Merging aligned sequence dataset with original dataset
# Allows to combine relevant information with aligned sequences
mergedDf = pd.merge(originalDf, alignDf, on=['id'], how='inner')

In [None]:
mergedDf

In [None]:
# Making a copy of the new dataframe in order to split into test and train sets
dfToSplit = mergedDf[['id', 'shortened_sequence', 'Updated Indexes', 'Sequence Length', 'alignment']].copy()

In [None]:
# Renaming columns
dfToSplit.rename(columns = {'Updated Indexes':'Updated_Indexes'}, inplace = True)

In [None]:
dfToSplit.head(10)

In [None]:
# Create test set consisting of every 4th row in the dataset
testSetDf = dfToSplit.iloc[::4,:]

In [None]:
# Ensuring the test set is same size as non-alignment test set
# in order to allow for consistent comparison in evaluation stages of project
testSetDf = testSetDf.sample(4523, random_state = 2)

In [None]:
# Add remaining rows to train dataset
trainSetDf = pd.concat([dfToSplit, testSetDf, testSetDf]).drop_duplicates(keep=False)

In [None]:
# Ensuring the train set is same size as non-alignment train set
# in order to allow for consistent comparison in evaluation stages of project
trainSetDf = trainSetDf.sample(13568, random_state = 2)

In [None]:
# order rows based on id
testSetDf = testSetDf.sort_values(by=['id'])

In [None]:
testSetDf

In [None]:
# order rows based on id
trainSetDf = trainSetDf.sort_values(by=['id'])

In [None]:
trainSetDf

In [None]:
# Extracting relevant indexes from dataframe in order to extract the corresponding class type
# for each amino acid in the sequence - to be outputted in the resulting text file
test_list = []

for rows in testSetDf.itertuples(): # iterating through rows in dataframe
    testSetIndexes = [rows.Updated_Indexes] 
    test_list.append(testSetIndexes) # Adding class indexes to list
    
flat_test_list = [] 

# Ensuring list is in correct format
for sublist in test_list:
    for item in sublist:
        flat_test_list.append(item)

In [None]:
# Converting elements in list from string to list
testdf_x = [ast.literal_eval(s) for s in flat_test_list]

In [None]:
classTypeList = []

for value in testdf_x: # for each value in the test list
    classType = ''
    for x in value: # Print corresponding class for each amino acid in the sequence
        classType += ((x[1] - x[0] + 1) * ((x[2]) + ' '))
    classTypeList.append(classType) # Add result to list

In [None]:
# add list as column in dataframe
testSetDf['correspondingClassToAA'] = classTypeList

In [None]:
# Replacing class letter with corresponding class number (i.e.) 0 for order, 1 for disorder, 2 for ambiguity
testSetDf['correspondingClassToAA'] = testSetDf['correspondingClassToAA'].str.replace('D','1')
testSetDf['correspondingClassToAA'] = testSetDf['correspondingClassToAA'].str.replace('S','0')
testSetDf['correspondingClassToAA'] = testSetDf['correspondingClassToAA'].str.replace('C','2')

In [None]:
# Replacing each letter in the protein sequence with one-hot encoding representation
#
# Every upper-case letter becomes 21 digits, each followed by a space:
#   - the 20 letters in oneHotLetters get positions 0-19, in the order listed
#   - B, J, O, U, X and Z all share the last position (20)
# The table is generated and applied in one pass over the column instead of one
# str.replace pass per letter; the resulting strings are the same.
oneHotLetters = 'ACDEFGHIKLMNPQRSTVWY'
sharedSlotLetters = 'BJOUXZ'
oneHotWidth = len(oneHotLetters) + 1

oneHotTable = {}
for letter in oneHotLetters + sharedSlotLetters:
    hotPosition = oneHotLetters.find(letter)
    if hotPosition == -1: # not one of the 20 listed letters -> shared last slot
        hotPosition = oneHotWidth - 1
    oneHotTable[ord(letter)] = ''.join(
        '1 ' if position == hotPosition else '0 '
        for position in range(oneHotWidth)
    )

# Characters without a table entry are left unchanged
testSetDf['shortened_sequence'] = testSetDf['shortened_sequence'].str.translate(oneHotTable)

In [None]:
# Writing non alignment test set data to file in correct format
# File will be renamed manually to 'test.dataset' when used in experimentation
# NOTE(review): mode 'a' appends, so re-running this cell adds the same records
# to the file again - confirm this is intended

# The with-block closes the file when the loop finishes, which flushes the
# write buffer to disk (previously the handle was never closed explicitly)
with open('nonAlignment_test.dataset','a', buffering = 4000000) as file:
    for idx,row in testSetDf.iterrows(): # iterate through dataset
        nnInputTest = '\n'.join([
            str(row['id']), # first line consists of protein ID
            str(row['Sequence Length']), # second line consists of protein sequence length
            ''.join(map(str, row['shortened_sequence'])), # third line consists of protein sequence
            str(row['correspondingClassToAA']), # final line consists of class corresponding to each amino acid in sequence
            '', # ends the final line
            '', # blank line separating records
        ])
        file.write(nnInputTest) # write to file

In [None]:
# Writing aligned test set data to file in correct format
# NOTE(review): mode 'a' appends, so re-running this cell adds the same records
# to the file again - confirm this is intended

# The with-block closes the file when the loop finishes, which flushes the
# write buffer to disk (previously the handle was never closed explicitly)
with open('test.dataset','a', buffering = 4000000) as file:
    for idx,row in testSetDf.iterrows(): # iterate through dataset
        nnInputTest = '\n'.join([
            str(row['id']), # first line consists of protein ID
            str(row['Sequence Length']), # second line consists of protein sequence length
            ''.join(map(str, row['alignment'])), # third line consists of aligned protein sequence
            str(row['correspondingClassToAA']), # final line consists of class corresponding to each amino acid in sequence
            '', # ends the final line
            '', # blank line separating records
        ])
        file.write(nnInputTest) # write to file

In [None]:
# Extracting relevant indexes from dataframe in order to extract the corresponding class type
# for each amino acid in the sequence - to be outputted in the resulting text file
train_list = []
for rows in trainSetDf.itertuples(): # iterating through rows in dataframe
    trainSetIndexes = [rows.Updated_Indexes]
    train_list.append(trainSetIndexes) # Adding class indexes to list

# Ensuring list is in correct format
flat_train_list = []
for sublist in train_list:
    for item in sublist:
        flat_train_list.append(item)

In [None]:
# Converting elements in list from string to list
traindf_x = [ast.literal_eval(s) for s in flat_train_list]

In [None]:
classTypeList = []

for value in traindf_x: # for each value in the test list
    classType = ''
    for x in value: # Printing corresponding class for each amino acid in the sequence
        classType += ((x[1] - x[0]+1) * ((x[2])+' '))
    classTypeList.append(classType)  # Add result to list

In [None]:
# add list as column in dataframe
trainSetDf['correspondingClassToAA'] = classTypeList

In [None]:
# Replacing class letter with corresponding class number (i.e.) 0 for order, 1 for disorder, 2 for ambiguity
trainSetDf['correspondingClassToAA'] = trainSetDf['correspondingClassToAA'].str.replace('D','1')
trainSetDf['correspondingClassToAA'] = trainSetDf['correspondingClassToAA'].str.replace('S','0')
trainSetDf['correspondingClassToAA'] = trainSetDf['correspondingClassToAA'].str.replace('C','2')

In [None]:
# Replacing each letter in the protein sequence with one-hot encoding representation
#
# Every upper-case letter becomes 21 digits, each followed by a space:
#   - the 20 letters in oneHotLetters get positions 0-19, in the order listed
#   - B, J, O, U, X and Z all share the last position (20)
# The table is generated and applied in one pass over the column instead of one
# str.replace pass per letter; the resulting strings are the same.
oneHotLetters = 'ACDEFGHIKLMNPQRSTVWY'
sharedSlotLetters = 'BJOUXZ'
oneHotWidth = len(oneHotLetters) + 1

oneHotTable = {}
for letter in oneHotLetters + sharedSlotLetters:
    hotPosition = oneHotLetters.find(letter)
    if hotPosition == -1: # not one of the 20 listed letters -> shared last slot
        hotPosition = oneHotWidth - 1
    oneHotTable[ord(letter)] = ''.join(
        '1 ' if position == hotPosition else '0 '
        for position in range(oneHotWidth)
    )

# Characters without a table entry are left unchanged
trainSetDf['shortened_sequence'] = trainSetDf['shortened_sequence'].str.translate(oneHotTable)

In [None]:
# Writing non alignment train set data to file in correct format
# File will be renamed manually to 'train.dataset' when used in experimentation
# NOTE(review): mode 'a' appends, so re-running this cell adds the same records
# to the file again - confirm this is intended

# The with-block closes the file when the loop finishes, which flushes the
# write buffer to disk (previously the handle was never closed explicitly)
with open('nonAlignment_train.dataset','a', buffering = 4000000) as file:
    for idx,row in trainSetDf.iterrows(): # iterate through dataset
        nnInputTrain = '\n'.join([
            str(row['id']), # first line consists of protein ID
            str(row['Sequence Length']), # second line consists of protein sequence length
            ''.join(map(str, row['shortened_sequence'])), # third line consists of protein sequence
            str(row['correspondingClassToAA']), # final line consists of class corresponding to each amino acid in sequence
            '', # ends the final line
            '', # blank line separating records
        ])
        file.write(nnInputTrain) # write to file

In [None]:
# Writing alignment train set data to file in correct format
# NOTE(review): mode 'a' appends, so re-running this cell adds the same records
# to the file again - confirm this is intended

# The with-block closes the file when the loop finishes, which flushes the
# write buffer to disk (previously the handle was never closed explicitly)
with open('train.dataset','a', buffering = 4000000) as file:
    for idx,row in trainSetDf.iterrows(): # iterate through dataset
        nnInputTrain = '\n'.join([
            str(row['id']), # first line consists of protein ID
            str(row['Sequence Length']), # second line consists of protein sequence length
            ''.join(map(str, row['alignment'])), # third line consists of aligned protein sequence
            str(row['correspondingClassToAA']), # final line consists of class corresponding to each amino acid in sequence
            '', # ends the final line
            '', # blank line separating records
        ])
        file.write(nnInputTrain) # write to file