In [29]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import optimizers
from keras.layers import Dense, Activation
from keras.models import Sequential
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

def generate_input(df, window_radius = 3):
    """Cut every sequence in ``df`` into fixed-width, per-residue windows.

    Each row's ``sequence`` string is padded on both ends with
    ``window_radius`` underscore characters. One window of
    ``2 * window_radius + 1`` characters is then produced for every residue
    of the unpadded sequence, with that residue at the centre.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows expose a ``sequence`` string.
    window_radius : int, default 3
        Number of neighbouring positions kept on each side of the centre.

    Returns
    -------
    list of list of str
        One list of single characters per residue, ordered by row and then
        by position within the sequence.
    """
    pad = "_" * window_radius          # spacer so edge residues get full windows
    width = 2 * window_radius + 1
    windows = []
    for _, row in df.iterrows():
        residues = row.sequence
        padded = pad + residues + pad
        # One window per residue of the original (unpadded) sequence.
        windows.extend(
            list(padded[start:start + width]) for start in range(len(residues))
        )
    return windows

def generate_label(df):
    """Flatten the per-residue label strings of ``df`` into one array.

    Every character of each row's ``label`` value is converted with ``int``
    and the results are concatenated in row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows expose a ``label`` iterable of digit characters.

    Returns
    -------
    numpy.ndarray
        1-D array holding one integer per label character.
    """
    flags = [int(symbol) for _, row in df.iterrows() for symbol in row.label]
    return np.array(flags)


if __name__ == "__main__":

    ###### 1. data preparation ######

    # Read the labelled data and the unlabelled test data.
    # Paths are relative to the kernel's working directory.
    train_val_df = pd.read_csv('train.csv')
    test_df      = pd.read_csv('test.csv')

    # Split the labelled data into train and validation subsets
    # (this is NOT the train/test split; the test set is a separate file).
    # train_test_split is sklearn.model_selection.train_test_split and must be
    # imported explicitly, otherwise a fresh kernel raises NameError here.
    train_df, val_df = train_test_split(train_val_df, random_state=0)

    # Extract one fixed-width window of residues per sequence position.
    window_radius = 3
    train_data_ = generate_input(train_df, window_radius)
    val_data_   = generate_input(val_df, window_radius)
    test_data_  = generate_input(test_df, window_radius)

    # Encode each window of amino-acid symbols into a numerical vector.
    # MUST use the same transformer for all data without refit.
    # handle_unknown="ignore" encodes a symbol that never occurred at a given
    # window position in the training data as all zeros, instead of raising
    # during transform of the validation/test windows.
    transformer = OneHotEncoder(handle_unknown="ignore").fit(train_data_)
    train_data  = transformer.transform(train_data_)
    val_data    = transformer.transform(val_data_)
    test_data   = transformer.transform(test_data_)

    # Extract label information.
    # Note: NO LABEL INFORMATION for test dataset
    train_label = generate_label(train_df)
    val_label   = generate_label(val_df)
    # test_label = None

    # Each residue must contribute exactly one window and one label; fail
    # early with a clear message if a sequence and its label string differ
    # in length.
    assert train_data.shape[0] == len(train_label), \
        "train: number of windows does not match number of labels"
    assert val_data.shape[0] == len(val_label), \
        "validation: number of windows does not match number of labels"
    # All three matrices come from the same fitted encoder, so they must
    # share one feature width.
    assert train_data.shape[1] == val_data.shape[1] == test_data.shape[1], \
        "encoded feature width differs between datasets"

    # rename for interpretability
    X_train, Y_train = train_data, train_label
    X_val,   Y_val   = val_data,   val_label
    X_test           = test_data

  

In [31]:
    ###### 2. model definition and training ######

    # Fully connected binary classifier: three hidden ReLU layers followed by
    # a single sigmoid unit, trained with binary cross-entropy.
    # The input width is taken from the encoded training matrix rather than
    # hard-coded, because the number of one-hot columns depends on which
    # symbols the encoder saw at each window position.
    n_features = X_train.shape[1]

    model = Sequential()
    model.add(Dense(200, input_dim = n_features, activation = 'relu'))
    model.add(Dense(300, activation = 'relu'))
    model.add(Dense(200, activation = 'relu'))
    model.add(Dense(1, activation = 'sigmoid'))

    model.compile(loss = 'binary_crossentropy',optimizer = 'adam',metrics = ['accuracy'])

    # Validation data is only used for monitoring; it does not affect the
    # weight updates.
    history = model.fit(X_train, Y_train, epochs=150, batch_size=2000, validation_data=(X_val, Y_val))


Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


In [52]:
    # Score every test window and collapse the model output to a 1-D array
    # (one value per residue) before showing it.
    predicted = np.array(model.predict(X_test)).flatten()
    print(predicted)

[2.7121190e-15 3.0844718e-02 5.3759813e-03 ... 3.5771638e-02 1.0001516e-01
 6.1597434e-15]


In [53]:
    ###### 3. write predictions ######

    # Build one (sequence_id, residue_number) pair per residue of every test
    # sequence, in the same row-then-position order in which the test windows
    # were generated, so the pairs line up with `predicted`.
    sequence_id_list    = []
    residue_number_list = []
    for _, item in test_df.iterrows():
        n_residues = len(item.sequence)
        sequence_id_list.extend([item.sequence_id] * n_residues)
        residue_number_list.extend(range(1, n_residues + 1)) #0-origin to 1-origin

    # pandas raises ValueError here if the three columns differ in length.
    predicted_df = pd.DataFrame({
        "sequence_id": sequence_id_list,
        "residue_number": residue_number_list,
        "predicted_value": predicted,
        })
    # index=False: do not write the row index as an extra CSV column.
    predicted_df.to_csv('output_5_layer_nn.csv', index=False)