In [6]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Activation

def generate_input(df, window_radius = 3):
    """Cut every sequence in ``df`` into fixed-width, per-residue windows.

    Each row's ``sequence`` value is padded on both sides with
    ``window_radius`` underscore characters, then one window of
    ``2 * window_radius + 1`` characters is taken for every position of the
    unpadded sequence, so the residue at that position sits in the middle.

    Args:
        df: DataFrame whose rows expose a ``sequence`` attribute (a string).
        window_radius: number of neighbours kept on each side of a residue.

    Returns:
        A list with one entry per residue (rows in iteration order); each
        entry is a list of single-character strings.
    """
    pad = "_" * window_radius
    width = window_radius * 2 + 1

    windows = []
    for _, row in df.iterrows():
        residues = row.sequence
        n_residues = len(residues)  # measured before padding is added
        padded = pad + residues + pad
        windows.extend(
            list(padded[start:start + width]) for start in range(n_residues)
        )
    return windows

def generate_label(df):
    """Flatten the per-row ``label`` strings of ``df`` into one integer array.

    Every character of each row's ``label`` value is converted with ``int``
    and appended in order, so the result has one entry per character across
    all rows (rows in iteration order).

    Args:
        df: DataFrame whose rows expose an iterable ``label`` attribute.

    Returns:
        A 1-D ``numpy.ndarray`` holding the converted labels.
    """
    flat_labels = []
    for _, row in df.iterrows():
        flat_labels.extend(int(symbol) for symbol in row.label)
    return np.array(flat_labels)


if __name__ == "__main__":

    ###### 1. data preparation ######

    # Read the labelled table and the unlabelled test table.
    # generate_input reads a `sequence` value from every row, and
    # generate_label reads a `label` value from every labelled row.
    train_val_df = pd.read_csv('train.csv')
    test_df      = pd.read_csv('test.csv')

    # Hold out part of the labelled rows for validation (this is NOT the
    # train/test split). The split is made on rows before windowing, so all
    # windows cut from one row end up in the same subset.
    train_df, val_df = train_test_split(train_val_df, random_state=0)

    # Cut each sequence into per-residue windows of 2 * window_radius + 1 symbols.
    window_radius = 20
    train_data_ = generate_input(train_df, window_radius)
    val_data_   = generate_input(val_df, window_radius)
    test_data_  = generate_input(test_df, window_radius)

    # One-hot encode the window symbols. The encoder is fitted on the training
    # windows only and reused unchanged for validation and test.
    # handle_unknown='ignore' keeps transform() from raising when a symbol
    # shows up at some window position in validation/test that was never seen
    # there during fitting; such a position is encoded as all zeros, and the
    # output width is unchanged.
    transformer = OneHotEncoder(handle_unknown='ignore').fit(train_data_)
    train_data  = transformer.transform(train_data_)
    val_data    = transformer.transform(val_data_)
    test_data   = transformer.transform(test_data_)

    # Flatten the per-residue labels.
    # Note: there is NO label information for the test dataset.
    train_label = generate_label(train_df)
    val_label   = generate_label(val_df)
    # test_label = None

    # Rename for interpretability.
    X_train, Y_train = train_data, train_label
    X_val,   Y_val   = val_data,   val_label
    X_test           = test_data

    # Each window needs exactly one label; fail here with a clear message
    # rather than later inside model fitting.
    assert X_train.shape[0] == len(Y_train), "train windows and labels differ in length"
    assert X_val.shape[0] == len(Y_val), "validation windows and labels differ in length"

    print(np.shape(X_val))

  

(375781, 901)


In [9]:
    # Fully connected binary classifier over the one-hot encoded windows:
    # two ReLU hidden layers and a single sigmoid output unit.
    #
    # The input width is read from the encoded training matrix rather than
    # hard-coded, because it depends on window_radius and on which symbols the
    # encoder saw in the training split.
    n_features = X_train.shape[1]

    model = Sequential()
    model.add(Dense(200, input_dim = n_features, activation = 'relu'))
    model.add(Dense(300, activation = 'relu'))
    model.add(Dense(1, activation = 'sigmoid'))

    model.compile(loss = 'binary_crossentropy',optimizer = 'adam',metrics = ['accuracy'])

    # Validation data is only used for monitoring; it is not used for fitting.
    history = model.fit(X_train, Y_train, epochs=10, batch_size=2000, validation_data=(X_val, Y_val))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
    # Score every test window and collapse the model output into a flat
    # 1-D array with one value per window.
    predicted = np.array(model.predict(X_test)).flatten()
    print(predicted)

[6.3320662e-08 3.7277490e-02 9.2642099e-02 ... 2.6901633e-02 1.3138056e-03
 1.5978113e-09]


In [11]:
    # Build one output row per test residue, in the same order in which
    # generate_input produced the windows (row by row, then position by
    # position), so row i lines up with predicted[i].
    sequence_id_list    = []
    residue_number_list = []
    for sequence_id, sequence in zip(test_df["sequence_id"], test_df["sequence"]):
        n_residues = len(sequence)
        sequence_id_list.extend([sequence_id] * n_residues)
        residue_number_list.extend(range(1, n_residues + 1))  # 0-origin to 1-origin

    # Fail with an explicit message if the predictions do not cover exactly
    # the residues listed above.
    if len(predicted) != len(sequence_id_list):
        raise ValueError(
            "number of predictions (%d) does not match number of test residues (%d)"
            % (len(predicted), len(sequence_id_list))
        )

    predicted_df = pd.DataFrame.from_dict({
        "sequence_id": sequence_id_list,
        "residue_number": residue_number_list,
        "predicted_value": predicted,
        })
    # NOTE(review): the file name mentions 5 layers, while the model defined in
    # this notebook has three Dense layers - confirm the intended name.
    predicted_df.to_csv('output_5_layer_nn_1.csv', index=False)