In [21]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from keras.layers import LSTM
from keras.layers.embeddings import Embedding

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

AMINO_ACIDS = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
         'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

combined = [train_df, test_df]
train_df

Unnamed: 0,Sequence,Active
0,DKWL,0
1,FCHN,0
2,KDQP,0
3,FNWI,0
4,NKRM,0
...,...,...
111995,GSME,0
111996,DLPT,0
111997,SGHC,0
111998,KIGT,0


In [3]:
train_df['Active'].value_counts()

0    107787
1      4213
Name: Active, dtype: int64

In [4]:
labels = {}
for i, val in enumerate(AMINO_ACIDS):
    labels[val] = i

labels

{'A': 0,
 'C': 1,
 'D': 2,
 'E': 3,
 'F': 4,
 'G': 5,
 'H': 6,
 'I': 7,
 'K': 8,
 'L': 9,
 'M': 10,
 'N': 11,
 'P': 12,
 'Q': 13,
 'R': 14,
 'S': 15,
 'T': 16,
 'V': 17,
 'W': 18,
 'Y': 19}

In [12]:
def encode_seq(df):
    data_encoded = []
    for row in df['Sequence'].values:
        row_encoded = []
        for c in row:
            row_encoded.append(labels[c])
        data_encoded.append(np.array(row_encoded))

    return data_encoded

X = np.array(encode_seq(train_df))
y = train_df['Active'].values

In [13]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)

In [14]:
ros = RandomOverSampler(random_state=42)
X_train, y_train = ros.fit_resample(X_train, y_train)
y_train = to_categorical(y_train)

In [18]:
embedding_dim = 32

model = Sequential()
model.add(Embedding(20, embedding_dim, input_length=4))
model.add(LSTM(100))
model.add(Dense(2, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 4, 32)             640       
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 202       
Total params: 54,042
Trainable params: 54,042
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
model.fit(X_train, y_train, validation_split=0.33, epochs=3, batch_size=16)

Train on 50276 samples, validate on 24764 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x7fccb03f2c10>

In [22]:
# validation
y_pred = model.predict_classes(X_val)
print(y_pred)
y_test = y_val
print(y_test)
print(f1_score(y_test, y_pred), precision_score(y_test, y_pred), recall_score(y_test, y_pred), confusion_matrix(y_test, y_pred))

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
0.7163858013624955 0.7030260380014075 0.7302631578947368 [[35170   422]
 [  369   999]]


In [18]:
# make submission
X_test = np.array(encode_seq(test_df))

y_test = model.predict_classes(X_test)
submission = pd.DataFrame(y_test)
submission.to_csv('submission.csv', header=False, index=False)

In [29]:
submission.describe()

Unnamed: 0,0
count,48000.0
mean,0.112167
std,0.315575
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0
