In [7]:
import argparse
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import time
from sklearn.utils import shuffle

In [8]:
def generate_input(df, window_radius=1):
    """Cut every sequence in ``df`` into one fixed-width window per residue.

    Each sequence is padded on both sides with ``window_radius`` underscore
    characters so that the windows centred on the first and last residues
    have the same width as all the others.

    Args:
        df: DataFrame whose rows expose a ``sequence`` string.
        window_radius: Number of neighbours kept on each side of the centre
            residue; every window is ``2 * window_radius + 1`` characters wide.

    Returns:
        A list with one entry per residue (in row order, then residue order),
        each entry being the window as a list of single characters.
    """
    pad = "_" * window_radius
    width = window_radius * 2 + 1
    windows = []
    for _, row in df.iterrows():
        residues = row.sequence
        padded = pad + residues + pad  # spacer on both ends
        windows.extend(
            list(padded[start:start + width]) for start in range(len(residues))
        )
    return windows

def generate_label(df):
    """Flatten the per-residue label strings of ``df`` into one integer array.

    Args:
        df: DataFrame whose rows expose a ``label`` iterable with one
            digit character per residue.

    Returns:
        A 1-D NumPy array holding ``int(character)`` for every character of
        every row's label, in row order.
    """
    flat = []
    for _, row in df.iterrows():
        flat.extend(int(symbol) for symbol in row.label)
    return np.array(flat)


In [9]:
    # Load the labelled training table and the test table from CSV.
    train_val_df = pd.read_csv('/content/train.csv')
    test_df = pd.read_csv('/content/test.csv')

    # Hold out part of the labelled rows for validation. This is a
    # train/validation split, not a train/test split; the seed is fixed at 0.
    train_df, val_df = train_test_split(train_val_df, random_state=0)

    # Turn every residue into a window of 2 * window_radius + 1 characters.
    window_radius = 20
    train_data_, val_data_ = (
        generate_input(frame, window_radius) for frame in (train_df, val_df)
    )
    test_data_ = None if test_df is None else generate_input(test_df, window_radius)

  

In [10]:
# Build the character -> integer-id vocabulary used to encode windows.
# The mapping is built from the training windows only and MUST be reused
# unchanged (no refit) for the validation and test data.
word2index = {}
for window in train_data_:
  for symbol in window:
    # setdefault keeps the id of an already-seen symbol and otherwise assigns
    # the next free id, i.e. the current vocabulary size.
    word2index.setdefault(symbol, len(word2index))

def sentence2index(data):
  """Encode windows of symbols as lists of integer ids via ``word2index``.

  Args:
      data: Iterable of windows, each an iterable of symbols.

  Returns:
      A list with one list of ids per window. A symbol that is missing from
      ``word2index`` raises ``KeyError``.
  """
  encoded = []
  for window in data:
    encoded.append([word2index[symbol] for symbol in window])
  return encoded

# Helper that groups the data into mini-batches.
def train2batch(title, category, batch_size=10000):
  """Shuffle inputs and labels together and slice them into mini-batches.

  Args:
      title: Sequence of encoded inputs.
      category: Sequence of labels aligned with ``title``.
      batch_size: Maximum number of items per batch; the last batch may be
          smaller.

  Returns:
      ``(title_batch, category_batch)``: two lists of equal length whose
      i-th entries are the inputs and the labels of batch ``i``.
  """
  shuffled_titles, shuffled_categories = shuffle(title, category)
  starts = range(0, len(title), batch_size)
  title_batch = [shuffled_titles[s:s + batch_size] for s in starts]
  category_batch = [shuffled_categories[s:s + batch_size] for s in starts]
  return title_batch, category_batch

# Extract the per-residue labels.
# Note: the test dataset carries NO label information, so there is no
# test_label (test_label = None).
train_label = generate_label(train_df)
val_label   = generate_label(val_df)
# Rename for interpretability.
X_train, y_train = sentence2index(train_data_), train_label
X_val,   y_val   = sentence2index(val_data_),   val_label
# test_data_ is allowed to be None when no test table is available; keep
# X_test as None in that case instead of crashing while iterating over None,
# so that a later `X_test is not None` check can skip prediction cleanly.
X_test = sentence2index(test_data_) if test_data_ is not None else None

In [11]:
# Report how many CUDA devices PyTorch can see.
print(torch.cuda.device_count())
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

1


In [12]:
class LSTMClassifier(nn.Module):
    """Many-to-one LSTM classifier: embedding -> LSTM -> linear -> log-softmax.

    Args:
        embedding_dim: Size of each token embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        vocab_size: Number of distinct token ids the embedding must cover.
        tagset_size: Number of output classes.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        # Hidden-state width; a free hyperparameter that does not affect the
        # shape of the model output.
        self.hidden_dim = hidden_dim
        # Maps each input token id to a dense vector.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Single LSTM layer; batch_first=True means inputs are (batch, seq, feature).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Projects the final hidden state onto the class scores.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        # Log-softmax over dim=1, the class axis of a (batch, classes) tensor.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sentence):
        """Return per-class log-probabilities for a batch of token-id sequences.

        Args:
            sentence: Integer tensor of token ids, shaped (batch, seq_len).

        Returns:
            Tensor of shape (batch, tagset_size) holding log-probabilities.
        """
        embeds = self.word_embeddings(sentence)
        # Many-to-one task: ignore the per-step outputs and keep only the
        # final hidden state h_n, shaped (num_layers, batch, hidden_dim).
        _, (h_n, _) = self.lstm(embeds)
        # Select the layer axis explicitly instead of a blanket squeeze():
        # squeeze() also removes the batch axis when the batch holds a single
        # sequence, which then breaks the dim=1 log-softmax below.
        tag_space = self.hidden2tag(h_n[-1])
        tag_scores = self.softmax(tag_space)
        return tag_scores
    

In [16]:
# Dimensionality of each token embedding vector
EMBEDDING_DIM = 10
# Dimensionality of the LSTM hidden state
HIDDEN_DIM = 128
# Number of distinct tokens in the vocabulary
VOCAB_SIZE = len(word2index)
# Number of target classes
TAG_SIZE = 2
# Instantiate the model and move it to the selected device
model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, TAG_SIZE).to(device)
# NLLLoss is the loss to pair with a model that ends in LogSoftmax
loss_function = nn.NLLLoss()
# Optimise with Adam
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Intended to hold the summed loss of each epoch
losses = []

# Training schedule.
N_EPOCHS = 27              # maximum number of passes over the training data
EVAL_AFTER_EPOCH = 20      # validation metrics are printed only for epochs > this
LOSS_STOP_THRESHOLD = 0.1  # stop early once the summed epoch loss falls below this

# Train for up to N_EPOCHS epochs, re-shuffling the mini-batches every epoch.
for epoch in range(N_EPOCHS):

    all_loss = 0
    title_batch, category_batch = train2batch(X_train, y_train)
    for i in range(len(title_batch)):
        model.zero_grad()
        # Build the tensors directly on the target device.
        title_tensor = torch.tensor(title_batch[i], device=device)
        # Flatten the labels to shape (batch,). reshape(-1) keeps a
        # one-element batch 1-D, whereas squeeze() would make it 0-d.
        category_tensor = torch.tensor(category_batch[i], device=device).reshape(-1)

        out = model(title_tensor)
        batch_loss = loss_function(out, category_tensor)
        batch_loss.backward()
        optimizer.step()
        all_loss += batch_loss.item()
    # Keep the per-epoch loss history.
    losses.append(all_loss)
    print("epoch", epoch, "\t" , "loss", all_loss)

    if epoch > EVAL_AFTER_EPOCH:
        test_num = len(X_val)
        # Number of correct predictions.
        a = 0
        # No gradients are needed for evaluation.
        with torch.no_grad():
            title_batch, category_batch = train2batch(X_val, y_val)
            allprob = []
            for i in range(len(title_batch)):
                title_tensor = torch.tensor(title_batch[i], device=device)
                category_tensor = torch.tensor(category_batch[i], device=device).reshape(-1)
                out = model(title_tensor)
                _, predicts = torch.max(out, 1)
                # exp(log-probability of class 1) = predicted probability of class 1.
                allprob.append(torch.exp(out[:, 1]))
                # Count the matches for the whole batch in one tensor operation
                # instead of comparing element by element.
                a += (predicts == category_tensor).sum().item()
        print( a / test_num)
        prob_1 = torch.cat(allprob, 0)
        # category_batch holds the labels in the same shuffled order as allprob.
        y_val_batch = np.concatenate(category_batch, axis=0)
        auc = roc_auc_score(y_val_batch, prob_1.cpu().numpy())
        print(auc)

    if all_loss < LOSS_STOP_THRESHOLD: break
print("done.")

epoch 0 	 loss 59.9971039891243
epoch 1 	 loss 56.482780396938324
epoch 2 	 loss 55.3305498957634
epoch 3 	 loss 52.67639145255089
epoch 4 	 loss 48.47841390967369
epoch 5 	 loss 47.09657695889473
epoch 6 	 loss 46.39263787865639
epoch 7 	 loss 45.89206862449646
epoch 8 	 loss 45.33062794804573
epoch 9 	 loss 44.26288402080536
epoch 10 	 loss 43.55077689886093
epoch 11 	 loss 42.96271473169327
epoch 12 	 loss 42.51873764395714
epoch 13 	 loss 42.134491711854935
epoch 14 	 loss 41.76576465368271
epoch 15 	 loss 41.58200490474701
epoch 16 	 loss 41.36629980802536
epoch 17 	 loss 41.18223479390144
epoch 18 	 loss 40.99389007687569
epoch 19 	 loss 40.99745041131973
epoch 20 	 loss 40.76394349336624
0.8331262091484136
0.8569548164306435
epoch 21 	 loss 40.59872505068779
0.835273736564648
0.8571294966193426
epoch 22 	 loss 40.52067109942436
0.8352338196981752
0.8578447450121459
epoch 23 	 loss 40.45892861485481
0.8359549844191164
0.8579443161445832
epoch 24 	 loss 40.28854137659073
0.8333630

In [20]:
###### 4. prediction for test dataset ######

if (test_df is not None) and (X_test is not None):
    # Number of windows scored per forward pass.
    PREDICT_BATCH_SIZE = 10000

    # Score the test windows in their original order, without gradients.
    with torch.no_grad():
        allprob = []
        for start in range(0, len(X_test), PREDICT_BATCH_SIZE):
            title_tensor = torch.tensor(X_test[start:start + PREDICT_BATCH_SIZE], device=device)
            out = model(title_tensor)
            # exp(log-probability of class 1) = predicted probability of class 1.
            allprob.append(torch.exp(out[:, 1]))
    prob_1 = torch.cat(allprob, 0)
    predicted = prob_1.cpu().numpy()

    # One output row per residue, in the same row/residue order in which the
    # test windows are generated (row by row, residue by residue).
    sequence_id_list    = []
    residue_number_list = []
    for sequence_id, sequence in zip(test_df["sequence_id"], test_df["sequence"]):
        sequence_id_list.extend([sequence_id] * len(sequence))
        residue_number_list.extend(range(1, len(sequence) + 1))  # 0-origin to 1-origin
    predicted_df = pd.DataFrame.from_dict({
        "sequence_id": sequence_id_list,
        "residue_number": residue_number_list,
        "predicted_value": predicted,
        })

    predicted_df.to_csv('output.csv', index=False)

In [22]:
# NOTE(review): google.colab is a Colab-specific module, so this cell is
# presumably only runnable inside a Google Colab runtime.
from google.colab import files
# Pass 'output.csv' to the Colab download helper.
files.download('output.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [23]:
# Accuracy of the trained model on the validation windows.
test_num = len(X_val)
# Number of correct predictions.
a = 0
# No gradients are needed for evaluation.
with torch.no_grad():
    title_batch, category_batch = train2batch(X_val, y_val)
    allprob = []
    for i in range(len(title_batch)):
        title_tensor = torch.tensor(title_batch[i], device=device)
        category_tensor = torch.tensor(category_batch[i], device=device).reshape(-1)
        out = model(title_tensor)
        _, predicts = torch.max(out, 1)
        # exp(log-probability of class 1) = predicted probability of class 1.
        allprob.append(torch.exp(out[:, 1]))
        # Count the matches for the whole batch in one tensor operation
        # instead of comparing element by element.
        a += (predicts == category_tensor).sum().item()
print( a / test_num)

0.8365058371764406


In [24]:
# ROC AUC on the validation set, from the per-batch class-1 probabilities.
prob_1 = torch.cat(allprob, 0)
# The labels are concatenated in the same batch order as the probabilities.
y_val_batch = np.concatenate(category_batch, axis=0)
val_scores = prob_1.to('cpu').detach().numpy().copy()
auc = roc_auc_score(y_val_batch, val_scores)
print(auc)

0.8590307932335519
