## Setting the environment for Colab

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# %cd "/content/drive/My Drive/Colab Notebooks/w266_final/project_re"

/content/drive/.shortcut-targets-by-id/1I3W7Z7rz_YfsjBjX7z4zTzGbr-iKTrmD/w266_final/project_re


In [3]:
# !pip install transformers



In [1]:
%reload_ext autoreload
%matplotlib inline
import logging
import time
from platform import python_version
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from sklearn.metrics import roc_auc_score
from torch.autograd import Variable

In [2]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

In [4]:
df_train = pd.read_csv('data_divided/train.tsv', sep ="\t", header=None)
df_train = df_train.rename(columns={0: "id", 1: "relation_code", 2: "alpha", 3:"string"})
df_train = df_train[['id', 'string', 'relation_code' ]]
df_train = df_train.dropna(subset=['string'])
df_train = pd.get_dummies(df_train, columns = ['relation_code'])
df_train = df_train.rename(columns={"relation_code_0": "reason", 
                                    "relation_code_1": "route", 
                                    "relation_code_2": "strength", 
                                    "relation_code_3": "frequency", 
                                    "relation_code_4": "duration", 
                                    "relation_code_5": "form",
                                    "relation_code_6": "dosage",
                                    "relation_code_7": "ade",
                                    "relation_code_8": "no_relation"})

print(len(df_train))

61012


In [5]:
df_train.head()

Unnamed: 0,id,string,reason,route,strength,frequency,duration,form,dosage,ade,no_relation
0,0,He also may have SUB_B recurrent seizures SUB_...,1,0,0,0,0,0,0,0,0
1,1,He also may have recurrent seizures which shou...,0,1,0,0,0,0,0,0,0
2,2,He also may have recurrent seizures which shou...,0,1,0,0,0,0,0,0,0
3,3,-patient will be on OBJ_B Topiramate OBJ_E SUB...,0,0,1,0,0,0,0,0,0
4,4,-patient will be on OBJ_B Topiramate OBJ_E 25m...,0,1,0,0,0,0,0,0,0


In [6]:
df_val = pd.read_csv('data_divided/dev.tsv', sep ="\t", header=None)
df_val = df_val.rename(columns={0: "id", 1: "relation_code", 2: "alpha", 3:"string"})
df_val = df_val[['id', 'string', 'relation_code' ]]
df_val = df_val.dropna(subset=['string'])
df_val = pd.get_dummies(df_val, columns = ['relation_code'])
df_val = df_val.rename(columns={"relation_code_0": "reason", 
                                    "relation_code_1": "route", 
                                    "relation_code_2": "strength", 
                                    "relation_code_3": "frequency", 
                                    "relation_code_4": "duration", 
                                    "relation_code_5": "form",
                                    "relation_code_6": "dosage",
                                    "relation_code_7": "ade",
                                    "relation_code_8": "no_relation"})

print(len(df_val))

15303


In [7]:
#df_val.head()

In [8]:
df_test = pd.read_csv('data_divided/test.tsv', sep ="\t", header=None)
len(df_test)

1000

In [9]:
df_test = df_test.rename(columns={0: "id", 1: "relation_code", 2: "alpha", 3:"string"})
df_test = df_test[['id', 'string', 'relation_code' ]]
df_test = df_test.dropna(subset=['string'])
df_test = pd.get_dummies(df_test, columns = ['relation_code'])
df_test = df_test.rename(columns={"relation_code_0": "reason", 
                                    "relation_code_1": "route", 
                                    "relation_code_2": "strength", 
                                    "relation_code_3": "frequency", 
                                    "relation_code_4": "duration", 
                                    "relation_code_5": "form",
                                    "relation_code_6": "dosage",
                                    "relation_code_7": "ade",
                                    "relation_code_8": "no_relation"})


print(len(df_test))

1000


In [10]:
#df_test.head()

In [11]:
target_columns = ["reason", "route", "strength", "frequency", "duration", "form", "dosage", "ade", "no_relation"]

## BERT

In [12]:
model_class = transformers.BertModel
tokenizer_class = transformers.BertTokenizer
pretrained_weights='gsarti/biobert-nli'
# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
bert_model = model_class.from_pretrained(pretrained_weights).to(device)
#bert_model = model_class.from_pretrained(pretrained_weights)

In [13]:
max_seq = 256
def tokenize_text(df, max_seq):
    return [
        tokenizer.encode(text, add_special_tokens=True)[:max_seq] for text in df.string.values
    ]
def pad_text(tokenized_text, max_seq):
    return torch.Tensor(np.array([el + [0] * (max_seq - len(el)) for el in tokenized_text])).type(torch.LongTensor).to(device)
def tokenize_and_pad_text(df, max_seq):
    tokenized_text = tokenize_text(df, max_seq)
    padded_text = pad_text(tokenized_text, max_seq)
    return torch.tensor(padded_text)
def targets_to_tensor(df, target_columns):
    return torch.tensor(df[target_columns].values, dtype=torch.float32)


In [None]:
train_indices = tokenize_and_pad_text(df_train, max_seq)
print(type(train_indices))

val_indices = tokenize_and_pad_text(df_val, max_seq)
test_indices = tokenize_and_pad_text(df_test, max_seq)
with torch.no_grad():
    x_train = bert_model(train_indices)[0]  
    x_val = bert_model(val_indices)[0]
    x_test = bert_model(test_indices)[0]
y_train = targets_to_tensor(df_train, target_columns)
y_val = targets_to_tensor(df_val, target_columns)
y_test = targets_to_tensor(df_test, target_columns)

  return torch.tensor(padded_text)


<class 'torch.Tensor'>


In [None]:
#df_train['string'].isnull().values.sum()

In [None]:
x_train[0]

In [None]:
y_train[0]

In [20]:
class KimCNN(nn.Module):
    def __init__(self, embed_num, embed_dim, class_num, kernel_num, kernel_sizes, dropout, static):
        super(KimCNN, self).__init__()
        V = embed_num
        D = embed_dim
        C = class_num
        Co = kernel_num
        Ks = kernel_sizes
        
        self.static = static
        self.embed = nn.Embedding(V, D)
        self.convs1 = nn.ModuleList([nn.Conv2d(1, Co, (K, D)) for K in Ks])
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(Ks) * Co, C)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        if self.static:
            x = Variable(x)
        print('shape1: ', x.shape)
        x = x.unsqueeze(1)  # (N, Ci, W, D)
        print('shape2: ', x.shape)
        x = [F.relu(conv(x)).squeeze(3).to(device) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)
        shapes =  [y.shape for y in x]
        print('shape after Relu: ',shapes)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)
        shapes =  [y.shape for y in x]
        print('shape after maxpool: ',shapes)
        x = torch.cat(x, 1)
        print('shape after concat: ',x.shape)
        x = self.dropout(x)  # (N, len(Ks)*Co)
        logit = self.fc1(x)  # (N, C)
        output = self.sigmoid(logit)
        return output

In [21]:
embed_num = x_train.shape[1]
embed_dim = x_train.shape[2]
class_num = y_train.shape[1]
kernel_num = 3
kernel_sizes = [2, 3, 4]
dropout = 0.5
static = True
model = KimCNN(
    embed_num=embed_num,
    embed_dim=embed_dim,
    class_num=class_num,
    kernel_num=kernel_num,
    kernel_sizes=kernel_sizes,
    dropout=dropout,
    static=static,
)

In [22]:
n_epochs = 5
batch_size = 12
lr = 0.00001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.BCELoss()

In [23]:
def generate_batch_data(x, y, batch_size):
    i, batch = 0, 0
    for batch, i in enumerate(range(0, len(x) - batch_size, batch_size), 1):
        x_batch = x[i : i + batch_size]
        y_batch = y[i : i + batch_size]
        yield x_batch.to(device), y_batch.to(device), batch
    if i + batch_size < len(x):
        yield x[i + batch_size :].to(device), y[i + batch_size :].to(device), batch + 1
    if batch == 0:
        yield x.to(device), y.to(device), 1

#generate_batch_data

In [24]:
train_losses, val_losses = [], []
for epoch in range(n_epochs):
  start_time = time.time()
  train_loss = 0
  model.train(True).to(device)
  for x_batch, y_batch, batch in generate_batch_data(x_train, y_train, batch_size):
    #print(type(x_batch))
    y_pred = model(x_batch).to(device)
    #print(type(y_pred))
    optimizer.zero_grad()
    loss = loss_fn(y_pred, y_batch)
    loss.backward()
    optimizer.step()
    train_loss += loss.item()
  train_loss /= batch
  train_losses.append(train_loss)
  elapsed = time.time() - start_time
  
  model.eval() # disable dropout for deterministic output
# deactivate autograd engine to reduce memory usage and speed up computations
  with torch.no_grad():
      val_loss, batch = 0, 1
      for x_batch, y_batch, batch in generate_batch_data(x_val, y_val, batch_size):
          y_pred = model(x_batch)
          loss = loss_fn(y_pred, y_batch).to(device)
          val_loss += loss.item()
      val_loss /= batch
      val_losses.append(val_loss)
      print(
          "Epoch %d Train loss: %.2f. Validation loss: %.2f. Elapsed time: %.2fs."
          % (epoch + 1, train_losses[-1], val_losses[-1], elapsed)
      )


Epoch 1 Train loss: 0.71. Validation loss: 0.69. Elapsed time: 0.35s.
Epoch 2 Train loss: 0.69. Validation loss: 0.66. Elapsed time: 0.36s.
Epoch 3 Train loss: 0.67. Validation loss: 0.65. Elapsed time: 0.35s.
Epoch 4 Train loss: 0.67. Validation loss: 0.64. Elapsed time: 0.34s.
Epoch 5 Train loss: 0.66. Validation loss: 0.64. Elapsed time: 0.34s.
