In [None]:
# Mount Google Drive at /content/drive. Every data path in this notebook is the
# relative 'drive/MyDrive/...', which (from Colab's /content working directory)
# only resolves when the mount point is /content/drive, not /content/gdrive.
from google.colab import drive
drive.mount("/content/drive")

# **Setup**

---



In [None]:
!pip install transformers >> /dev/null

In [None]:
import pandas as pd
import numpy as np

import tensorflow as tf
import torch

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

import re
import pickle
from tqdm import tqdm, trange
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score

from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from keras.preprocessing.sequence import pad_sequences

In [None]:
# Abort early unless TensorFlow reports the first GPU device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

In [None]:
# Prefer CUDA when PyTorch can see it; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of CUDA devices visible to PyTorch.
n_gpu = torch.cuda.device_count()
# Cell output: the name of CUDA device 0.
torch.cuda.get_device_name(0)

In [None]:
# Decision cut-off used by the evaluation cells: a label counts as predicted
# when its score is strictly greater than this value.
threshold = 0.20

# **D-subtask** *English*: **Data Loader**

X_train, X_val, X_test: list of *input text data*

Y_train, Y_val, Y_test: list of one-hot encoded *labels*

---




In [None]:
# Each split is a header-less TSV; columns 0 and 1 are renamed to "Id" and "ICD10".
df_train = pd.read_csv(
    'drive/MyDrive/CodiEsp/train/trainD.tsv', sep = '\t', header = None
).rename(columns = {0:"Id", 1:"ICD10"})
print("Training Data:")
display(df_train.head())

print("\n\nValidation Data:")
df_val = pd.read_csv(
    'drive/MyDrive/CodiEsp/dev/devD.tsv', sep = '\t', header = None
).rename(columns = {0:"Id", 1:"ICD10"})
display(df_val.head())

print("\n\nTest Data:")
df_test = pd.read_csv(
    'drive/MyDrive/CodiEsp/test/testD.tsv', sep = '\t', header = None
).rename(columns = {0:"Id", 1:"ICD10"})
display(df_test.head())

# All three splits stacked; used to derive the shared id and code vocabularies.
df = pd.concat([df_train, df_val, df_test])

In [None]:
# Unique document ids and ICD-10 codes over the concatenated train/dev/test frame.
ids = df['Id'].unique()
codes = df['ICD10'].unique()

# `ids` spans all three splits, so the label says so (it used to claim "training data").
print("Number of documents (train + dev + test):", len(ids),
      "\nNumber of ICD10 codes:", len(codes))

In [None]:
# Fixed column index for every ICD-10 code in the multi-hot label vectors.
code2idx = {code: idx for idx, code in enumerate(codes)}

# Every document starts with an all-zero vector holding one slot per code.
id2label = {doc_id: [0]*len(codes) for doc_id in ids}

# Switch on the slot of each annotated (document, code) pair.
# Columns are read by name: integer keys on a label-indexed row (data[0], data[1])
# rely on positional fallback, which pandas has deprecated.
for _id, _code in zip(df['Id'], df['ICD10']):
  id2label[_id][code2idx[_code]] = 1

# Parallel tuples of document ids and their label vectors, in dict insertion order.
_id2label = list(id2label.items())
ID, Y = zip(*_id2label)

In [None]:
def remstopwords(text, stopwords):
    """Clean a note and drop stopwords.

    Steps, in order: remove ``[** ... **]`` spans, remove ``<...>`` tags,
    lowercase and collapse every run of non-word characters to one space,
    remove digit runs that directly follow a space, then drop every
    whitespace-separated token found in ``stopwords``.

    Args:
        text: Raw note text.
        stopwords: Iterable of tokens to discard (compared after lowercasing
            of ``text``; the stopwords themselves are used as given).

    Returns:
        The remaining tokens joined by single spaces.
    """
    # A set makes each membership test O(1) instead of scanning a list per token.
    stop_set = set(stopwords)
    # Raw strings: the patterns are unchanged, but the backslashes are no longer
    # invalid string escapes (which newer Python versions warn about).
    text = re.sub(r'\[\*\*[^\]]*\*\*\]', '', text)
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'[\W]+', ' ', text.lower())
    text = re.sub(r" \d+", " ", text)
    return " ".join(tok for tok in text.split() if tok not in stop_set)

# English stopword list from the NLTK `stopwords` corpus (downloaded in the import cell).
stop_words = stopwords.words('english')

In [None]:
X_train, Y_train = [], []

# One cleaned note and one label vector per unique training document id.
for doc_id in df_train['Id'].unique():
  Y_train.append(id2label[doc_id])

  note_path = 'drive/MyDrive/CodiEsp/train/text_files_en/' + doc_id + '.txt'
  with open(note_path, 'r') as note_file:
    text = note_file.read().replace('\n', ' ')
  X_train.append(remstopwords(text.lower(), stop_words))

# Store the cleaned notes on Drive, then read the stored copy back.
with open("drive/MyDrive/X_train.txt", "wb") as cache_out:
  pickle.dump(X_train, cache_out)
with open("drive/MyDrive/X_train.txt", "rb") as cache_in:
  X_train = pickle.load(cache_in)

In [None]:
X_val, Y_val = [], []

# One cleaned note and one label vector per unique validation document id.
for doc_id in df_val['Id'].unique():
  Y_val.append(id2label[doc_id])

  note_path = 'drive/MyDrive/CodiEsp/dev/text_files_en/' + doc_id + '.txt'
  with open(note_path, 'r') as note_file:
    text = note_file.read().replace('\n', ' ')
  X_val.append(remstopwords(text.lower(), stop_words))

# Store the cleaned notes on Drive, then read the stored copy back.
with open("drive/MyDrive/X_val.txt", "wb") as cache_out:
  pickle.dump(X_val, cache_out)
with open("drive/MyDrive/X_val.txt", "rb") as cache_in:
  X_val = pickle.load(cache_in)

In [None]:
X_test, Y_test = [], []

# One cleaned note and one label vector per unique test document id.
for doc_id in df_test['Id'].unique():
  Y_test.append(id2label[doc_id])

  note_path = 'drive/MyDrive/CodiEsp/test/text_files_en/' + doc_id + '.txt'
  with open(note_path, 'r') as note_file:
    text = note_file.read().replace('\n', ' ')
  X_test.append(remstopwords(text.lower(), stop_words))

# Store the cleaned notes on Drive, then read the stored copy back.
with open("drive/MyDrive/X_test.txt", "wb") as cache_out:
  pickle.dump(X_test, cache_out)
with open("drive/MyDrive/X_test.txt", "rb") as cache_in:
  X_test = pickle.load(cache_in)

In [None]:
# p_code[k] becomes 1 once code k is seen on at least one training document.
p_code = [0]*len(codes)
for label_vector in Y_train:
  for idx, flag in enumerate(label_vector):
    if flag == 1:
      p_code[idx] = 1

# Codes that occur only in the dev/test annotations.
not_present = sum(1 for present in p_code if present == 0)

print("Number of classes NOT PRESENT in training dataset:", not_present)

# **D-subtask** *English*: **BERT Model**

---

In [None]:
from transformers import AdamW
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
# Tokenizer for the uncased BERT base checkpoint, with lowercasing requested explicitly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
# Sequence-classification model sized to one output per entry in `codes`.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = len(codes))

In [None]:
# Tokenize each split separately, truncating to at most 512 tokens and padding
# (padding = True) so that all sequences within a split have equal length.
encodings_train = tokenizer.batch_encode_plus(X_train, max_length = 512, padding = True, truncation = True)
encodings_val = tokenizer.batch_encode_plus(X_val, max_length = 512, padding = True, truncation = True)
encodings_test = tokenizer.batch_encode_plus(X_test, max_length = 512, padding = True, truncation = True)

# Show which fields the tokenizer produced.
print('tokenizer outputs: ', encodings_train.keys())

In [None]:
# Convert all of our data into torch tensors, the required datatype for our model.
# For each split: token ids, multi-hot labels, attention mask and token type ids.
train_inputs = torch.tensor(encodings_train['input_ids'])
train_labels = torch.tensor(Y_train)
train_masks = torch.tensor(encodings_train['attention_mask'])
train_token_types = torch.tensor(encodings_train['token_type_ids'])

val_inputs = torch.tensor(encodings_val['input_ids'])
val_labels = torch.tensor(Y_val)
val_masks = torch.tensor(encodings_val['attention_mask'])
val_token_types = torch.tensor(encodings_val['token_type_ids'])

test_inputs = torch.tensor(encodings_test['input_ids'])
test_labels = torch.tensor(Y_test)
test_masks = torch.tensor(encodings_test['attention_mask'])
test_token_types = torch.tensor(encodings_test['token_type_ids'])

In [None]:
# Mini-batch size shared by the three loaders below.
batch_size = 6

# Training batches are drawn in random order (RandomSampler).
# Each dataset item is (input ids, attention mask, labels, token type ids).
train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size = batch_size)

# Validation batches keep dataset order (SequentialSampler).
val_data = TensorDataset(val_inputs, val_masks, val_labels, val_token_types)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size = batch_size)

# Test batches keep dataset order (SequentialSampler).
test_data = TensorDataset(test_inputs, test_masks, test_labels, test_token_types)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size = batch_size)

In [None]:
# Move the model to the device chosen in the setup cell. Unlike model.cuda(),
# .to(device) also works when `device` is the CPU fallback.
model.to(device)

In [None]:
# NOTE(review): the optimizer receives every model parameter; the encoder is
# frozen only in the following cell, so frozen parameters stay registered here
# but never get gradients — confirm this is intended.
optimizer = AdamW(model.parameters(), lr = 2e-5)  # Default optimization

In [None]:
# Freeze the pretrained encoder (model.base_model): its parameters no longer
# require gradients, so only parameters outside it can be updated.
for param in model.base_model.parameters():
    param.requires_grad = False

In [None]:
def hamming_score(y_true, y_pred):
    '''Mean per-sample Jaccard overlap between true and predicted label sets
    (the "Hamming score" / label-based accuracy for multi-label output).
    http://stackoverflow.com/q/32239577/395857

    Both arguments are indexed row by row; the non-zero positions of a row
    form that sample's label set. A sample whose true and predicted sets are
    both empty scores 1.
    '''

    def _row_score(row_idx):
        truth = set(np.where(y_true[row_idx])[0])
        guess = set(np.where(y_pred[row_idx])[0])
        union = truth | guess
        if not union:
            # Nothing expected and nothing predicted: perfect agreement.
            return 1
        return len(truth & guess)/float(len(union))

    return np.mean([_row_score(row_idx) for row_idx in range(y_true.shape[0])])

In [None]:
# Store our loss and accuracy for plotting
train_loss_set = []
max_val_f1score = 0.0

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# Multi-label objective applied to the raw logits. Built once here instead of
# being re-created on every training step.
loss_func = BCEWithLogitsLoss()

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc = "Epoch"):

  # Training

  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0 #running loss
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    batch = tuple(t.to(device) for t in batch) # Add batch to the compute device
    b_input_ids, b_input_mask, b_labels, b_token_types = batch # Unpack the inputs from our dataloader
    optimizer.zero_grad() # Clear out the gradients (by default they accumulate)

    # Forward pass for multilabel classification (token type ids are not passed)
    outputs = model(b_input_ids, token_type_ids = None, attention_mask = b_input_mask)
    logits = outputs[0]

    # Labels are cast to the logits' dtype because the loss needs float targets
    loss = loss_func(logits.view(-1, len(codes)), b_labels.type_as(logits).view(-1, len(codes)))
    batch_loss = loss.item()
    train_loss_set.append(batch_loss)

    loss.backward()
    optimizer.step()

    # Update tracking variables
    tr_loss += batch_loss
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("\nTrain loss: {}".format(tr_loss/nb_tr_steps))

###############################################################################

  # Validation

  # Put model in evaluation mode to score the validation set
  model.eval()

  # Per-batch ground truth and predicted probabilities. Only CPU copies are
  # kept; the input batches and raw logits are not retained since nothing
  # reads them afterwards.
  true_labels, pred_labels = [], []

  for i, batch in enumerate(val_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels, b_token_types = batch

    with torch.no_grad():
      outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      pred_label = torch.sigmoid(outs[0])  # logits -> per-label probabilities

    true_labels.append(b_labels.to('cpu').numpy())
    pred_labels.append(pred_label.to('cpu').numpy())

  # Flatten outputs: one array per document instead of one per batch
  pred_labels = [item for sublist in pred_labels for item in sublist]
  true_labels = [item for sublist in true_labels for item in sublist]

  # Calculate Accuracy
  pred_bools = [pl > threshold for pl in pred_labels]
  true_bools = [tl == 1 for tl in true_labels]

  val_f1_accuracy = f1_score(true_bools, pred_bools,average = 'macro')
  val_flat_accuracy = accuracy_score(true_bools, pred_bools)
  val_hamming_score = hamming_score(np.array(true_bools), np.array(pred_bools))

  # if (max_val_f1score <= val_f1_accuracy):
  #   torch.save(model.state_dict(), 'drive/MyDrive/BEST_BertModel.pt')

  # Keep the running best macro-F1 up to date (it was initialised but never updated).
  max_val_f1score = max(max_val_f1score, val_f1_accuracy)

  print('\nF1 Validation Accuracy: ', val_f1_accuracy)
  print('Flat Validation Accuracy: ', val_flat_accuracy)
  print("Validation Hamming Score ", val_hamming_score)

In [None]:
# Persist the weights as they stand after the training cell has finished.
# NOTE(review): despite the "BEST" file name, no best-epoch selection is done
# before this save — confirm that saving the last epoch is intended.
torch.save(model.state_dict(), 'drive/MyDrive/BEST_BertModelD.pt')

In [None]:
# Reload the saved weights onto the active device (map_location keeps the load
# working even if the checkpoint was written from a different device).
model.load_state_dict(torch.load('drive/MyDrive/BEST_BertModelD.pt', map_location = device))
model.eval()

# Per-batch ground truth and predicted probabilities (CPU copies only; the
# input batches and raw logits are not retained since nothing reads them).
true_labels, pred_labels = [], []

for i, batch in enumerate(test_dataloader):
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels, b_token_types = batch

  with torch.no_grad():
    outs = model(b_input_ids, token_type_ids = None, attention_mask = b_input_mask)
    pred_label = torch.sigmoid(outs[0])  # logits -> per-label probabilities

  true_labels.append(b_labels.to('cpu').numpy())
  pred_labels.append(pred_label.to('cpu').numpy())

# Flatten outputs: one array per document instead of one per batch
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]

# Calculate Accuracy
pred_bools = [pl > threshold for pl in pred_labels]
true_bools = [tl == 1 for tl in true_labels]

test_f1_accuracy = f1_score(true_bools, pred_bools,average = 'macro')
test_flat_accuracy = accuracy_score(true_bools, pred_bools)
test_hamming_score = hamming_score(np.array(true_bools), np.array(pred_bools))

print('\nF1 Test Accuracy: ', test_f1_accuracy)
print('Flat Test Accuracy: ', test_flat_accuracy)
print("Test Hamming Score ", test_hamming_score)

# **D-subtask** *English*: **Bio_Clinical BERT Model**

---


In [None]:
def hamming_score(y_true, y_pred):
    '''Mean per-sample Jaccard overlap between true and predicted label sets
    (the "Hamming score" / label-based accuracy for multi-label output).
    http://stackoverflow.com/q/32239577/395857

    Both arguments are indexed row by row; the non-zero positions of a row
    form that sample's label set. A sample whose true and predicted sets are
    both empty scores 1.
    '''

    def _row_score(row_idx):
        truth = set(np.where(y_true[row_idx])[0])
        guess = set(np.where(y_pred[row_idx])[0])
        union = truth | guess
        if not union:
            # Nothing expected and nothing predicted: perfect agreement.
            return 1
        return len(truth & guess)/float(len(union))

    return np.mean([_row_score(row_idx) for row_idx in range(y_true.shape[0])])

In [None]:
from transformers import AdamW
from transformers import AutoTokenizer, AutoModel

In [None]:
# Cell output: number of training documents currently held in X_train.
len(X_train)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Pad every sequence to exactly max_length (512) rather than to the longest
# note of the split: the classifier head in this section flattens the encoder
# output into a fixed 768*512 vector, so a split whose longest note is shorter
# than 512 tokens would otherwise produce inputs of the wrong size.
encodings_train = tokenizer.batch_encode_plus(X_train, max_length = 512, padding = 'max_length', truncation = True)
encodings_val = tokenizer.batch_encode_plus(X_val, max_length = 512, padding = 'max_length', truncation = True)
encodings_test = tokenizer.batch_encode_plus(X_test, max_length = 512, padding = 'max_length', truncation = True)

In [None]:
# Convert all of our data into torch tensors, the required datatype for our model.
# Only token ids, attention masks and labels are used in this section; the
# token-type tensors are left disabled below.
train_inputs = torch.tensor(encodings_train['input_ids'])
train_labels = torch.tensor(Y_train)
train_masks = torch.tensor(encodings_train['attention_mask'])
# train_token_types = torch.tensor(encodings_train['token_type_ids'])

val_inputs = torch.tensor(encodings_val['input_ids'])
val_labels = torch.tensor(Y_val)
val_masks = torch.tensor(encodings_val['attention_mask'])
# val_token_types = torch.tensor(encodings_val['token_type_ids'])

test_inputs = torch.tensor(encodings_test['input_ids'])
test_labels = torch.tensor(Y_test)
test_masks = torch.tensor(encodings_test['attention_mask'])
# test_token_types = torch.tensor(encodings_test['token_type_ids'])

In [None]:
# Mini-batch size shared by the three loaders below.
batch_size = 5

# Each dataset item is (input ids, attention mask, labels).
# Training batches are drawn in random order (RandomSampler).
# train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = batch_size)

# Validation batches keep dataset order (SequentialSampler).
# val_data = TensorDataset(val_inputs, val_masks, val_labels, val_token_types)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size = batch_size)

# Test batches keep dataset order (SequentialSampler).
# test_data = TensorDataset(test_inputs, test_masks, test_labels, test_token_types)
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = batch_size)

In [None]:
class BioBERT(torch.nn.Module):
  """Frozen Bio_ClinicalBERT encoder followed by a two-layer multi-label head.

  The head flattens the encoder's first output into one vector per example,
  and `linear_1` expects that vector to have 768*512 features.
  `forward` returns per-code probabilities: the sigmoid is already applied,
  so the result must not be fed to a loss that applies its own sigmoid.
  """

  def __init__(self):
    super(BioBERT, self).__init__()
    self.model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
    # 768*512 input features — presumably hidden size 768 times a sequence
    # length of 512; inputs of any other padded length will not fit this layer.
    self.linear_1 = torch.nn.Linear(768*512, 512)
    self.linear_2 = torch.nn.Linear(512, len(codes))

    # Freeze the encoder; only the two linear layers stay trainable.
    for param in self.model.base_model.parameters():
      param.requires_grad = False

  def forward(self, ids, mask):
    """Return per-code probabilities for a batch.

    Args:
      ids: token-id tensor whose first dimension is the batch.
      mask: attention mask passed to the encoder as `attention_mask`.
    """
    output = self.model(ids, attention_mask = mask)
    # Flatten per example using the size of THIS batch. The previous code used
    # the notebook-level `batch_size`, which breaks on a shorter final batch.
    flat = output[0].reshape(ids.size(0), -1)
    linear_output_1 = torch.relu(self.linear_1(flat))
    linear_output_2 = torch.sigmoid(self.linear_2(linear_output_1))

    return linear_output_2

In [None]:
model = BioBERT()
# Use the shared `device` (the same one every batch is moved to) rather than a
# hard-coded "cuda", so model and data always live on the same device.
model.to(device)
# criterion = BCEWithLogitsLoss()
# Optimise only parameters that still require gradients (the frozen encoder is skipped).
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))

In [None]:
# Store our loss and accuracy for plotting
train_loss_set = []
max_val_f1score = 0.0

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# BioBERT.forward already ends in a sigmoid, so the model emits probabilities.
# BCELoss consumes probabilities directly; the previously used
# BCEWithLogitsLoss applied a second sigmoid on top of them.
# Built once instead of on every step.
loss_func = BCELoss()

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc = "Epoch"):

  # Training

  # NOTE(review): switching to training mode is left disabled, as in the
  # original cell — confirm whether that is intentional.
  # model.train()

  # Tracking variables
  tr_loss = 0 #running loss
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    batch = tuple(t.to(device) for t in batch) # Add batch to the compute device
    b_input_ids, b_input_mask, b_labels = batch # Unpack the inputs from our dataloader
    optimizer.zero_grad() # Clear out the gradients (by default they accumulate)
    output = model(b_input_ids, b_input_mask) # Forward pass: per-code probabilities

    # Labels are cast to the output's dtype because the loss needs float targets
    loss = loss_func(output.view(-1, len(codes)), b_labels.type_as(output).view(-1, len(codes)))
    batch_loss = loss.item()
    train_loss_set.append(batch_loss)

    loss.backward()
    optimizer.step()

    # Update tracking variables
    tr_loss += batch_loss
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("\nTrain loss: {}".format(tr_loss/nb_tr_steps))

In [None]:
  # Validation (runs once, after the training cell has finished)

  # Put model in evaluation mode to score the validation set
  model.eval()

  # Variables to gather full output
  logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []

  for i, batch in enumerate(val_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # b_input_ids, b_input_mask, b_labels, b_token_types = batch
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
      # The model output is used directly as the prediction score: no sigmoid
      # is applied in this cell.
      b_logit_pred = model(b_input_ids, b_input_mask) # Forward pass for multilabel classification
      pred_label = b_logit_pred

      # Move everything to CPU / NumPy for scikit-learn.
      b_logit_pred = b_logit_pred.detach().cpu().numpy()
      pred_label = pred_label.to('cpu').numpy()
      b_labels = b_labels.to('cpu').numpy()

    tokenized_texts.append(b_input_ids)
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)

  # Flatten outputs: one array per document instead of one per batch
  pred_labels = [item for sublist in pred_labels for item in sublist]
  true_labels = [item for sublist in true_labels for item in sublist]

  # Calculate Accuracy: a label is predicted when its score exceeds `threshold`
  pred_bools = [pl > threshold for pl in pred_labels]
  true_bools = [tl == 1 for tl in true_labels]

  val_f1_accuracy = f1_score(true_bools, pred_bools,average = 'macro')
  val_flat_accuracy = accuracy_score(true_bools, pred_bools)
  val_hamming_score = hamming_score(np.array(true_bools), np.array(pred_bools))

  # if (max_val_f1score <= val_f1_accuracy):
  #   torch.save(model.state_dict(), 'gdrive/MyDrive/BEST_BertModel.pt')

  print('\nF1 Validation Accuracy: ', val_f1_accuracy)
  print('Flat Validation Accuracy: ', val_flat_accuracy)
  print("Validation Hamming Score ", val_hamming_score)

In [None]:
  # Testing

  # Put model in evaluation mode to score the held-out test set
  model.eval()

  # Per-batch ground truth and predicted probabilities (CPU copies only; the
  # input batches are not retained since nothing reads them).
  true_labels, pred_labels = [], []

  for i, batch in enumerate(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
      # The model output is used directly as the prediction score.
      pred_label = model(b_input_ids, b_input_mask) # Forward pass for multilabel classification

    true_labels.append(b_labels.to('cpu').numpy())
    pred_labels.append(pred_label.to('cpu').numpy())

  # Flatten outputs: one array per document instead of one per batch
  pred_labels = [item for sublist in pred_labels for item in sublist]
  true_labels = [item for sublist in true_labels for item in sublist]

  # Calculate Accuracy
  pred_bools = [pl > threshold for pl in pred_labels]
  true_bools = [tl == 1 for tl in true_labels]

  test_f1_accuracy = f1_score(true_bools, pred_bools,average = 'macro')
  test_flat_accuracy = accuracy_score(true_bools, pred_bools)
  test_hamming_score = hamming_score(np.array(true_bools), np.array(pred_bools))

  # Report the TEST metrics computed above. The cell previously printed the
  # stale validation metrics under "Validation" labels.
  print('\nF1 Test Accuracy: ', test_f1_accuracy)
  print('Flat Test Accuracy: ', test_flat_accuracy)
  print("Test Hamming Score ", test_hamming_score)

# **D-subtask** *English*: **BioBERT Model for Discharge Summaries**

In [None]:
def hamming_score(y_true, y_pred):
    '''Mean per-sample Jaccard overlap between true and predicted label sets
    (the "Hamming score" / label-based accuracy for multi-label output).
    http://stackoverflow.com/q/32239577/395857

    Both arguments are indexed row by row; the non-zero positions of a row
    form that sample's label set. A sample whose true and predicted sets are
    both empty scores 1.
    '''

    def _row_score(row_idx):
        truth = set(np.where(y_true[row_idx])[0])
        guess = set(np.where(y_pred[row_idx])[0])
        union = truth | guess
        if not union:
            # Nothing expected and nothing predicted: perfect agreement.
            return 1
        return len(truth & guess)/float(len(union))

    return np.mean([_row_score(row_idx) for row_idx in range(y_true.shape[0])])

In [None]:
from transformers import AdamW
from transformers import AutoTokenizer, AutoModel

In [None]:
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT")

# Pad every sequence to exactly max_length (512) rather than to the longest
# note of the split: the classifier head in this section flattens the encoder
# output into a fixed 768*512 vector, so a split whose longest note is shorter
# than 512 tokens would otherwise produce inputs of the wrong size.
encodings_train = tokenizer.batch_encode_plus(X_train, max_length = 512, padding = 'max_length', truncation = True)
encodings_val = tokenizer.batch_encode_plus(X_val, max_length = 512, padding = 'max_length', truncation = True)
encodings_test = tokenizer.batch_encode_plus(X_test, max_length = 512, padding = 'max_length', truncation = True)

In [None]:
# Convert all of our data into torch tensors, the required datatype for our model.
# Only token ids, attention masks and labels are used in this section; the
# token-type tensors are left disabled below.
train_inputs = torch.tensor(encodings_train['input_ids'])
train_labels = torch.tensor(Y_train)
train_masks = torch.tensor(encodings_train['attention_mask'])
# train_token_types = torch.tensor(encodings_train['token_type_ids'])

val_inputs = torch.tensor(encodings_val['input_ids'])
val_labels = torch.tensor(Y_val)
val_masks = torch.tensor(encodings_val['attention_mask'])
# val_token_types = torch.tensor(encodings_val['token_type_ids'])

test_inputs = torch.tensor(encodings_test['input_ids'])
test_labels = torch.tensor(Y_test)
test_masks = torch.tensor(encodings_test['attention_mask'])
# test_token_types = torch.tensor(encodings_test['token_type_ids'])

In [None]:
# Mini-batch size shared by the three loaders below.
batch_size = 5

# Each dataset item is (input ids, attention mask, labels).
# Training batches are drawn in random order (RandomSampler).
# train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = batch_size)

# Validation batches keep dataset order (SequentialSampler).
# val_data = TensorDataset(val_inputs, val_masks, val_labels, val_token_types)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size = batch_size)

# Test batches keep dataset order (SequentialSampler).
# test_data = TensorDataset(test_inputs, test_masks, test_labels, test_token_types)
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = batch_size)

In [None]:
class BioBERT(torch.nn.Module):
  """Frozen Bio_Discharge_Summary_BERT encoder followed by a two-layer multi-label head.

  The head flattens the encoder's first output into one vector per example,
  and `linear_1` expects that vector to have 768*512 features.
  `forward` returns per-code probabilities: the sigmoid is already applied,
  so the result must not be fed to a loss that applies its own sigmoid.
  """

  def __init__(self):
    super(BioBERT, self).__init__()
    self.model = AutoModel.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT")
    # 768*512 input features — presumably hidden size 768 times a sequence
    # length of 512; inputs of any other padded length will not fit this layer.
    self.linear_1 = torch.nn.Linear(768*512, 700)
    self.linear_2 = torch.nn.Linear(700, len(codes))

    # Freeze the encoder; only the two linear layers stay trainable.
    for param in self.model.base_model.parameters():
      param.requires_grad = False

  def forward(self, ids, mask):
    """Return per-code probabilities for a batch.

    Args:
      ids: token-id tensor whose first dimension is the batch.
      mask: attention mask passed to the encoder as `attention_mask`.
    """
    output = self.model(ids, attention_mask = mask)
    # Flatten per example using the size of THIS batch. The previous code used
    # the notebook-level `batch_size`, which breaks on a shorter final batch.
    flat = output[0].reshape(ids.size(0), -1)
    linear_output_1 = torch.relu(self.linear_1(flat))
    linear_output_2 = torch.sigmoid(self.linear_2(linear_output_1))

    return linear_output_2

In [None]:
model = BioBERT()
# Use the shared `device` (the same one every batch is moved to) rather than a
# hard-coded "cuda", so model and data always live on the same device.
model.to(device)
# criterion = BCEWithLogitsLoss()
# Optimise only parameters that still require gradients (the frozen encoder is skipped).
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))

In [None]:
# Store our loss and accuracy for plotting
train_loss_set = []
max_val_f1score = 0.0

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# BioBERT.forward already ends in a sigmoid, so the model emits probabilities.
# BCELoss consumes probabilities directly; the previously used
# BCEWithLogitsLoss applied a second sigmoid on top of them.
# Built once instead of on every step.
loss_func = BCELoss()

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc = "Epoch"):

  # Training

  # NOTE(review): switching to training mode is left disabled, as in the
  # original cell; after the first epoch the model therefore stays in the
  # eval mode set by the validation step below — confirm this is intentional.
  # model.train()

  # Tracking variables
  tr_loss = 0 #running loss
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    batch = tuple(t.to(device) for t in batch) # Add batch to the compute device
    b_input_ids, b_input_mask, b_labels = batch # Unpack the inputs from our dataloader
    optimizer.zero_grad() # Clear out the gradients (by default they accumulate)
    output = model(b_input_ids, b_input_mask) # Forward pass: per-code probabilities

    # Labels are cast to the output's dtype because the loss needs float targets
    loss = loss_func(output.view(-1, len(codes)), b_labels.type_as(output).view(-1, len(codes)))
    batch_loss = loss.item()
    train_loss_set.append(batch_loss)

    loss.backward()
    optimizer.step()

    # Update tracking variables
    tr_loss += batch_loss
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("\nTrain loss: {}".format(tr_loss/nb_tr_steps))

  # Validation
  model.eval()

  # Per-batch ground truth and predicted probabilities (CPU copies only; the
  # input batches are not retained since nothing reads them).
  true_labels, pred_labels = [], []

  for i, batch in enumerate(val_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
      # The model output is used directly as the prediction score.
      pred_label = model(b_input_ids, b_input_mask) # Forward pass for multilabel classification

    true_labels.append(b_labels.to('cpu').numpy())
    pred_labels.append(pred_label.to('cpu').numpy())

  # Flatten outputs: one array per document instead of one per batch
  pred_labels = [item for sublist in pred_labels for item in sublist]
  true_labels = [item for sublist in true_labels for item in sublist]

  # Calculate Accuracy
  pred_bools = [pl > threshold for pl in pred_labels]
  true_bools = [tl == 1 for tl in true_labels]

  val_f1_accuracy = f1_score(true_bools, pred_bools,average = 'macro')
  val_flat_accuracy = accuracy_score(true_bools, pred_bools)
  val_hamming_score = hamming_score(np.array(true_bools), np.array(pred_bools))

  print('\nF1 Validation Accuracy: ', val_f1_accuracy)
  print('Flat Validation Accuracy: ', val_flat_accuracy)
  print("Validation Hamming Score ", val_hamming_score)

In [None]:
  # Testing

  # Put model in evaluation mode to score the held-out test set
  model.eval()

  # Per-batch ground truth and predicted probabilities (CPU copies only; the
  # input batches are not retained since nothing reads them).
  true_labels, pred_labels = [], []

  for i, batch in enumerate(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
      # The model output is used directly as the prediction score.
      pred_label = model(b_input_ids, b_input_mask) # Forward pass for multilabel classification

    true_labels.append(b_labels.to('cpu').numpy())
    pred_labels.append(pred_label.to('cpu').numpy())

  # Flatten outputs: one array per document instead of one per batch
  pred_labels = [item for sublist in pred_labels for item in sublist]
  true_labels = [item for sublist in true_labels for item in sublist]

  # Calculate Accuracy
  pred_bools = [pl > threshold for pl in pred_labels]
  true_bools = [tl == 1 for tl in true_labels]

  test_f1_accuracy = f1_score(true_bools, pred_bools,average = 'macro')
  test_flat_accuracy = accuracy_score(true_bools, pred_bools)
  test_hamming_score = hamming_score(np.array(true_bools), np.array(pred_bools))

  # Report the TEST metrics computed above. The cell previously printed the
  # stale validation metrics under "Validation" labels.
  print('\nF1 Test Accuracy: ', test_f1_accuracy)
  print('Flat Test Accuracy: ', test_flat_accuracy)
  print("Test Hamming Score ", test_hamming_score)

# **P-subtask** *English*: **Data Loader**

X_train, X_val, X_test: list of *input text data*

Y_train, Y_val, Y_test: list of one-hot encoded *labels*

---

In [None]:
# Each split is a header-less TSV; columns 0 and 1 are renamed to "Id" and "ICD10".
df_train = pd.read_csv(
    'drive/MyDrive/CodiEsp/train/trainP.tsv', sep = '\t', header = None
).rename(columns = {0:"Id", 1:"ICD10"})
print("Training Data:")
display(df_train.head())

print("\n\nValidation Data:")
df_val = pd.read_csv(
    'drive/MyDrive/CodiEsp/dev/devP.tsv', sep = '\t', header = None
).rename(columns = {0:"Id", 1:"ICD10"})
display(df_val.head())

print("\n\nTest Data:")
df_test = pd.read_csv(
    'drive/MyDrive/CodiEsp/test/testP.tsv', sep = '\t', header = None
).rename(columns = {0:"Id", 1:"ICD10"})
display(df_test.head())

# All three splits stacked; used to derive the shared id and code vocabularies.
df = pd.concat([df_train, df_val, df_test])

In [None]:
# Unique document ids and ICD-10 codes over the concatenated train/dev/test frame.
ids = df['Id'].unique()
codes = df['ICD10'].unique()

# `ids` spans all three splits, so the label says so (it used to claim "training data").
print("Number of documents (train + dev + test):", len(ids),
      "\nNumber of ICD10 codes:", len(codes))

In [None]:
# Fixed column index for every ICD-10 code in the multi-hot label vectors.
code2idx = {code: idx for idx, code in enumerate(codes)}

# Every document starts with an all-zero vector holding one slot per code.
id2label = {doc_id: [0]*len(codes) for doc_id in ids}

# Switch on the slot of each annotated (document, code) pair.
# Columns are read by name: integer keys on a label-indexed row (data[0], data[1])
# rely on positional fallback, which pandas has deprecated.
for _id, _code in zip(df['Id'], df['ICD10']):
  id2label[_id][code2idx[_code]] = 1

# Parallel tuples of document ids and their label vectors, in dict insertion order.
_id2label = list(id2label.items())
ID, Y = zip(*_id2label)

In [None]:
def remstopwords(text, stopwords):
    """Clean a note and drop stopwords.

    Steps, in order: remove ``[** ... **]`` spans, remove ``<...>`` tags,
    lowercase and collapse every run of non-word characters to one space,
    remove digit runs that directly follow a space, then drop every
    whitespace-separated token found in ``stopwords``.

    Args:
        text: Raw note text.
        stopwords: Iterable of tokens to discard (compared after lowercasing
            of ``text``; the stopwords themselves are used as given).

    Returns:
        The remaining tokens joined by single spaces.
    """
    # A set makes each membership test O(1) instead of scanning a list per token.
    stop_set = set(stopwords)
    # Raw strings: the patterns are unchanged, but the backslashes are no longer
    # invalid string escapes (which newer Python versions warn about).
    text = re.sub(r'\[\*\*[^\]]*\*\*\]', '', text)
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'[\W]+', ' ', text.lower())
    text = re.sub(r" \d+", " ", text)
    return " ".join(tok for tok in text.split() if tok not in stop_set)

# English stopword list from the NLTK `stopwords` corpus (downloaded in the import cell).
stop_words = stopwords.words('english')

In [None]:
X_train, Y_train = [], []

# One cleaned note and one label vector per unique training document id.
for doc_id in df_train['Id'].unique():
  Y_train.append(id2label[doc_id])
  note_path = 'drive/MyDrive/CodiEsp/train/text_files_en/' + doc_id + '.txt'
  with open(note_path, 'r') as note_file:
    text = note_file.read().replace('\n', ' ')
  X_train.append(remstopwords(text.lower(), stop_words))


# Store the cleaned notes on Drive, then read the stored copy back.
with open("drive/MyDrive/X_train_P.txt", "wb") as cache_out:
  pickle.dump(X_train, cache_out)

with open("drive/MyDrive/X_train_P.txt", "rb") as cache_in:
  X_train = pickle.load(cache_in)

In [None]:
X_val, Y_val = [], []

# One cleaned note and one label vector per unique validation document id.
for doc_id in df_val['Id'].unique():
  Y_val.append(id2label[doc_id])
  note_path = 'drive/MyDrive/CodiEsp/dev/text_files_en/' + doc_id + '.txt'
  with open(note_path, 'r') as note_file:
    text = note_file.read().replace('\n', ' ')
  X_val.append(remstopwords(text.lower(), stop_words))


# Store the cleaned notes on Drive, then read the stored copy back.
with open("drive/MyDrive/X_val_P.txt", "wb") as cache_out:
  pickle.dump(X_val, cache_out)
with open("drive/MyDrive/X_val_P.txt", "rb") as cache_in:
  X_val = pickle.load(cache_in)

In [None]:
X_test, Y_test = [], []

# One cleaned note and one label vector per unique test document id.
for doc_id in df_test['Id'].unique():
  Y_test.append(id2label[doc_id])

  note_path = 'drive/MyDrive/CodiEsp/test/text_files_en/' + doc_id + '.txt'
  with open(note_path, 'r') as note_file:
    text = note_file.read().replace('\n', ' ')
  X_test.append(remstopwords(text.lower(), stop_words))


# Store the cleaned notes on Drive, then read the stored copy back.
with open("drive/MyDrive/X_test_P.txt", "wb") as cache_out:
  pickle.dump(X_test, cache_out)
with open("drive/MyDrive/X_test_P.txt", "rb") as cache_in:
  X_test = pickle.load(cache_in)

In [None]:
# p_code[k] becomes 1 once code k is seen on at least one training document.
p_code = [0]*len(codes)
for label_vector in Y_train:
  for idx, flag in enumerate(label_vector):
    if flag == 1:
      p_code[idx] = 1

# Codes that occur only in the dev/test annotations.
not_present = sum(1 for present in p_code if present == 0)

print("Number of classes NOT PRESENT in training dataset:", not_present)

# **P-subtask** *English*: **BERT Model**

In [None]:
from transformers import AdamW
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
# Tokenizer and classifier are both loaded from the same pretrained checkpoint.
pretrained_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(pretrained_name, do_lower_case = True)
# The classification output has one label per entry in `codes`.
model = BertForSequenceClassification.from_pretrained(pretrained_name, num_labels = len(codes))

In [None]:
# Identical tokenisation settings for all three splits: truncate at 512 tokens and pad.
encode_options = dict(max_length = 512, padding = True, truncation = True)

encodings_train = tokenizer.batch_encode_plus(X_train, **encode_options)
encodings_val = tokenizer.batch_encode_plus(X_val, **encode_options)
encodings_test = tokenizer.batch_encode_plus(X_test, **encode_options)

print('tokenizer outputs: ', encodings_train.keys())

In [None]:
# Convert every split into torch tensors, the datatype the model consumes
def _as_tensors(encodings, labels):
  """Return (input_ids, labels, attention_mask, token_type_ids) as torch tensors."""
  return (
    torch.tensor(encodings['input_ids']),
    torch.tensor(labels),
    torch.tensor(encodings['attention_mask']),
    torch.tensor(encodings['token_type_ids']),
  )

train_inputs, train_labels, train_masks, train_token_types = _as_tensors(encodings_train, Y_train)

val_inputs, val_labels, val_masks, val_token_types = _as_tensors(encodings_val, Y_val)

test_inputs, test_labels, test_masks, test_token_types = _as_tensors(encodings_test, Y_test)

In [None]:
batch_size = 5

def _build_loader(inputs, masks, labels, token_types, sampler_cls):
  """Wrap the tensors in a TensorDataset and return (dataset, sampler, dataloader)."""
  dataset = TensorDataset(inputs, masks, labels, token_types)
  sampler = sampler_cls(dataset)
  loader = DataLoader(dataset, sampler = sampler, batch_size = batch_size)
  return dataset, sampler, loader

# Training batches are drawn in random order; validation and test keep dataset order.
train_data, train_sampler, train_dataloader = _build_loader(
  train_inputs, train_masks, train_labels, train_token_types, RandomSampler)

val_data, val_sampler, val_dataloader = _build_loader(
  val_inputs, val_masks, val_labels, val_token_types, SequentialSampler)

test_data, test_sampler, test_dataloader = _build_loader(
  test_inputs, test_masks, test_labels, test_token_types, SequentialSampler)

In [None]:
# .to(device) handles both CUDA and CPU devices, whereas .cuda() only accepts a CUDA device.
model.to(device);

In [None]:
# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and weight_decay are
# set explicitly to the values the transformers implementation used by default.
optimizer = torch.optim.AdamW(model.parameters(), lr = 2e-5, eps = 1e-6, weight_decay = 0.0)

In [None]:
# Freeze every parameter of the base model so that only the layers outside it receive gradients.
model.base_model.requires_grad_(False);

In [None]:
def hamming_score(y_true, y_pred):
    """Mean per-sample overlap between true and predicted label sets (multi-label case).

    For each row, the active labels are the indices of its truthy entries. The row
    score is |true ∩ pred| / |true ∪ pred|, or 1 when both sets are empty. The result
    is the mean over all rows of ``y_true``.
    See http://stackoverflow.com/q/32239577/395857
    """

    def _row_score(row):
        truth = set(np.where(y_true[row])[0])
        guess = set(np.where(y_pred[row])[0])
        if not truth and not guess:
            # Nothing expected and nothing predicted counts as a perfect match.
            return 1
        return len(truth & guess) / float(len(truth | guess))

    return np.mean([_row_score(row) for row in range(y_true.shape[0])])

In [None]:
# Per-step training losses (kept for plotting) and the best validation macro-F1 seen so far
train_loss_set = []
max_val_f1score = 0.0

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# The loss module is built once and reused; there is no need to recreate it on every step
loss_func = BCEWithLogitsLoss()

# trange is a tqdm wrapper around the normal python range
# (`epoch` rather than `_`, so IPython's last-output variable is left alone)
for epoch in trange(epochs, desc = "Epoch"):

  # Training

  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0 # running loss
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    batch = tuple(t.to(device) for t in batch) # Move the batch to the selected device
    b_input_ids, b_input_mask, b_labels, b_token_types = batch # Unpack the inputs from our dataloader
    optimizer.zero_grad() # Clear out the gradients (by default they accumulate)

    # Forward pass for multilabel classification; b_token_types is unpacked but not passed on
    outputs = model(b_input_ids, token_type_ids = None, attention_mask = b_input_mask)

    logits = outputs[0]
    # Labels are cast to the logits' dtype because the loss works on floating-point targets
    loss = loss_func(logits.view(-1, len(codes)), b_labels.type_as(logits).view(-1, len(codes)))
    step_loss = loss.item()
    train_loss_set.append(step_loss)

    loss.backward()
    optimizer.step()

    # Update tracking variables
    tr_loss += step_loss
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("\nTrain loss: {}".format(tr_loss/nb_tr_steps))

###############################################################################

  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Variables to gather full output
  logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []

  for batch in val_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels, b_token_types = batch

    with torch.no_grad():
      outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      b_logit_pred = outs[0]
      # Sigmoid turns each logit into an independent per-label probability
      pred_label = torch.sigmoid(b_logit_pred)

      b_logit_pred = b_logit_pred.detach().cpu().numpy()
      pred_label = pred_label.to('cpu').numpy()
      b_labels = b_labels.to('cpu').numpy()

    # Keep the ids on the CPU so the collected batches do not stay resident on the GPU
    tokenized_texts.append(b_input_ids.to('cpu'))
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)

  # Flatten outputs (list of batches -> list of per-sample rows)
  pred_labels = [item for sublist in pred_labels for item in sublist]
  true_labels = [item for sublist in true_labels for item in sublist]

  # Calculate Accuracy: a label counts as predicted when its probability exceeds `threshold`
  pred_bools = [pl > threshold for pl in pred_labels]
  true_bools = [tl == 1 for tl in true_labels]

  val_f1_accuracy = f1_score(true_bools, pred_bools,average = 'macro')
  val_flat_accuracy = accuracy_score(true_bools, pred_bools)
  val_hamming_score = hamming_score(np.array(true_bools), np.array(pred_bools))

  # if (max_val_f1score <= val_f1_accuracy):
  #   torch.save(model.state_dict(), 'drive/MyDrive/BEST_BertModel.pt')

  # Keep the running maximum up to date so the comparison above is meaningful when enabled
  max_val_f1score = max(max_val_f1score, val_f1_accuracy)

  print('\nF1 Validation Accuracy: ', val_f1_accuracy)
  print('Flat Validation Accuracy: ', val_flat_accuracy)
  print("Validation Hamming Score ", val_hamming_score)

In [None]:
# NOTE(review): despite the "BEST" file name, this writes whatever weights the model holds
# when this cell runs — confirm that is the intended checkpoint.
torch.save(model.state_dict(), 'drive/MyDrive/BEST_BertModelP.pt')

In [None]:
# model.load_state_dict(torch.load('drive/MyDrive/BEST_BertModelP.pt'))
# Evaluate the model on the test split
model.eval()

# Variables to gather full output
logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []

for batch in test_dataloader:
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels, b_token_types = batch

  with torch.no_grad():
    outs = model(b_input_ids, token_type_ids = None, attention_mask = b_input_mask)
    b_logit_pred = outs[0]
    # Sigmoid turns each logit into an independent per-label probability
    pred_label = torch.sigmoid(b_logit_pred)

    b_logit_pred = b_logit_pred.detach().cpu().numpy()
    pred_label = pred_label.to('cpu').numpy()
    b_labels = b_labels.to('cpu').numpy()

  # Keep the ids on the CPU so the collected batches do not stay resident on the GPU
  tokenized_texts.append(b_input_ids.to('cpu'))
  logit_preds.append(b_logit_pred)
  true_labels.append(b_labels)
  pred_labels.append(pred_label)

# Flatten outputs (list of batches -> list of per-sample rows)
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]

# Calculate Accuracy: a label counts as predicted when its probability exceeds `threshold`
pred_bools = [pl > threshold for pl in pred_labels]
true_bools = [tl == 1 for tl in true_labels]

test_f1_accuracy = f1_score(true_bools, pred_bools,average = 'macro')
test_flat_accuracy = accuracy_score(true_bools, pred_bools)
test_hamming_score = hamming_score(np.array(true_bools), np.array(pred_bools))

print('\nF1 Test Accuracy: ', test_f1_accuracy)
print('Flat Test Accuracy: ', test_flat_accuracy)
print("Test Hamming Score ", test_hamming_score)

# References


https://towardsdatascience.com/transformers-for-multilabel-classification-71a1a0daf5e1

https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT

http://stackoverflow.com/q/32239577/395857
