In [1]:
# Importing the libraries needed
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import evaluate

In [2]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
device

'cuda'

In [4]:
global debug
debug = {}

In [5]:
import pandas as pd

train_dataset = pd.read_csv("data/prompt_baseline_data/alcohol_train.csv")
test_dataset = pd.read_csv("data/prompt_baseline_data/alcohol_valid.csv")

train_dataset = train_dataset.iloc[:97]
test_dataset = test_dataset.iloc[:97]

print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

OUT_DIM1=4
OUT_DIM2=5

TRAIN Dataset: (97, 4)
TEST Dataset: (97, 4)


In [6]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 1
LEARNING_RATE = 1e-05

MODEL_NAME = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [7]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        title = str(self.data.input[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'target1': torch.tensor(self.data.label1[index], dtype=torch.long),
            'target2': torch.tensor(self.data.label2[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [8]:
training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

In [9]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [10]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class ModelClass(torch.nn.Module):
    def __init__(self):
        super(ModelClass, self).__init__()
        self.l1 = AutoModel.from_pretrained(MODEL_NAME)
        
        self.pre_classifier1 = torch.nn.Linear(768, 768)
        self.dropout1 = torch.nn.Dropout(0.3)
        self.classifier1 = torch.nn.Linear(768, OUT_DIM1)
        
        self.pre_classifier2 = torch.nn.Linear(768, 768)
        self.dropout2 = torch.nn.Dropout(0.3)
        self.classifier2 = torch.nn.Linear(768, OUT_DIM2)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        
        pooler1 = hidden_state[:, 0]
        pooler1 = self.pre_classifier1(pooler1)
        pooler1 = torch.nn.ReLU()(pooler1)
        pooler1 = self.dropout1(pooler1)
        output1 = self.classifier1(pooler1)
        
        pooler2 = hidden_state[:, 0]
        pooler2 = self.pre_classifier2(pooler2)
        pooler2 = torch.nn.ReLU()(pooler2)
        pooler2 = self.dropout2(pooler2)
        output2 = self.classifier2(pooler2)
        
        return output1, output2
    

In [11]:
model = ModelClass()
model.to(device)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ModelClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
   

In [12]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

<a id='section05'></a>
### Fine Tuning the Model

In [13]:
def valid(model, testing_loader):
    model.eval()
    tr_loss = 0
    mcc1 = 0
    macro_f1_1 = 0
    micro_f1_1 = 0
    mcc2 = 0
    macro_f1_2 = 0
    micro_f1_2 = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            target1 = data['target1'].to(device, dtype = torch.long)
            target2 = data['target2'].to(device, dtype = torch.long)
            output1, output2 = model(ids, mask)
            
#             target1 = target1.squeeze()
#             target2 = target2.squeeze()
        
#             output1 = output1.squeeze()
#             output2 = output2.squeeze()
            
            loss1 = loss_function(output1, target1)
            loss2 = loss_function(output2, target2)
            loss = loss1+loss2
            tr_loss += loss.item()
            
            big_val1, big_idx1 = torch.max(output1.data, dim=1)
            big_val2, big_idx2 = torch.max(output2.data, dim=1)
            
            matthews_metric = evaluate.load("matthews_correlation")
            mcc_result1 = matthews_metric.compute(references=target1,
                                              predictions=big_idx1)['matthews_correlation']
            mcc_result2 = matthews_metric.compute(references=target2,
                                              predictions=big_idx2)['matthews_correlation']

            f1_metric = evaluate.load("f1")
            macro_f1_result1 = f1_metric.compute(predictions=big_idx1, references=target1, average='macro')['f1']
            micro_f1_result1 = f1_metric.compute(predictions=big_idx1, references=target1, average='micro')['f1']
            
            macro_f1_result2 = f1_metric.compute(predictions=big_idx2, references=target2, average='macro')['f1']
            micro_f1_result2 = f1_metric.compute(predictions=big_idx2, references=target2, average='micro')['f1']

            mcc1 += mcc_result1
            mcc2 += mcc_result2
            macro_f1_1 += macro_f1_result1
            macro_f1_2 += macro_f1_result2
            micro_f1_1 += micro_f1_result1
            micro_f1_2 += micro_f1_result2

            nb_tr_steps += 1
            nb_tr_examples+=target1.size(0)
    
    print("task1: presense")
    print(f'The Total MCC for Epoch {epoch}: {(mcc1*100)/nb_tr_steps}')
    print(f'The Total macro_f1 for Epoch {epoch}: {(macro_f1_1*100)/nb_tr_steps}')
    print(f'The Total micro_f1 for Epoch {epoch}: {(micro_f1_1*100)/nb_tr_steps}')
    
    print("task2: period")
    print(f'The Total MCC for Epoch {epoch}: {(mcc2*100)/nb_tr_steps}')
    print(f'The Total macro_f1 for Epoch {epoch}: {(macro_f1_2*100)/nb_tr_steps}')
    print(f'The Total micro_f1 for Epoch {epoch}: {(micro_f1_2*100)/nb_tr_steps}')
    
    epoch_loss = tr_loss/nb_tr_steps
    print(f"Validation Loss Epoch: {epoch_loss}")
    
    return


In [14]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
    global debug
    tr_loss = 0
    mcc1 = 0
    macro_f1_1 = 0
    micro_f1_1 = 0
    mcc2 = 0
    macro_f1_2 = 0
    micro_f1_2 = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        target1 = data['target1'].to(device, dtype = torch.long)
        target2 = data['target2'].to(device, dtype = torch.long)
        output1, output2 = model(ids, mask)
        
        debug['target1'] = target1
        debug['target2'] = target2
        debug['output1'] = output1
        debug['output2'] = output2
        
#         target1 = target1.squeeze()
#         target2 = target2.squeeze()
        
#         output1 = output1.squeeze()
#         output2 = output2.squeeze()

        loss1 = loss_function(output1, target1)
        loss2 = loss_function(output2, target2)
        loss = loss1+loss2
        tr_loss += loss.item()

        big_val1, big_idx1 = torch.max(output1.data, dim=1)
        big_val2, big_idx2 = torch.max(output2.data, dim=1)

        matthews_metric = evaluate.load("matthews_correlation")
        mcc_result1 = matthews_metric.compute(references=target1,
                                          predictions=big_idx1)['matthews_correlation']
        mcc_result2 = matthews_metric.compute(references=target2,
                                          predictions=big_idx2)['matthews_correlation']

        f1_metric = evaluate.load("f1")
        macro_f1_result1 = f1_metric.compute(predictions=big_idx1, references=target1, average='macro')['f1']
        micro_f1_result1 = f1_metric.compute(predictions=big_idx1, references=target1, average='micro')['f1']

        macro_f1_result2 = f1_metric.compute(predictions=big_idx2, references=target2, average='macro')['f1']
        micro_f1_result2 = f1_metric.compute(predictions=big_idx2, references=target2, average='micro')['f1']

        mcc1 += mcc_result1
        mcc2 += mcc_result2
        macro_f1_1 += macro_f1_result1
        macro_f1_2 += macro_f1_result2
        micro_f1_1 += micro_f1_result1
        micro_f1_2 += micro_f1_result2

        nb_tr_steps += 1
        nb_tr_examples+=target1.size(0)

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print("task1: presense")
    print(f'The Total MCC for Epoch {epoch}: {(mcc1*100)/nb_tr_steps}')
    print(f'The Total macro_f1 for Epoch {epoch}: {(macro_f1_1*100)/nb_tr_steps}')
    print(f'The Total micro_f1 for Epoch {epoch}: {(micro_f1_1*100)/nb_tr_steps}')
    
    print("task2: period")
    print(f'The Total MCC for Epoch {epoch}: {(mcc2*100)/nb_tr_steps}')
    print(f'The Total macro_f1 for Epoch {epoch}: {(macro_f1_2*100)/nb_tr_steps}')
    print(f'The Total micro_f1 for Epoch {epoch}: {(micro_f1_2*100)/nb_tr_steps}')
    
    epoch_loss = tr_loss/nb_tr_steps
    print(f"Training Loss Epoch: {epoch_loss}")
    
    print('This is the validation section to print the accuracy and see how it performs')
    print('Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch')

    valid(model, testing_loader)
    print('------------------------------------------------------------------')

    return 

In [15]:
for epoch in range(EPOCHS):
    train(epoch)

task1: presense
The Total MCC for Epoch 0: 3.889381488742141
The Total macro_f1 for Epoch 0: 22.401263266647884
The Total micro_f1 for Epoch 0: 32.69230769230769
task2: period
The Total MCC for Epoch 0: 4.015294067677567
The Total macro_f1 for Epoch 0: 26.750571365955985
The Total micro_f1 for Epoch 0: 40.38461538461539
Training Loss Epoch: 2.862440586090088
This is the validation section to print the accuracy and see how it performs
Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch
task1: presense
The Total MCC for Epoch 0: -3.6491933878387277
The Total macro_f1 for Epoch 0: 15.263070263070263
The Total micro_f1 for Epoch 0: 22.115384615384617
task2: period
The Total MCC for Epoch 0: 6.3271582623203395
The Total macro_f1 for Epoch 0: 14.253108003108004
The Total micro_f1 for Epoch 0: 25.0
Validation Loss Epoch: 2.8067535620469313
------------------------------------------------------------------


In [16]:
task1: presense
The Total MCC for Epoch 0: 7.039135250263674
The Total macro_f1 for Epoch 0: 28.80850630850631
The Total micro_f1 for Epoch 0: 34.61538461538461
task2: period
The Total MCC for Epoch 0: 7.2460908384321625
The Total macro_f1 for Epoch 0: 28.4947274947275
The Total micro_f1 for Epoch 0: 37.5
Training Loss Epoch: 2.9264204502105713
This is the validation section to print the accuracy and see how it performs
Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch
task1: presense
The Total MCC for Epoch 0: 3.774527286173485
The Total macro_f1 for Epoch 0: 22.77093277093277
The Total micro_f1 for Epoch 0: 30.76923076923077
task2: period
The Total MCC for Epoch 0: 9.494652962167164
The Total macro_f1 for Epoch 0: 23.928340178340175
The Total micro_f1 for Epoch 0: 36.53846153846154
Validation Loss Epoch: 2.826401252012986
------------------------------------------------------------------
​


SyntaxError: invalid syntax (<ipython-input-16-f0a3a850b68a>, line 2)

In [None]:
task1: presense
The Total MCC for Epoch 0: 1.5186428224958337
The Total macro_f1 for Epoch 0: 25.67104192104192
The Total micro_f1 for Epoch 0: 33.65384615384615
task2: period
The Total MCC for Epoch 0: 9.016377922467722
The Total macro_f1 for Epoch 0: 20.456117956117957
The Total micro_f1 for Epoch 0: 32.69230769230769
Training Loss Epoch: 2.9077738431783824
This is the validation section to print the accuracy and see how it performs
Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch
task1: presense
The Total MCC for Epoch 0: 7.367690333612365
The Total macro_f1 for Epoch 0: 23.28856328856329
The Total micro_f1 for Epoch 0: 31.73076923076923
task2: period
The Total MCC for Epoch 0: 5.777918830654928
The Total macro_f1 for Epoch 0: 15.614524364524366
The Total micro_f1 for Epoch 0: 26.923076923076923
Validation Loss Epoch: 2.8747668266296387
------------------------------------------------------------------