<a href="https://colab.research.google.com/github/sahithikodali1/Summarization-of-Biomedical-evidence/blob/main/BIOASQ_Bert_Baseline_8b_b5_results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import os
import json
import csv
import pickle
import matplotlib.pyplot as plt
% matplotlib inline

# install
!pip install pytorch-pretrained-bert pytorch-nlp

# BERT imports
import torch

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange



In [2]:
#Code to split data based on qid

DATA_DIR = "/content/drive/MyDrive/Thesis_B"
file = pd.read_csv(os.path.join(DATA_DIR,'8b_data.csv'))

f_values = file.values
q_id_split = np.split(f_values, np.where(np.diff(f_values[:,0]))[0]+1)
file[:10]

Unnamed: 0,qid,pubmedid,sentid,N1,N2,L,S4,SU4,sentence text,SU4_labels,question
0,0,55031181e9bde69634000014,20,0.5814,0.57143,0.51219,0.44999,0.52892,The non-Mendelian inheritance of sporadic non-...,1,Is Hirschsprung disease a mendelian or a multi...
1,0,55031181e9bde69634000014,3,0.5814,0.57143,0.51219,0.44999,0.52892,The non-Mendelian inheritance of sporadic non-...,1,Is Hirschsprung disease a mendelian or a multi...
2,0,55031181e9bde69634000014,16,0.38961,0.26667,0.21917,0.16902,0.25117,The majority of the identified genes are relat...,1,Is Hirschsprung disease a mendelian or a multi...
3,0,55031181e9bde69634000014,2,0.38961,0.26667,0.21917,0.16902,0.25117,The majority of the identified genes are relat...,1,Is Hirschsprung disease a mendelian or a multi...
4,0,55031181e9bde69634000014,1,0.41379,0.18823,0.09639,0.04938,0.2,"In this study, we review the identification of...",1,Is Hirschsprung disease a mendelian or a multi...
5,0,55031181e9bde69634000014,17,0.41667,0.14894,0.04348,0.0,0.18015,In the etiology of Hirschsprung disease variou...,0,Is Hirschsprung disease a mendelian or a multi...
6,0,55031181e9bde69634000014,14,0.41667,0.14894,0.04348,0.0,0.18015,In the etiology of Hirschsprung disease variou...,0,Is Hirschsprung disease a mendelian or a multi...
7,0,55031181e9bde69634000014,6,0.32143,0.09091,0.05556,0.03774,0.1375,"Furthermore, mutations in the RET gene are res...",0,Is Hirschsprung disease a mendelian or a multi...
8,0,55031181e9bde69634000014,5,0.27659,0.10869,0.08889,0.06818,0.12406,"RET, GDNF, EDNRB, EDN3, and SOX10 lead to long...",0,Is Hirschsprung disease a mendelian or a multi...
9,0,55031181e9bde69634000014,9,0.31915,0.13043,0.04445,0.02273,0.09399,Hirschsprung disease (HSCR) is a multifactori...,0,Is Hirschsprung disease a mendelian or a multi...


In [3]:
#Predict train and test sizes

train_data_size = round(len(q_id_split)*0.8)
val_data_size = round(len(q_id_split) - train_data_size)
print(train_data_size)
print(val_data_size)

2594
648


In [4]:
#Shuffle qid randomly

import random
random.seed(3007)
random.shuffle(q_id_split)
print(type(q_id_split[:1]))

<class 'list'>


In [5]:
#Split train, val data

train_data = q_id_split[:2594]
val_data = q_id_split[2594:]
# test_data = q_id_split[2994:]
f_values_train = np.concatenate(train_data, axis=0)
f_values_val = np.concatenate(val_data, axis=0)
# f_values_test = np.concatenate(test_data, axis=0)
col_names = file.columns
print(col_names)
print(len(train_data))
print(len(val_data))

Index(['qid', 'pubmedid', 'sentid', 'N1', 'N2', 'L', 'S4', 'SU4',
       'sentence text', 'SU4_labels', 'question'],
      dtype='object')
2594
648


In [6]:
#Convert train,val,test data into dataframes

train_df = pd.DataFrame(f_values_train, columns = col_names)
val_df = pd.DataFrame(f_values_val, columns = col_names)
# test_df = pd.DataFrame(f_values_test, columns = col_names)

In [7]:
print(len(train_data))
print(len(val_data))
# print(len(test_data))

print(len(train_df))
print(len(val_df))
# print(len(test_df))
print(train_df.columns.values)

2594
648
41583
10161
['qid' 'pubmedid' 'sentid' 'N1' 'N2' 'L' 'S4' 'SU4' 'sentence text'
 'SU4_labels' 'question']


In [8]:
def obtain_specialtokenized_list(dataframe):
  sentences = dataframe['sentence text']
  questions = dataframe['question']
  sentences_list = list(sentences)
  questions_list = list(questions)
  question_sentence_list = []
  for i in range(len(sentences_list)):
    question_sentence_list = question_sentence_list + ["[CLS] " + questions_list[i] + " [SEP] " + sentences_list[i] + " [SEP]"]
  return question_sentence_list

In [9]:
train_specialtok_list = obtain_specialtokenized_list(train_df)  
val_specialtok_list = obtain_specialtokenized_list(val_df)  
# test_specialtok_list = obtain_specialtokenized_list(test_df)  

In [10]:
def obtain_SU4labels_list(dataframe):
  SU4_labels = dataframe['SU4_labels']
  labels_list = list(SU4_labels)
  return labels_list

In [11]:
train_labels = obtain_SU4labels_list(train_df)
val_labels = obtain_SU4labels_list(val_df)
# test_labels = obtain_SU4labels_list(test_df)

In [12]:
# specify GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla T4'

In [13]:
# Tokenize with BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


In [14]:
def tokenize_sent(givenlist):
  tokenized_texts = [tokenizer.tokenize(sent) for sent in givenlist]
  return tokenized_texts

In [15]:
train_tokenized = tokenize_sent(train_specialtok_list)
val_tokenized = tokenize_sent(val_specialtok_list)
# test_tokenized = tokenize_sent(test_specialtok_list)

In [16]:
print(type(train_tokenized))
print(len(train_tokenized))
print(train_tokenized[0])

<class 'list'>
41583
['[CLS]', 'what', 'type', 'of', 'mutation', 'is', 'causing', 'the', 'industrial', 'mel', '##ani', '##sm', 'ph', '##eno', '##type', 'in', 'pepper', '##ed', 'moths', '?', '[SEP]', 'here', 'we', 'show', 'that', 'the', 'mutation', 'event', 'giving', 'rise', 'to', 'industrial', 'mel', '##ani', '##sm', 'in', 'britain', 'was', 'the', 'insertion', 'of', 'a', 'large', ',', 'tandem', '##ly', 'repeated', ',', 'trans', '##po', '##sable', 'element', 'into', 'the', 'first', 'intro', '##n', 'of', 'the', 'gene', 'cortex', '.', '[SEP]']


In [17]:
print(type(train_tokenized))
print(len(train_tokenized))
print(train_tokenized[0])
print(len(val_tokenized))
print(val_tokenized[0])

<class 'list'>
41583
['[CLS]', 'what', 'type', 'of', 'mutation', 'is', 'causing', 'the', 'industrial', 'mel', '##ani', '##sm', 'ph', '##eno', '##type', 'in', 'pepper', '##ed', 'moths', '?', '[SEP]', 'here', 'we', 'show', 'that', 'the', 'mutation', 'event', 'giving', 'rise', 'to', 'industrial', 'mel', '##ani', '##sm', 'in', 'britain', 'was', 'the', 'insertion', 'of', 'a', 'large', ',', 'tandem', '##ly', 'repeated', ',', 'trans', '##po', '##sable', 'element', 'into', 'the', 'first', 'intro', '##n', 'of', 'the', 'gene', 'cortex', '.', '[SEP]']
10161
['[CLS]', 'does', 'mt', '##or', 'regulate', 'the', 'translation', 'of', 'map', '##ka', '##p', '##k', '##2', '?', '[SEP]', 'consequently', ',', 'mt', '##or', 'inhibition', 'or', 'con', '##sti', '##tu', '##tive', 'activation', 'of', 'z', '##fp', '##36', '##l', '##1', 'imp', '##air', '##s', 'the', 'non', '-', 'cell', '-', 'autonomous', 'effects', 'of', 'sen', '##es', '##cent', 'cells', 'in', 'both', 'tu', '##mour', '-', 'suppress', '##ive', 'and'

In [18]:
#Convert tokenized sentences to respective token ids

def token2ids(tokenized_texts):
  # Set the maximum sequence length. 
  MAX_LEN = 512

  tokens_to_ids = [tokenizer.convert_tokens_to_ids(sent) for sent in tokenized_texts]
  tokens_to_ids = pad_sequences(tokens_to_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

  return tokens_to_ids

In [19]:
train_tokenids = token2ids(train_tokenized)
val_tokenids = token2ids(val_tokenized)
# test_tokenids = token2ids(test_tokenized)

Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (544 > 512). Running this sequence through BERT will result in indexing errors


In [20]:
#Create masks for the tokenids

def create_masks(token_ids):
  attention_masks = []

  for tid in token_ids:
    tid_mask = [float(i>0) for i in tid]
    attention_masks.append(tid_mask)
  return attention_masks

In [21]:
train_masks = create_masks(train_tokenids)
val_masks = create_masks(val_tokenids)
# test_masks = create_masks(test_tokenids)

In [22]:
print(len(train_tokenids))
print(len(train_masks))

print(type(train_masks))
print(type(train_masks[0]))

print(type(train_tokenids))
print(type(train_tokenids[0]))

print(type(train_labels))
print(len(train_labels))

print(train_masks[0])
print(train_tokenids[0])
print(train_labels)

41583
41583
<class 'list'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'list'>
41583
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In [23]:
print(train_tokenids.shape)
print(len(train_masks))
print(len(train_labels))

print(val_tokenids.shape)
print(len(val_masks))
print(len(val_labels))

(41583, 512)
41583
41583
(10161, 512)
10161
10161


In [24]:
# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_tokenids, dtype=torch.int64)
validation_inputs = torch.tensor(val_tokenids, dtype=torch.int64)
# test_inputs = torch.tensor(test_tokenids, dtype=torch.int64)

train_labels = torch.tensor(train_labels, dtype=torch.int64)
validation_labels = torch.tensor(val_labels, dtype=torch.int64)
# test_labels = torch.tensor(val_labels, dtype=torch.int64)

train_masks = torch.tensor(train_masks, dtype=torch.float32)
validation_masks = torch.tensor(val_masks, dtype=torch.float32)
# test_masks = torch.tensor(val_masks, dtype=torch.float32)

In [25]:
print(type(train_inputs))
print(type(train_labels))
print(type(train_masks))

print(train_inputs.dtype)
print(train_labels.dtype)
print(train_masks.dtype)

print(validation_inputs.dtype)
print(validation_labels.dtype)
print(validation_masks.dtype)

print(train_inputs.shape)
print(train_labels.shape)
print(train_masks.shape)

print(train_inputs[0].shape)
print(train_masks[0].shape)
print(train_labels[0].shape)
print(validation_labels[0].shape)

<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
torch.int64
torch.int64
torch.float32
torch.int64
torch.int64
torch.float32
torch.Size([41583, 512])
torch.Size([41583])
torch.Size([41583, 512])
torch.Size([512])
torch.Size([512])
torch.Size([])
torch.Size([])


In [26]:
!pip install transformers
from transformers import get_linear_schedule_with_warmup



In [27]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top.

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 2)

for param in model.bert.parameters():
    param.requires_grad = False
    
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [32]:
#BERT fine-tuning parameters
lr=2e-5
num_warmup_steps = 10
num_training_steps = 1000

optimizer = torch.optim.AdamW(model.parameters(), lr = lr)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = num_warmup_steps, num_training_steps = num_training_steps)

In [33]:
# Select a batch size for training. 
batch_size = 32

# Create an iterator of our data with torch DataLoader 
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data) #change to seq sampler next time and try
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [34]:
import random
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [35]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

from sklearn.metrics import f1_score

def flat_f1score(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, pred_flat, average=None)

In [42]:
from torch.nn import CrossEntropyLoss
import copy

# Store our loss and accuracy for plotting
train_loss_set = []

epochs = 1

for epoch in trange(epochs, desc="Epoch"):  
  ## TRAINING
  # Set our model to training mode
  model.train()  
  tr_loss = 0
  tr_accuracy, tr_f1score = 0, 0
  nb_tr_steps = 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):

    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_masks, b_labels = batch

    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    
    # Forward pass
    logits = model(b_input_ids, token_type_ids = None, attention_mask = b_input_masks, labels = None)

    loss_fn = CrossEntropyLoss()
    loss = loss_fn(logits.view(-1, 2), b_labels.view(-1))

    loss.backward()

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    optimizer.step()
    scheduler.step()

    tr_loss += loss.item()
    tmp_tr_accuracy = flat_accuracy(logits, label_ids)
    # tmp_tr_f1score = flat_f1score(logits, label_ids)    
    tr_accuracy += tmp_tr_accuracy
    # tr_f1score += tmp_tr_f1score
    nb_tr_steps += 1

  epoch_loss = tr_loss/nb_tr_steps
  epoch_accuracy = tr_accuracy/nb_tr_steps
  # epoch_f1score = tr_f1score/nb_tr_steps
  train_loss_set.append(epoch_loss)

  print("Train loss: {}".format(epoch_loss))
  print("Training Accuracy for epoch: {}".format(epoch_accuracy))




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [28:34<00:00, 1714.64s/it]

Train loss: 0.6027412657095835
Training Accuracy for epoch: 0.7120785256410257





In [45]:
from torch.nn import CrossEntropyLoss
import copy

# Store our loss and accuracy for plotting
train_loss_set = []
val_loss_set = []

best_accuracy_val = 0
best_epoch = -1
best_epoch_weights = copy.deepcopy(model.state_dict())

epochs = 4

for epoch in trange(epochs, desc="Epoch"):  
  ## TRAINING
  # Set our model to training mode
  model.train()  
  tr_loss = 0
  tr_accuracy, tr_f1score = 0, 0
  nb_tr_steps = 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):

    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_masks, b_labels = batch

    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    
    # Forward pass
    logits = model(b_input_ids, token_type_ids = None, attention_mask = b_input_masks, labels = None)

    loss_fn = CrossEntropyLoss()
    loss = loss_fn(logits.view(-1, 2), b_labels.view(-1))

    loss.backward()

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    optimizer.step()
    scheduler.step()

    tr_loss += loss.item()
    tmp_tr_accuracy = flat_accuracy(logits, label_ids)
    # tmp_tr_f1score = flat_f1score(logits, label_ids)    
    tr_accuracy += tmp_tr_accuracy
    # tr_f1score += tmp_tr_f1score
    nb_tr_steps += 1

  epoch_loss = tr_loss/nb_tr_steps
  epoch_accuracy = tr_accuracy/nb_tr_steps
  # epoch_f1score = tr_f1score/nb_tr_steps
  train_loss_set.append(epoch_loss)

  print("Train loss: {}".format(epoch_loss))
  print("Training Accuracy for epoch: {}".format(epoch_accuracy))

  model.eval()

  eval_loss, eval_accuracy, eval_f1score = 0, 0, 0
  nb_eval_steps = 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:

    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_masks, b_labels = batch

    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
  
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids, token_type_ids = None, attention_mask = b_input_masks, labels = None)
          
    loss_fn = CrossEntropyLoss()
    loss = loss_fn(logits.view(-1, 2), b_labels.view(-1))

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    eval_loss += loss.item()
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    # tmp_eval_f1score = flat_f1score(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    # eval_f1score += tmp_eval_f1score
    nb_eval_steps += 1

  val_epoch_loss = eval_loss/nb_eval_steps
  val_epoch_accuracy = eval_accuracy/nb_eval_steps
  # epoch_f1score = eval_f1score/nb_eval_steps
  val_loss_set.append(val_epoch_loss)


  print("Validation loss for epoch: {}".format(val_epoch_loss))
  print("Validation Accuracy for epoch: {}".format(val_epoch_accuracy))
  # print("Validation f1score for epoch: {}".format(epoch_f1score))

  if (val_epoch_accuracy > best_accuracy_val):
    best_accuracy_val = val_epoch_accuracy
    best_epoch = epoch 
    torch.save(model.state_dict(), os.path.join(DATA_DIR, 'epoch_{}.pth'.format(epoch)))

print(" Best epoch: {}".format(best_epoch))
print(" Best Accuracy: {}".format(best_accuracy_val))




Epoch:   0%|          | 0/4 [00:00<?, ?it/s][A[A[A

Train loss: 0.6015919754367608
Training Accuracy for epoch: 0.7121746794871795
Validation loss for epoch: 0.6047609760513846
Validation Accuracy for epoch: 0.7060731132075472





Epoch:  25%|██▌       | 1/4 [35:29<1:46:28, 2129.57s/it][A[A[A

Train loss: 0.6011969084006089
Training Accuracy for epoch: 0.7121057692307692





Epoch:  50%|█████     | 2/4 [1:11:03<1:11:01, 2130.96s/it][A[A[A

Validation loss for epoch: 0.6047609760513846
Validation Accuracy for epoch: 0.7060731132075472
Train loss: 0.6009242728123299
Training Accuracy for epoch: 0.7126314102564103





Epoch:  75%|███████▌  | 3/4 [1:46:34<35:30, 2130.91s/it]  [A[A[A

Validation loss for epoch: 0.6047609760513846
Validation Accuracy for epoch: 0.7060731132075472
Train loss: 0.6012528790648167
Training Accuracy for epoch: 0.7125352564102564





Epoch: 100%|██████████| 4/4 [2:22:08<00:00, 2132.02s/it]

Validation loss for epoch: 0.6047609760513846
Validation Accuracy for epoch: 0.7060731132075472
 Best epoch: 0
 Best Accuracy: 0.7060731132075472





In [46]:
#Code to load jsonfile

from pandas.io.json import json_normalize 

with open('/content/drive/MyDrive/Thesis_B/8B5_golden.json', 'r') as json_file:
    data = json.load(json_file)

In [47]:
##Code to get questions list

data_access = data['questions']
id_list = []
type_list = []
body_list = []
all_text_list = []
for i in range(len(data_access)):
  text_list = []
  id_list.append(data_access[i]['id'])
  type_list.append(data_access[i]['type'])
  body_list.append(data_access[i]['body'])
  for j in data_access[i]['snippets']:
    text_list.append(j['text'])
  all_text_list.append(text_list)
  # list_concat = body_list+text_list

In [48]:
test8b_b5 = pd.DataFrame()
test8b_b5['id'] = id_list
test8b_b5['body'] = body_list
test8b_b5['type'] = type_list
test8b_b5['sentences'] = all_text_list

In [49]:
test8b_b5[:10]

Unnamed: 0,id,body,type,sentences
0,5d387360a1e1595105000007,What are the effects of CAMK4 inhibition?,list,[CaMK4-dependent activation of AKT/mTOR and CR...
1,5e4949d36d0a277941000006,List cohesinopathies,list,[Genetic mapping studies reveal that mutations...
2,5e4b540b6d0a27794100001c,Which molecule is targeted by Camrelizumab?,factoid,[Camrelizumab (SHR-1210) is a humanised anti-p...
3,5e822615835f4e4777000034,Which protein is mutated in Erythropoietic Pro...,factoid,[Erythropoietic protoporphyria (EPP) is a rare...
4,5e476da1d14c9f295d000002,Is marimastat effective for small-cell lung ca...,yesno,[The phase III trial in small cell lung cancer...
5,5d386ed6a1e1595105000004,Which part of the TNFR2 gene is genetically as...,factoid,[A TNFR2 3' flanking region polymorphism in sy...
6,5e51de866d0a27794100003f,What is VISMapper?,summary,[VISMapper: ultra-fast exhaustive cartography ...
7,5e3c83c548dab47f26000001,What is Taupathy?,summary,[Tauopathies are a group of neurodegenerative ...
8,5e4b64126d0a277941000028,Is Figitumumab effective for non-small cell lu...,yesno,"[A phase III study failed for carboplatin, pac..."
9,5e3ab58db5b409ea5300001c,The LINCS L1000 data set contains gene express...,yesno,[ Library of Integrated Network-based Cellular...


In [50]:
qid_test8b_b5 = test8b_b5['id']
type_test8b_b5= test8b_b5['type']
sentences_test8b_b5 = test8b_b5['sentences']
questions_test8b_b5 = test8b_b5['body']
qid_list_test8b_b5 = list(qid_test8b_b5)
type_list_test8b_b5 = list(type_test8b_b5)
sentences_list_test8b_b5 = list(sentences_test8b_b5)
questions_list_test8b_b5 = list(questions_test8b_b5)

In [51]:
print(sentences_list_test8b_b5[:10])
questions_list_test8b_b5[:10]

[['CaMK4-dependent activation of AKT/mTOR and CREM-α underlies autoimmunity-associated Th17 imbalance', ' Here, we present evidence that the calcium/calmodulin-dependent protein kinase IV (CaMK4) is increased and required during Th17 cell differentiation.', 'Inhibition of CaMK4 reduced Il17 transcription through decreased activation of the cAMP response element modulator α (CREM-α) and reduced activation of the AKT/mTOR pathway, which is known to enhance Th17 differentiation.', 'CaMK4 inhibition has potential as a therapeutic strategy for Th17-driven autoimmune diseases.', 'CAMK4 knockdown and kinase-dead mutant inhibited crocin-mediated HO-1 expression, Nrf2 activation, and phosphorylation of Akt, indicating that HO-1 expression is mediated by CAMK4 and that Akt is a downstream mediator of CAMK4 in crocin signaling', 'CaMK4 inhibition significantly decreased the levels of glycolytic intermediates such as glucose-6-phosphate, fructose-6-phosphate, fructose-1,6-diphosphate, pyruvate, an

['What are the effects of CAMK4 inhibition?',
 'List cohesinopathies',
 'Which molecule is targeted by Camrelizumab?',
 'Which protein is mutated in Erythropoietic Protoporphyria?',
 'Is marimastat effective for small-cell lung cancer?',
 'Which part of the TNFR2 gene is genetically associated with Systemic Lupus Erythematosus?',
 'What is VISMapper?',
 'What is Taupathy?',
 'Is Figitumumab effective for non-small cell lung cancer?',
 'The LINCS L1000 data set contains gene expression data for drug treated human cells, yes or no?']

In [52]:
question_sentence_list_test8b_b5 = []
for i in range(len(questions_list_test8b_b5)):
  each_list = []
  for j in sentences_list_test8b_b5[i]:
    each_list= each_list + ["[CLS] " + questions_list_test8b_b5[i] + " [SEP] " + j + " [SEP]"]
  question_sentence_list_test8b_b5.append(each_list)

In [53]:
question_sentence_list_test8b_b5[0]

['[CLS] What are the effects of CAMK4 inhibition? [SEP] CaMK4-dependent activation of AKT/mTOR and CREM-α underlies autoimmunity-associated Th17 imbalance [SEP]',
 '[CLS] What are the effects of CAMK4 inhibition? [SEP]  Here, we present evidence that the calcium/calmodulin-dependent protein kinase IV (CaMK4) is increased and required during Th17 cell differentiation. [SEP]',
 '[CLS] What are the effects of CAMK4 inhibition? [SEP] Inhibition of CaMK4 reduced Il17 transcription through decreased activation of the cAMP response element modulator α (CREM-α) and reduced activation of the AKT/mTOR pathway, which is known to enhance Th17 differentiation. [SEP]',
 '[CLS] What are the effects of CAMK4 inhibition? [SEP] CaMK4 inhibition has potential as a therapeutic strategy for Th17-driven autoimmune diseases. [SEP]',
 '[CLS] What are the effects of CAMK4 inhibition? [SEP] CAMK4 knockdown and kinase-dead mutant inhibited crocin-mediated HO-1 expression, Nrf2 activation, and phosphorylation of 

In [54]:
token_ids_test8b_b5_all = []
masks_test8b_b5_all = []
for ques_ans_sent in question_sentence_list_test8b_b5:
  # token_ids_individual = []
  # masks_individual = []
  tokenized_test8b_b5 = tokenize_sent(ques_ans_sent)
  token_ids_test8b_b5 = token2ids(tokenized_test8b_b5)
  masks_test8b_b5 = create_masks(token_ids_test8b_b5)
  token_ids_test8b_b5_all.append(token_ids_test8b_b5)
  masks_test8b_b5_all.append(masks_test8b_b5)

In [55]:
#Batch5 test data

print(len(token_ids_test8b_b5_all))
print(len(masks_test8b_b5_all))

print(type(token_ids_test8b_b5_all[0]))
print(type(masks_test8b_b5_all[0]))

print((token_ids_test8b_b5_all[0]))
print((masks_test8b_b5_all[0]))

100
100
<class 'numpy.ndarray'>
<class 'list'>
[[ 101 2054 2024 ...    0    0    0]
 [ 101 2054 2024 ...    0    0    0]
 [ 101 2054 2024 ...    0    0    0]
 [ 101 2054 2024 ...    0    0    0]
 [ 101 2054 2024 ...    0    0    0]
 [ 101 2054 2024 ...    0    0    0]]
[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In [58]:
topsentences_summary = []

for i in range(len(token_ids_test8b_b5_all)):
  batch_size = len(token_ids_test8b_b5_all[i])

  test_inputs = torch.tensor(token_ids_test8b_b5_all[i], dtype=torch.int64)
  test_masks = torch.tensor(masks_test8b_b5_all[i], dtype=torch.float32)

  test_data = TensorDataset(test_inputs, test_masks)
  test_sampler = SequentialSampler(test_data)
  test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

  logits_list_test8b_b5 = []

  model.eval()

  # Evaluate data for one epoch
  for batch in test_dataloader:

    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_masks = batch
    
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids, token_type_ids = None, attention_mask = b_input_masks, labels = None)   
      logits_list_test8b_b5.append(logits)
    
    prediction = torch.sigmoid(logits)
    print("Softmax:{}".format(prediction))

    if len(prediction) >= 5:
      values, indices = torch.topk(prediction,5,dim=0)
      print("top_indices:{}".format(indices))
      indexes = indices[:,0].tolist()
      each_summ = []
      for ind in indexes:
        each_summ.append(test8b_b5['sentences'][i][ind])
      each_summ = ' '.join(map(str, each_summ))
      topsentences_summary.append(each_summ)
      print(each_summ)
    else:
      values, indices = torch.topk(prediction,len(prediction),dim=0)
      print("top_indices:{}".format(indices))
      indexes = indices[:,0].tolist()
      each_summ = []
      for ind in indexes:
        each_summ.append(test8b_b5['sentences'][i][ind])
      each_summ = ' '.join(map(str, each_summ))
      topsentences_summary.append(each_summ)
      print(each_summ)

    # Move logits to CPU
    logits = logits.detach().cpu().numpy()
    print("Logits:{}".format(logits))

print("summary:{}".format(topsentences_summary))


Softmax:tensor([[0.6431, 0.3919],
        [0.6384, 0.3943],
        [0.6191, 0.3848],
        [0.6269, 0.3881],
        [0.6141, 0.3827],
        [0.6263, 0.3861]], device='cuda:0')
top_indices:tensor([[0, 1],
        [1, 0],
        [3, 3],
        [5, 5],
        [2, 2]], device='cuda:0')
CaMK4-dependent activation of AKT/mTOR and CREM-α underlies autoimmunity-associated Th17 imbalance  Here, we present evidence that the calcium/calmodulin-dependent protein kinase IV (CaMK4) is increased and required during Th17 cell differentiation. CaMK4 inhibition has potential as a therapeutic strategy for Th17-driven autoimmune diseases. CaMK4 inhibition significantly decreased the levels of glycolytic intermediates such as glucose-6-phosphate, fructose-6-phosphate, fructose-1,6-diphosphate, pyruvate, and lactate, whereas it did not affect the levels of the pentose phosphate pathway intermediates such as 6-Phospho-D-gluconate, ribulose-5-phosphate, ribose-5-phosphate and phosphoribosyl pyrophosp

In [59]:
qid_8b_5b = test8b_b5['id']
type_8b_5b = test8b_b5['type']
summaries_8b_5b = topsentences_summary

In [61]:
print(len(qid_8b_5b))
print(len(type_8b_5b))
print(len(summaries_8b_5b))

100
100
100


In [63]:
question_details = []
for i in range(len(qid_8b_5b)):
  dicti = {"id" : qid_8b_5b[i], "ideal_answer" : summaries_8b_5b[i], "exact_answer" : "yes"}
  question_details.append(dicti)

In [64]:
import json

x = {"questions" : question_details}

# Serializing json 
json_object = json.dumps(x, indent = 2)

with open('/content/drive/MyDrive/Thesis_B/BIOASQ_8b_batch5_results', 'w') as outfile:
    outfile.write(json_object)