### Installing Dependencies

In [None]:
!pip install --quiet  datasets #to access squad dataset
!pip install --quiet pyarrow   #to deal with parquet files for saving dataset if required
!pip install --quiet  tqdm     #for progress bars
!pip install --quiet transformers # for t5 model
!pip install --quiet tokenizers  #tokenizers from HuggingFace
!pip install --quiet sentencepiece #subword tokenizer used by T5
!pip install --quiet pytorch-lightning # pytorch wrapper
!pip install --quiet torchtext # text utilities

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.2/802.2 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m841.5/841.5 kB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Mount Google Drive into the runtime filesystem at /content/drive.
# NOTE(review): no later cell in this notebook appears to read from or write to
# the mounted path (the parquet files use relative paths) — confirm the mount
# is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Fetching Datasets

In [None]:
#imports
import pandas as pd
import torch
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from pprint import pprint
import copy

In [None]:
# Use the first CUDA GPU when one is visible; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [None]:
# Let pandas render up to 100 rows and 100 columns before truncating a frame.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

In [None]:
def create_pandas_dataset(data,
                          answer_threshold=7,
                          verbose = False):
  ''' Create a pandas DataFrame of short-answer rows from a Hugging Face dataset.

  Every element of ``data`` is read through ``val['context']['contexts']``
  (joined with single spaces), ``val['question']`` and
  ``val['final_decision']`` (converted to ``str``).

  NOTE: the column labels do not match their contents. The column labelled
  'question' holds the joined context passage and the column labelled
  'passage' holds the question text. This layout is kept on purpose because
  the dataset class and the evaluation loop in this notebook read the columns
  with the same swap; relabelling here alone would feed them the wrong text.

  Params:
        data: iterable of dataset rows with the keys described above.
        answer_threshold: rows whose answer has this many whitespace-separated
            words or more are skipped, so only short answers are kept.
        verbose: when True, also return the skipped and kept row counts.

  Returns:
        The DataFrame with columns ['question', 'answer', 'passage'], or the
        tuple (DataFrame, count_long, count_short) when ``verbose`` is True.
  '''
  count_long = 0
  # Collect rows in a list and build the frame once; appending to a DataFrame
  # row by row with .loc re-allocates on every insert.
  records = []
  for val in tqdm(data):
    passage = " ".join(val['context']['contexts'])
    question = val['question']
    answer = str(val['final_decision'])
    if len(answer.split()) >= answer_threshold:
      count_long += 1
      continue
    # Order matches the (mislabelled) column order documented above.
    records.append((passage, answer, question))

  result_df = pd.DataFrame(records, columns=['question', 'answer', 'passage'])
  count_short = len(records)

  if verbose:
    return (result_df,
            count_long,
            count_short)
  return result_df

In [None]:
# Training data: the 'train[:1000]' slice of the 'pqa_artificial' configuration.
train_dataset = load_dataset('pubmed_qa', 'pqa_artificial', split='train[:1000]')
# Validation data: the whole 'train' split of the 'pqa_labeled' configuration.
valid_dataset = load_dataset('pubmed_qa', 'pqa_labeled', split='train')
# Report how many rows each dataset holds.
print(f"Train Samples: {len(train_dataset)}")
print(f"Valid Samples: {len(valid_dataset)}")

Train Samples: 1000
Valid Samples: 1000


In [None]:
# Show the dataset summary (feature names and row count).
print(train_dataset)

Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
    num_rows: 1000
})


In [None]:
# Take the first validation row and print the three fields the model will use.
sample_validation_dataset = next(iter(valid_dataset))
# pprint (sample_validation_dataset)

# The context is stored as a list of passages; join them into one string.
context = " ".join(sample_validation_dataset['context']['contexts'])
question = sample_validation_dataset['question']
# 'final_decision' is the field used as the answer throughout this notebook.
answer = sample_validation_dataset['final_decision']
print('---------------'*9)
print('\nBreaking it Down\n')
print ("context:",context)
print ("question:",question)
print ("answer:",answer)

---------------------------------------------------------------------------------------------------------------------------------------

Breaking it Down

context: Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants. The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD)

In [None]:
# Convert both Hugging Face datasets to DataFrames using the default answer
# threshold, then print each frame's (rows, columns) shape.
df_train = create_pandas_dataset(train_dataset)
df_validation = create_pandas_dataset(valid_dataset)
print(f"\n Total Train Samples:{df_train.shape} , Total Validation Samples:{df_validation.shape}")


  0%|          | 0/1000 [00:00<?, ?it/s][A
  9%|▉         | 89/1000 [00:00<00:01, 889.54it/s][A
 18%|█▊        | 178/1000 [00:00<00:00, 867.02it/s][A
 26%|██▋       | 265/1000 [00:00<00:00, 865.81it/s][A
 35%|███▌      | 352/1000 [00:00<00:00, 859.37it/s][A
 44%|████▍     | 438/1000 [00:00<00:00, 849.00it/s][A
 52%|█████▏    | 523/1000 [00:00<00:00, 811.10it/s][A
 61%|██████    | 607/1000 [00:00<00:00, 818.38it/s][A
 69%|██████▉   | 692/1000 [00:00<00:00, 826.19it/s][A
 78%|███████▊  | 776/1000 [00:00<00:00, 829.22it/s][A
 86%|████████▌ | 860/1000 [00:01<00:00, 825.17it/s][A
100%|██████████| 1000/1000 [00:01<00:00, 833.80it/s]

  0%|          | 0/1000 [00:00<?, ?it/s][A
  9%|▉         | 89/1000 [00:00<00:01, 884.94it/s][A
 18%|█▊        | 178/1000 [00:00<00:00, 828.49it/s][A
 26%|██▋       | 264/1000 [00:00<00:00, 840.34it/s][A
 35%|███▍      | 349/1000 [00:00<00:00, 785.50it/s][A
 43%|████▎     | 429/1000 [00:00<00:00, 770.03it/s][A
 51%|█████     | 511/1000 [00:00<0


 Total Train Samples:(1000, 3) , Total Validation Samples:(1000, 3)





In [None]:
# Saving data for future use
# Persist both frames as parquet files in the working directory; the dataset
# class later in this notebook reads them back with pd.read_parquet.
df_train.to_parquet('train_pubmed.parquet')
df_validation.to_parquet('validation_pubmed.parquet')

# Creating a PyTorch Dataset for T5 Training and Validation

In [None]:
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [None]:
# Load the pretrained 't5-small' tokenizer (max length set to 512 tokens) and
# the matching seq2seq model that will be fine-tuned below.
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small',model_max_length=512)
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
class QuestionAnsweringDataset(Dataset):
    """Tokenized question-answering examples read from a parquet file.

    The file is expected to have the columns 'question', 'answer' and
    'passage'. As written by ``create_pandas_dataset`` in this notebook, the
    'question' column holds the context passage and the 'passage' column holds
    the question text; ``_build`` reads them accordingly.

    Every row is tokenized once in the constructor and padded/truncated to
    ``max_len_inp`` (input) and ``max_len_out`` (target) tokens.
    """

    def __init__(self, tokenizer, filepath, max_len_inp=512,max_len_out=96):
        """
        Params:
            tokenizer: callable tokenizer returning 'input_ids' and
                'attention_mask' tensors (a T5 tokenizer in this notebook).
            filepath: path of the parquet file to load.
            max_len_inp: token length of every encoded input.
            max_len_out: token length of every encoded target.
        """
        self.path = filepath
        self.passage_column = "passage"
        self.answer = "answer"
        self.question = "question"
        self.data = pd.read_parquet(self.path)

        self.max_len_input = max_len_inp
        self.max_len_output = max_len_out
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []
        self._build()

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return one example as a dict of 1-D tensors.

        Keys: 'source_ids', 'source_mask', 'target_ids', 'target_mask' and
        'labels' (a copy of 'target_ids' with padding replaced by -100).
        """
        # squeeze(0) removes only the leading batch dimension ([1, L] -> [L]);
        # a bare squeeze() would also collapse a length-1 sequence.
        source_ids = self.inputs[index]["input_ids"].squeeze(0)
        target_ids = self.targets[index]["input_ids"].squeeze(0)
        src_mask = self.inputs[index]["attention_mask"].squeeze(0)
        target_mask = self.targets[index]["attention_mask"].squeeze(0)

        # Use the tokenizer's own padding id; fall back to 0 (the value that
        # was previously hard-coded) when the tokenizer does not define one.
        pad_token_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_token_id is None:
            pad_token_id = 0

        # clone() so the stored target ids are not overwritten in place.
        labels = target_ids.clone()
        labels[labels == pad_token_id] = -100

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask,"labels":labels}

    def _build(self):
        """Tokenize every row of ``self.data`` into ``inputs`` and ``targets``."""
        for _, row in tqdm(self.data.iterrows(), total=len(self.data)):
            # Column labels are swapped in the stored frame (see class docstring):
            # 'passage' carries the question, 'question' carries the context.
            question_text = row[self.passage_column]
            context_text = row[self.question]
            answer_text = row[self.answer]

            # Input and target text formats used for this QA task.
            input_ = f"context: {context_text}  question: {str(question_text)}"
            target = f"answer: {answer_text}"

            # Calling the tokenizer directly is the current batch-encoding API.
            tokenized_inputs = self.tokenizer(
                [input_], max_length=self.max_len_input, padding='max_length',
                truncation=True, return_tensors="pt"
            )
            tokenized_targets = self.tokenizer(
                [target], max_length=self.max_len_output, padding='max_length',
                truncation=True, return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [None]:
# Build the tokenized datasets from the parquet files written earlier.
# NOTE: this rebinds `train_dataset`, which previously referred to the Hugging
# Face dataset; re-running the DataFrame-building cell after this one would
# therefore operate on the wrong object. The T5Tuner data loaders read
# `train_dataset` and `validation_dataset` as module-level names.
train_path = 'train_pubmed.parquet'
validation_path = 'validation_pubmed.parquet'
train_dataset = QuestionAnsweringDataset(t5_tokenizer,train_path)
validation_dataset = QuestionAnsweringDataset(t5_tokenizer,validation_path)


0it [00:00, ?it/s][A
45it [00:00, 442.01it/s][A
90it [00:00, 418.51it/s][A
134it [00:00, 423.05it/s][A
179it [00:00, 432.46it/s][A
223it [00:00, 425.10it/s][A
266it [00:00, 423.76it/s][A
309it [00:00, 425.58it/s][A
352it [00:00, 411.88it/s][A
394it [00:00, 409.27it/s][A
438it [00:01, 415.48it/s][A
485it [00:01, 430.20it/s][A
529it [00:01, 432.87it/s][A
573it [00:01, 427.17it/s][A
616it [00:01, 410.29it/s][A
658it [00:01, 409.03it/s][A
700it [00:01, 406.64it/s][A
741it [00:01, 401.22it/s][A
782it [00:01, 393.42it/s][A
822it [00:01, 390.42it/s][A
864it [00:02, 398.39it/s][A
904it [00:02, 386.37it/s][A
946it [00:02, 395.98it/s][A
1000it [00:02, 409.33it/s]

0it [00:00, ?it/s][A
46it [00:00, 453.58it/s][A
92it [00:00, 134.02it/s][A
136it [00:00, 194.18it/s][A
180it [00:00, 247.64it/s][A
223it [00:00, 290.56it/s][A
269it [00:01, 332.50it/s][A
315it [00:01, 365.83it/s][A
360it [00:01, 388.53it/s][A
405it [00:01, 405.41it/s][A
449it [00:01, 410.80it/s][A
493

In [None]:
# Data Sample
# Decode example 50 back to text to inspect the formatted input and target.
train_sample = train_dataset[50]
decoded_train_input = t5_tokenizer.decode(train_sample['source_ids'])
decoded_train_output = t5_tokenizer.decode(train_sample['target_ids'])

print(decoded_train_input)
print(decoded_train_output)

context: Mycobacterium abscessus has emerged as a major pathogen in cystic fibrosis (CF) patients and has been associated with poor clinical outcomes, particularly following lung transplant. We investigated the acquisition of this bacterium in a cohort of pediatric CF patients. Demographic and patient location data were used to uncover epidemiological links between patients with genetically related strains of M. abscessus that had been previously typed by variable-number tandem repeat profiling. Whole-genome sequencing was applied to 27 M. abscessus isolates from the 20 patients in this cohort to provide definitive data on the genetic relatedness of strains. Whole-genome sequencing data demonstrated that M. abscessus isolates from 16 patients were unrelated, differing by at least 34 single-nucleotide polymorphisms (SNPs) from any other isolate, suggesting that independent acquisition events have occurred. Only 2 clusters of very closely related (<unk> 25 SNPs) isolates from different p

# Fine-Tuning T5

In [None]:
import pytorch_lightning as pl
from torch.optim import AdamW
import argparse
from transformers import (
    get_linear_schedule_with_warmup
  )

class T5Tuner(pl.LightningModule):
    """LightningModule that fine-tunes a seq2seq model on the QA datasets.

    The data loaders read the module-level ``train_dataset`` and
    ``validation_dataset`` objects, so both must exist before ``fit`` runs.
    """

    def __init__(self,t5model, t5tokenizer,batchsize=4):
        """
        Params:
            t5model: the model to fine-tune; stored as ``self.model``.
            t5tokenizer: the matching tokenizer; stored as ``self.tokenizer``.
            batchsize: batch size used by both data loaders.
        """
        super().__init__()
        self.model = t5model
        self.tokenizer = t5tokenizer
        self.batch_size = batchsize

    def forward( self, input_ids, attention_mask=None,
                decoder_attention_mask=None,
                lm_labels=None):
        """Run the wrapped model; ``lm_labels`` is passed on as ``labels``."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )

    def _batch_loss(self, batch):
        """Forward one batch from the dataset and return its loss."""
        outputs = self.forward(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_attention_mask=batch['target_mask'],
            lm_labels=batch['labels']
        )
        # The first element of the model output is used as the loss.
        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Compute, log ('train_loss') and return the training loss."""
        loss = self._batch_loss(batch)
        self.log('train_loss',loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log ('val_loss') and return the validation loss."""
        loss = self._batch_loss(batch)
        self.log("val_loss",loss)
        return loss

    def train_dataloader(self):
        """Loader over the module-level ``train_dataset``."""
        # Shuffle so every epoch sees the examples in a different order;
        # DataLoader iterates sequentially by default.
        return DataLoader(train_dataset, batch_size=self.batch_size,
                          shuffle=True,
                          num_workers=2)

    def val_dataloader(self):
        """Loader over the module-level ``validation_dataset`` (fixed order)."""
        return DataLoader(validation_dataset,
                          batch_size=self.batch_size,
                          num_workers=2)

    def configure_optimizers(self):
        """AdamW over all parameters with a fixed learning rate of 1e-4."""
        optimizer = AdamW(self.parameters(), lr=1e-4)
        return optimizer

In [None]:
# Wrap the pretrained model and fine-tune it for 5 epochs.
model = T5Tuner(t5_model,t5_tokenizer)
# accelerator='auto' uses the GPU when one is available and falls back to CPU
# otherwise, instead of failing on machines without CUDA.
trainer = pl.Trainer(max_epochs = 5, accelerator='auto')
trainer.fit(model)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


### Exporting Model

In [None]:
from huggingface_hub import notebook_login

# Prompts for a Hugging Face access token interactively.
# SECURITY: never paste a token into the notebook source. The token that was
# previously recorded in this cell must be treated as leaked and revoked.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the fine-tuned weights (the wrapped model inside the T5Tuner) and the
# tokenizer to the same Hub repository name.
model.model.push_to_hub("t5_small_pubmed_qa_artificial_finetuned")
t5_tokenizer.push_to_hub("t5_small_pubmed_qa_artificial_finetuned")

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ankitgu3/t5_small_pubmed_qa_artificial_finetuned/commit/af0aae5934c0d55392865a9469aba90bd3d1ed23', commit_message='Upload tokenizer', commit_description='', oid='af0aae5934c0d55392865a9469aba90bd3d1ed23', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Preview the training frame. Note that the 'question' column holds the context
# passage and the 'passage' column holds the question text (labels are swapped
# by create_pandas_dataset).
df_train.head()

Unnamed: 0,question,answer,passage
0,Chronic rhinosinusitis (CRS) is a heterogeneou...,yes,Are group 2 innate lymphoid cells ( ILC2s ) in...
1,Phosphatidylethanolamine N-methyltransferase (...,yes,Does vagus nerve contribute to the development...
2,Psammaplin A (PsA) is a natural product isolat...,yes,Does psammaplin A induce Sirtuin 1-dependent a...
3,This study examined links between DNA methylat...,yes,Is methylation of the FGFR2 gene associated wi...
4,Tumor microenvironment immunity is associated ...,yes,Do tumor-infiltrating immune cell profiles and...


## Evaluation

In [None]:
# Exact-match accuracy of the fine-tuned model on the validation frame.
# Results are left in `numCorrect` and `numTotal` for the cells below.

# Use the GPU when available instead of assuming one exists.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model once, outside the loop, and switch to eval mode so dropout is
# disabled and generation is deterministic.
model = model.to(device)
model.eval()

numCorrect = 0
numTotal = 0
for _, row in df_validation.iterrows():
  # Column labels are swapped in df_validation: 'passage' holds the question
  # text and 'question' holds the context passage.
  question = row["passage"]
  context = row["question"]
  answer = row["answer"]

  text = "context: "+context + " " + "question: " + question
  encoding = t5_tokenizer(text, max_length=512, padding='max_length',
                          truncation=True, return_tensors="pt").to(device)

  beam_outputs = model.model.generate(
      input_ids=encoding["input_ids"],
      attention_mask=encoding["attention_mask"],
      max_length=72,  # upper bound on the length of the generated answer
      early_stopping=True,
      num_beams=5,
      num_return_sequences=1
  )
  result = t5_tokenizer.decode(beam_outputs[0], skip_special_tokens=True,clean_up_tokenization_spaces=True)

  # Progress marker every 100 examples.
  if numTotal % 100 == 0:
    print(numTotal)

  # Training targets have the form "answer: <label>". Strip that prefix and
  # compare the remainder exactly; a substring test would also accept outputs
  # that merely contain the label inside another word or phrase.
  prediction = result.strip()
  if prediction.startswith("answer:"):
    prediction = prediction[len("answer:"):].strip()
  if prediction == str(answer).strip():
    numCorrect += 1
  numTotal += 1

0
100
200
300
400
500
600
700
800
900


In [None]:
# Correct predictions and total evaluated examples from the loop above.
print(numCorrect, numTotal)

552 1000


In [None]:
# Accuracy as a fraction of evaluated examples.
# NOTE: raises ZeroDivisionError if the evaluation loop processed no rows.
print(numCorrect/numTotal)

0.552
