<h1>Answer Span Extraction</h1>



In [None]:
!pip install transformers=="4.25.1" sentencepiece=="0.1.97" utoken=="0.1.8" nltk=="3.8.1" datasets=="2.8.0" torch=="1.13.1+cu116" numpy=="1.21.6" seqeval=="1.2.2" pytorch_lightning=="1.9.0" tqdm=="4.64.1" --quiet

In [None]:
# import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
import random
import numpy as np
import torch
def set_seed(seed):
  """Seed every random number generator used in this notebook.

  Seeds Python's ``random`` module, NumPy's global generator and PyTorch.
  When CUDA is available, all CUDA devices are seeded as well.

  Args:
    seed: Integer seed passed unchanged to each library.
  """
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed_all(seed)


# Fix the seed once, before any model or data loader is created.
set_seed(42)

# Span Selection with T5

In [None]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    MT5ForConditionalGeneration,
    T5ForConditionalGeneration,
    T5Tokenizer,
    AutoTokenizer,
    get_linear_schedule_with_warmup
)

2023-01-10 11:19:24.710677: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-10 11:19:25.004054: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-01-10 11:19:25.079914: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-01-10 11:19:26.001669: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: li

# Model

Most of the code here is adapted from [this notebook](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb), which uses the PyTorch Lightning framework for training neural networks. T5 has been shown to achieve state-of-the-art results on many tasks, as long as the task can be cast as a text-to-text problem.

In [None]:
class T5FineTuner(pl.LightningModule):
    """LightningModule that fine-tunes a T5 checkpoint as a text-to-text model.

    The model and tokenizer are loaded from ``hparam.model_name_or_path``.
    The following attributes are read from ``hparam``: ``weight_decay``,
    ``learning_rate``, ``adam_epsilon``, ``warmup_steps``,
    ``train_batch_size``, ``eval_batch_size``, ``n_gpu``,
    ``gradient_accumulation_steps`` and ``num_train_epochs``.

    The train and validation datasets come from the module-level
    ``get_dataset`` function, which must exist by the time the dataloaders
    are requested.
    """

    def __init__(self, hparam):
        """Load the pretrained model and tokenizer named by ``hparam``.

        Args:
            hparam: Namespace-like object holding the attributes listed in the
                class docstring.
        """
        super(T5FineTuner, self).__init__()
        self.hparam = hparam

        self.model = T5ForConditionalGeneration.from_pretrained(
            hparam.model_name_or_path)
        self.tokenizer = AutoTokenizer.from_pretrained(
            hparam.model_name_or_path
        )
        # Called without arguments: Lightning records the constructor
        # arguments (here the single ``hparam`` object) for checkpointing.
        self.save_hyperparameters()

    def is_logger(self):
        """Tell callbacks whether this module should report metrics (always True)."""
        return True

    def forward(
        self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None
    ):
        """Run the wrapped T5 model; ``lm_labels`` is forwarded as ``labels``."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )

    def _step(self, batch):
        """Compute the loss for one batch.

        Args:
            batch: Mapping with ``source_ids``, ``source_mask``,
                ``target_ids`` and ``target_mask`` tensors.

        Returns:
            The first element of the model output, i.e. the loss.
        """
        # Work on a copy so the caller's batch keeps its original pad ids.
        lm_labels = batch["target_ids"].clone()
        # Padding positions are set to -100, the label value Hugging Face
        # models skip when computing the loss.
        lm_labels[lm_labels == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            lm_labels=lm_labels,
            decoder_attention_mask=batch['target_mask']
        )

        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._step(batch)
        # Use self.log so the value actually reaches the logger; a returned
        # "log" entry alone is not forwarded by the installed Lightning version.
        self.log("train_loss", loss)
        return {"loss": loss}

    def training_epoch_end(self, outputs):
        """Log the mean training loss over the finished epoch."""
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        self.log("avg_train_loss", avg_train_loss)

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss for one batch."""
        loss = self._step(batch)
        return {"val_loss": loss}

    def validation_epoch_end(self, outputs):
        """Log the mean validation loss so callbacks can read ``val_loss``."""
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        self.log("val_loss", avg_loss, prog_bar=True)

    def configure_optimizers(self):
        """Create the AdamW optimizer.

        Parameters whose name contains ``bias`` or ``LayerNorm.weight`` get no
        weight decay; all others use ``hparam.weight_decay``. The optimizer is
        also stored on ``self.opt`` because ``train_dataloader`` builds the
        learning-rate scheduler from it.
        """
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparam.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.hparam.learning_rate, eps=self.hparam.adam_epsilon)
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self,
                       epoch=None,
                       batch_idx=None,
                       optimizer=None,
                       optimizer_idx=None,
                       optimizer_closure=None,
                       on_tpu=None,
                       using_native_amp=None,
                       using_lbfgs=None
                       ):
        """Step the optimizer, clear gradients and advance the LR scheduler.

        The scheduler is stepped manually because it is created in
        ``train_dataloader`` and not returned from ``configure_optimizers``.
        """
        optimizer.step(closure=optimizer_closure)
        optimizer.zero_grad()
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        """Return the formatted loss and the current learning rate.

        NOTE(review): nothing in this notebook calls this method, and it
        reads ``self.trainer.avg_loss`` — confirm that attribute exists on
        the installed Lightning version before relying on it.
        """
        tqdm_dict = {"loss": "{:.3f}".format(
            self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict

    def train_dataloader(self):
        """Build the training DataLoader and the linear warmup/decay scheduler.

        The total number of optimizer steps is derived from the dataset size,
        the effective batch size, gradient accumulation and the epoch count.
        """
        train_dataset = get_dataset(
            tokenizer=self.tokenizer, type_path="train", args=self.hparam)
        dataloader = DataLoader(train_dataset, batch_size=self.hparam.train_batch_size,
                                drop_last=True, shuffle=True, num_workers=2)
        t_total = (
            (len(dataloader.dataset) //
             (self.hparam.train_batch_size * max(1, self.hparam.n_gpu)))
            // self.hparam.gradient_accumulation_steps
            * float(self.hparam.num_train_epochs)
        )
        # self.opt is set by configure_optimizers, which must have run first.
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparam.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        """Build the validation DataLoader (no shuffling)."""
        val_dataset = get_dataset(
            tokenizer=self.tokenizer, type_path="validation", args=self.hparam)
        return DataLoader(val_dataset, batch_size=self.hparam.eval_batch_size, num_workers=2)

In [None]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  """Callback that reports ``trainer.callback_metrics`` after validation and test.

  Metrics are written to the module-level ``logger``. After testing they are
  also saved to ``test_results.txt`` inside the configured output directory.
  The keys ``"log"`` and ``"progress_bar"`` are never reported.
  """

  _SKIPPED_KEYS = ("log", "progress_bar")

  @staticmethod
  def _metric_lines(trainer):
    """Return one formatted line per reportable metric, sorted by key."""
    metrics = trainer.callback_metrics
    return [
        "{} = {}\n".format(key, str(metrics[key]))
        for key in sorted(metrics)
        if key not in LoggingCallback._SKIPPED_KEYS
    ]

  @staticmethod
  def _resolve_output_dir(pl_module):
    """Find ``output_dir`` in the module's hyperparameters.

    ``pl_module.hparams.output_dir`` is tried first. The module may instead
    keep its whole config object under a single hyperparameter key, or only
    on a plain ``hparam`` attribute, so those places are checked too.

    Raises:
      AttributeError: if no ``output_dir`` can be found.
    """
    hparams = getattr(pl_module, "hparams", None)
    candidates = [hparams]
    if isinstance(hparams, dict):
      candidates.extend(hparams.values())
    candidates.append(getattr(pl_module, "hparam", None))

    for candidate in candidates:
      if isinstance(candidate, dict):
        output_dir = candidate.get("output_dir")
      else:
        output_dir = getattr(candidate, "output_dir", None)
      if output_dir is not None:
        return output_dir
    raise AttributeError("Could not find 'output_dir' in the module's hyperparameters")

  def on_validation_end(self, trainer, pl_module):
    """Log every reportable metric after a validation run."""
    logger.info("***** Validation results *****")
    if pl_module.is_logger():
      for line in self._metric_lines(trainer):
        logger.info(line)

  def on_test_end(self, trainer, pl_module):
    """Log every reportable metric after testing and save them to a file."""
    logger.info("***** Test results *****")

    if pl_module.is_logger():
      output_dir = self._resolve_output_dir(pl_module)
      # The directory may not exist yet; create it before writing.
      os.makedirs(output_dir, exist_ok=True)
      output_test_results_file = os.path.join(output_dir, "test_results.txt")
      with open(output_test_results_file, "w") as writer:
        for line in self._metric_lines(trainer):
          logger.info(line)
          writer.write(line)

In [None]:
# Hyper-parameters and paths shared by the model, the data loaders and the trainer.
args_dict = {
    # Paths and pretrained checkpoints.
    "data_dir": "./",  # path for data files
    "output_dir": "./tmp",  # path to save the checkpoints
    "model_name_or_path": 'csebuetnlp/banglat5',
    "tokenizer_name_or_path": 'csebuetnlp/banglat5',
    # Tokenization.
    "max_seq_length": 256,
    # Optimizer and schedule.
    "learning_rate": 3e-5,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    # Batching and training length.
    "train_batch_size": 8,
    "eval_batch_size": 8,
    "num_train_epochs": 3,
    "gradient_accumulation_steps": 1,
    "n_gpu": 1,
    "early_stop_callback": False,
    # Mixed precision: to enable 16-bit training, install apex and set fp_16 to True.
    "fp_16": False,
    # Optimisation levels are described at https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "opt_level": 'O1',
    # With 16-bit training set this to a sensible value; 0.5 is a good default.
    "max_grad_norm": 1,
    "seed": 42,
}

In [None]:
# Take the tokenizer name from the shared config so it cannot drift from the
# checkpoint configured in args_dict.
tokenizer = AutoTokenizer.from_pretrained(args_dict["tokenizer_name_or_path"])

In [None]:
import json
from pathlib import Path
import torch
from torch.utils.data import DataLoader
import time
import numpy as np

In [None]:
class SquadDataset(Dataset):
  """Pre-tokenized (context, answer) pairs for text-to-text training.

  Every record of ``dataset[type_path]`` must provide ``"context"`` and
  ``"answer"`` strings. Both are lower-cased, then padded or truncated to a
  fixed length and tokenized once, at construction time.

  Args:
    tokenizer: Object exposing ``batch_encode_plus``.
    dataset: Mapping from split name to an indexable collection of records.
    type_path: Name of the split to use.
    max_len_context: Fixed token length of the encoded context.
    max_len_ans: Fixed token length of the encoded answer.
  """

  def __init__(self, tokenizer, dataset, type_path, max_len_context=128,max_len_ans=30):
    self.tokenizer = tokenizer
    self.data = dataset[type_path]
    self.max_len_context = max_len_context
    self.max_len_ans = max_len_ans
    self.inputs = []
    self.targets = []

    self._build()

  def __len__(self):
    """Number of encoded examples."""
    return len(self.inputs)

  def __getitem__(self, index):
    """Return ids and attention masks of example ``index`` with the batch axis squeezed out."""
    source = self.inputs[index]
    target = self.targets[index]
    return {
        "source_ids": source["input_ids"].squeeze(),
        "source_mask": source["attention_mask"].squeeze(),
        "target_ids": target["input_ids"].squeeze(),
        "target_mask": target["attention_mask"].squeeze(),
    }

  def _encode(self, text, max_length):
    """Tokenize one string into fixed-length PyTorch tensors."""
    return self.tokenizer.batch_encode_plus(
        [text], max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
    )

  def _build(self):
    """Tokenize every record and fill ``self.inputs`` / ``self.targets``."""
    for idx in range(len(self.data)):
      record = self.data[idx]
      context_text = record["context"].lower()
      answer_text = record["answer"].lower()

      encoded_context = self._encode(context_text, self.max_len_context)
      encoded_answer = self._encode(answer_text, self.max_len_ans)

      self.inputs.append(encoded_context)
      self.targets.append(encoded_answer)

In [None]:
path = Path('./squad1_data/squad1_translated_final_aligned.json')

# Open .json file
with open(path, 'rb') as f:
    squad_dict = json.load(f)

train = {'data': []}

# The first 400 articles form the training split. Slicing (rather than
# indexing up to 400) also works when the file holds fewer articles.
for group in squad_dict['data'][:400]:
    for passage in group['paragraphs']:
        context_list = passage['bangla_context_list']
        for qa in passage['qas']:
            if not qa['answers']:
                # Nothing to extract for a question without answers.
                continue
            # Only the first answer is used; its index selects the sentence that contains it.
            answer = qa['answers'][0]
            context = context_list[answer['index_c_tran_with_ans']]
            answer_text = answer['a_tran']
            # Keep well-aligned, non-empty answers that occur verbatim in the sentence.
            if answer['align_score'] >= 0.5 and answer_text in context and len(answer_text) != 0:
                train['data'].append({'id': qa['id'], 'context': context, 'answer': answer_text})

# The context manager closes the file even if serialization fails.
with open('./train_ase_sq1.json', "w") as out_file:
    json.dump(train, out_file, indent = 4)

In [None]:
val = {'data': []}

# Articles from index 400 onwards form the validation split.
# NOTE: this cell reads `squad_dict` as loaded from the training file; a later
# cell rebinds `squad_dict` to the dev file, so run the cells in order.
for group in squad_dict['data'][400:]:
    for passage in group['paragraphs']:
        context_list = passage['bangla_context_list']
        for qa in passage['qas']:
            if not qa['answers']:
                # Nothing to extract for a question without answers.
                continue
            # Only the first answer is used; its index selects the sentence that contains it.
            answer = qa['answers'][0]
            context = context_list[answer['index_c_tran_with_ans']]
            answer_text = answer['a_tran']
            # Keep well-aligned, non-empty answers that occur verbatim in the sentence.
            if answer['align_score'] >= 0.5 and answer_text in context and len(answer_text) != 0:
                val['data'].append({'id': qa['id'], 'context': context, 'answer': answer_text})

# The context manager closes the file even if serialization fails.
with open('./val_ase_sq1.json', "w") as out_file:
    json.dump(val, out_file, indent = 4)

In [None]:
path = Path('./squad1_data/squad1_dev_translated_final_aligned.json')

# Open .json file
with open(path, 'rb') as f:
    squad_dict = json.load(f)

test = {'data': []}

# Every article of the dev file goes into the test split.
for group in squad_dict['data']:
    for passage in group['paragraphs']:
        context_list = passage['bangla_context_list']
        for qa in passage['qas']:
            if not qa['answers']:
                # Nothing to extract for a question without answers.
                continue
            # Only the first answer is used; its index selects the sentence that contains it.
            answer = qa['answers'][0]
            context = context_list[answer['index_c_tran_with_ans']]
            answer_text = answer['a_tran']
            # Keep well-aligned, non-empty answers that occur verbatim in the sentence.
            if answer['align_score'] >= 0.5 and answer_text in context and len(answer_text) != 0:
                test['data'].append({'id': qa['id'], 'context': context, 'answer': answer_text})

# The context manager closes the file even if serialization fails.
with open('./test_ase_sq1.json', "w") as out_file:
    json.dump(test, out_file, indent = 4)

In [None]:
from datasets import load_dataset, load_metric

# One JSON file per split; the records of each file live under its top-level "data" key.
data_files = {
    "train": "./train_ase_sq1.json",
    "validation": "./val_ase_sq1.json",
    "test": "./test_ase_sq1.json",
}

# Loader name handed to load_dataset: the file suffix of the train split.
extension = data_files["train"].split(".")[-1]

raw_datasets = load_dataset(extension, data_files=data_files, field="data")

Using custom data configuration default-77746c7d4018536c
Found cached dataset json (/home/dlpc01/.cache/huggingface/datasets/json/default-77746c7d4018536c/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
dataset = raw_datasets

In [None]:
dataset['train'][0]["answer"]

In [None]:
print(dataset)

In [None]:
input_dataset = SquadDataset(tokenizer=tokenizer, dataset=dataset, type_path='train')

In [None]:
test_dataset = SquadDataset(tokenizer=tokenizer, dataset=dataset, type_path='test')

In [None]:
dataset['test'][0]

In [None]:
data = input_dataset[0]

print(tokenizer.decode(data["source_ids"], skip_special_tokens=False))
print(tokenizer.decode(data["target_ids"], skip_special_tokens=False))

In [None]:
!mkdir -p t5_ner

In [None]:
args = argparse.Namespace(**args_dict)
model = T5FineTuner(args)

In [None]:
# checkpoint_callback = pl.callbacks.ModelCheckpoint(
#     filename=args.output_dir+"/checkpoint.pth", monitor="val_loss", mode="min", save_top_k=5
# )

# Keyword arguments for pl.Trainer, all taken from `args`.
# Early stopping and the apex `amp_level` option are intentionally not passed.
train_params = {
    "accumulate_grad_batches": args.gradient_accumulation_steps,
    "gpus": args.n_gpu,
    "max_epochs": args.num_train_epochs,
    # 16-bit precision only when explicitly requested in the config.
    "precision": 16 if args.fp_16 else 32,
    "gradient_clip_val": args.max_grad_norm,
    "callbacks": [LoggingCallback()],
}

In [None]:
def get_dataset(tokenizer, type_path, args):
    """Load the JSON splits from disk and wrap one of them in a ``SquadDataset``.

    Args:
        tokenizer: Tokenizer used to encode the records. Its ``max_length``
            and ``model_max_length`` attributes are set to
            ``args.max_seq_length`` as a side effect.
        type_path: Split to return: ``"train"``, ``"validation"`` or ``"test"``.
        args: Object providing ``max_seq_length``.

    Returns:
        A ``SquadDataset`` over the requested split.

    NOTE(review): ``SquadDataset`` is built with its own default lengths;
    ``args.max_seq_length`` is only applied to the tokenizer attributes —
    confirm this is intended.
    """
    for attr_name in ("max_length", "model_max_length"):
        setattr(tokenizer, attr_name, args.max_seq_length)

    split_files = {
        "train": "./train_ase_sq1.json",
        "validation": "./val_ase_sq1.json",
        "test": "./test_ase_sq1.json",
    }
    # Loader name is the file suffix of the train split.
    loader_name = split_files["train"].split(".")[-1]
    splits = load_dataset(loader_name, data_files=split_files, field="data")

    return SquadDataset(tokenizer=tokenizer, dataset=splits, type_path=type_path)

In [None]:
trainer = pl.Trainer(**train_params)

In [None]:
trainer.fit(model)

## Load the Stored Model and Evaluate

In [None]:
# Replace the in-memory model with the weights stored in a checkpoint file.
# NOTE(review): the path names one specific run directory ("version_3") and
# step count; point it at the checkpoint of the run you want to evaluate.
model = model.load_from_checkpoint("./lightning_logs/version_3/checkpoints/epoch=2-step=26811.ckpt")

In [None]:
def sentence_tokenizer(text: str) -> list:
    """Split ``text`` into sentences at Bangla and Latin sentence terminators.

    A sentence ends at any of ``৷``, ``|``, ``।``, ``?`` or ``!``. The
    terminator stays attached to its sentence, and surrounding whitespace is
    stripped. Text after the last terminator is returned as a final sentence
    rather than dropped.

    Args:
        text: The passage to split.

    Returns:
        The list of sentences. If ``text`` contains no terminator, the list
        holds ``text`` itself, unchanged.
    """
    terminators = {"৷", "|", "।", "?", "!"}
    sentences = []
    start = 0
    # Single pass: cut right after each terminator.
    for pos, char in enumerate(text):
        if char in terminators:
            sentences.append(text[start:pos + 1].strip())
            start = pos + 1

    if not sentences:
        return [text]

    # Keep a trailing fragment that has no closing terminator.
    remainder = text[start:].strip()
    if remainder:
        sentences.append(remainder)
    return sentences

In [None]:
# Read the contexts to annotate. The encoding is explicit because the file
# holds Bangla text and the platform default may not be UTF-8.
with open("./BARDContexts.json", encoding="utf-8") as f:
    text_data = json.load(f)

In [None]:
from tqdm.notebook import tqdm

# Use the GPU when one is visible; otherwise fall back to the CPU instead of
# failing on .to("cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"
model.model.eval()
model = model.to(device)

In [None]:
# For every context: split it into sentences, generate one candidate answer per
# sentence, and keep the candidates that occur verbatim in their own sentence.
for data in tqdm(text_data['data']):
    context_list = sentence_tokenizer(data['context'])
    tokenized_inputs = tokenizer(context_list, max_length=256, padding="max_length", truncation=True, return_tensors="pt")

    tokenized_inputs = tokenized_inputs.to(device)
    outs = model.model.generate(input_ids=tokenized_inputs['input_ids'],
                            attention_mask=tokenized_inputs['attention_mask'])
    dec = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip() for ids in outs]

    data['sent_list'] = context_list
    # An empty generation is a substring of every sentence, so it is rejected
    # explicitly; the training data was filtered for non-empty answers as well.
    data['answers'] = [
        {'text': answer_text, 'sent_with_ans': sent_idx}
        for sent_idx, (sentence, answer_text) in enumerate(zip(context_list, dec))
        if answer_text and answer_text in sentence
    ]
                                      

  0%|          | 0/20000 [00:00<?, ?it/s]

In [None]:
# Save the contexts together with their sentence lists and extracted answers.
# The context manager closes the file even if serialization fails.
with open('./BARDContextandAnswer.json', "w") as final_out_file:
    json.dump(text_data, final_out_file, indent = 4)

In [None]:
print(text_data['data'][49])

{'title': 'state', 'context': 'চাঁপাইনবাবগঞ্জের শিবগঞ্জ উপজেলার কানসাট গোপালনগর এলাকায় ভাড়াবাসা থেকে গতকাল সোমবার ভারতীয় এক লাখ জাল রুপিসহ এক ব্যক্তিকে গ্রেপ্তার করেছে র্যাব-৫। তাঁর নাম মো. মাইদুর (৩৩)।মাইদুর খড়কপুর নিমতলা এলাকার বাসিন্দা। ব্যবসা সূত্রে তিনি গোলাপনগর এলাকায় থাকতেন।চাঁপাইনবাবগঞ্জ র্যাব-৫ ক্যাম্পের সহকারী পুলিশ সুপার (এএসপি) নুরে আলম বলেন, গোপন সংবাদের ভিত্তিতে দুপুর সাড়ে ১২টার দিকে গোপালনগর মোড়ের রবিউল ইসলামের ভাড়াবাড়িতে অভিযান চালিয়ে মাইদুরকে গ্রেপ্তার করা হয়। এ ঘটনায় শিবগঞ্জ থানায় মামলা হয়েছে।', 'sent_list': ['চাঁপাইনবাবগঞ্জের শিবগঞ্জ উপজেলার কানসাট গোপালনগর এলাকায় ভাড়াবাসা থেকে গতকাল সোমবার ভারতীয় এক লাখ জাল রুপিসহ এক ব্যক্তিকে গ্রেপ্তার করেছে র্যাব-৫।', 'তাঁর নাম মো. মাইদুর (৩৩)।', 'মাইদুর খড়কপুর নিমতলা এলাকার বাসিন্দা।', 'ব্যবসা সূত্রে তিনি গোলাপনগর এলাকায় থাকতেন।', 'চাঁপাইনবাবগঞ্জ র্যাব-৫ ক্যাম্পের সহকারী পুলিশ সুপার (এএসপি) নুরে আলম বলেন, গোপন সংবাদের ভিত্তিতে দুপুর সাড়ে ১২টার দিকে গোপালনগর মোড়ের রবিউল ইসলামের ভাড়াবাড়িতে অভিযান চালিয়ে মাইদুরকে গ্রেপ্তার করা হয়।