### Model Call

In [8]:
# Mount Google Drive at /content/drive so that the files stored under it
# (see the `base` path configured in the next cell) are readable from Colab.

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# `base` is the folder that contains the saved model checkpoint and the
# BanglaRQA test and validation files. Set it to the folder where you
# uploaded/saved those files.
base = '/content/drive/MyDrive/BQA'       # sample value

# `test_file_name` is the file name of the BanglaRQA test set inside `base`.
test_file_name = 'Test.json' # sample value

# `validation_file_name` is the file name of the BanglaRQA validation set
# inside `base`.
validation_file_name = 'Validation.json'      # sample value


# `model_name` is the file name of the saved model checkpoint (inside `base`)
# that you want to evaluate.
# Example of another checkpoint name: 'model_weights_epoch_10.pth'
model_name = 'banglaT5_model_weights_epoch_14.pth'  

In [10]:
!pip install transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
pip install git+https://github.com/csebuetnlp/normalizer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/csebuetnlp/normalizer
  Cloning https://github.com/csebuetnlp/normalizer to /tmp/pip-req-build-av4slhv3
  Running command git clone -q https://github.com/csebuetnlp/normalizer /tmp/pip-req-build-av4slhv3


In [12]:
import torch

In [13]:
# Instantiate the pretrained BanglaT5 sequence-to-sequence model and its
# tokenizer by name ("csebuetnlp/banglat5"). The fine-tuned weights are loaded
# into `model` from the checkpoint in a later cell.

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from normalizer import normalize # pip install git+https://github.com/csebuetnlp/normalizer

model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/banglat5")
# use_fast=False requests the slow (non-"fast") tokenizer implementation
tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/banglat5", use_fast=False)

In [14]:
from transformers import AdamW

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# The optimizer is re-created only so that its saved state can be restored from
# the checkpoint; nothing in the evaluation cells below steps it.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Restore the fine-tuned parameters for evaluation.
# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded on a
# CPU-only runtime as well; without it torch.load raises in that situation.
# The model itself is moved to `device` right after the weights are restored.
checkpoint = torch.load(base + '/' + model_name, map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

# Move the model to the detected device and switch it to inference mode
# (model.eval() stays last so the cell still displays the module summary).
model.to(device)

model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [15]:
from collections import Counter

# these functions are heavily influenced by the HF squad_metrics.py script
def normalize_text(s):
    """Normalize an answer string before it is compared.

    The text is lower-cased, every character in `string.punctuation` is
    dropped, the standalone English words "a", "an" and "the" are replaced by
    a space, and runs of whitespace are collapsed to single spaces.
    """
    import re
    import string

    punctuation = set(string.punctuation)

    # The order matters: lower-case -> strip punctuation -> blank out
    # articles -> squeeze whitespace.
    text = s.lower()
    text = "".join(filter(lambda ch: ch not in punctuation, text))
    text = re.sub(r"\b(a|an|the)\b", " ", text, flags=re.UNICODE)
    return " ".join(text.split())

def compute_exact_match(prediction, truth):
    """Return 1 if prediction and truth are equal after `normalize_text`, else 0."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(truth)
    return 1 if normalized_prediction == normalized_truth else 0

def compute_f1(prediction, truth):
    """Token-level F1 between a predicted answer and a reference answer.

    Both strings go through `normalize_text` and are split on whitespace.
    When either side has no tokens the result is 1 if both are empty and 0
    otherwise. In all other cases precision and recall are computed from the
    multiset overlap of the tokens and their harmonic mean is returned
    (0 when no token is shared).
    """
    predicted = normalize_text(prediction).split()
    reference = normalize_text(truth).split()

    # No-answer case: agree -> 1, disagree -> 0
    if not predicted or not reference:
        return int(predicted == reference)

    # Counter intersection keeps the minimum count of each shared token
    overlap = sum((Counter(predicted) & Counter(reference)).values())
    if overlap == 0:
        return 0

    precision = overlap / len(predicted)
    recall = overlap / len(reference)
    return 2 * (precision * recall) / (precision + recall)


In [16]:
import numpy as np
from scipy.optimize import linear_sum_assignment

def get_updated_f1(prediction, truth):
  """F1 for answers that consist of several ';'-separated spans.

  Both strings are split on ';'. `compute_f1` is evaluated for every
  (gold span, predicted span) pair, the spans are matched one-to-one so that
  the summed F1 is maximal, and the matched scores are averaged over
  max(number of gold spans, number of predicted spans), so every span left
  without a partner contributes 0 to the mean.
  """
  gold_spans = truth.split(';')
  predicted_spans = prediction.split(';')

  # scores[g, p] = token-level F1 of predicted span p against gold span g
  scores = np.array(
      [[compute_f1(candidate, gold) for candidate in predicted_spans]
       for gold in gold_spans],
      dtype=float,
  )

  # linear_sum_assignment minimizes cost, so the scores are negated to get
  # the assignment with the highest total F1
  gold_idx, pred_idx = linear_sum_assignment(-scores)

  # One slot per span on the longer side; unmatched slots stay at 0
  best = np.zeros(max(len(gold_spans), len(predicted_spans)))
  best[gold_idx] = np.maximum(best[gold_idx], scores[gold_idx, pred_idx])

  return np.mean(best)

#get_updated_f1("aaaa qqqq; sdfs fsf; fssfsfs", "aaaa; fsr qqqq; ooooo; a");

In [17]:
import numpy

def get_answer(question,context):
  """Generate the model's answer for one question/context pair.

  The question and the context are normalized and tokenized as a pair
  (padded to 1024 tokens; only the context is truncated), `model.generate`
  is run on the encoded input, and the generated ids are decoded.

  Returns a tuple `(answer, generated_ids)`: the decoded sequences joined
  into one string, and the raw output of `model.generate`.
  """
  encoded = tokenizer(
    normalize(question),
    normalize(context),
    max_length=1024,
    padding="max_length",
    truncation="only_second",
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt",
  )
  encoded = encoded.to(device)

  generation_options = dict(
      num_beams=1,
      max_length=256,
      repetition_penalty=2.5,
      length_penalty=1.0,
      early_stopping=True,
      use_cache=True,
  )
  generated_ids = model.generate(
      input_ids=encoded["input_ids"],
      attention_mask=encoded["attention_mask"],
      **generation_options,
  )

  decoded = []
  for sequence in generated_ids:
    decoded.append(
        tokenizer.decode(
            sequence,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
    )

  return "".join(decoded), generated_ids

### BanglaRQA Validation

In [18]:
# loading the validation dataset for calculating the EM and F1 scores on BanglaRQA's validation set

import json
import os

# Read the validation file. The context manager closes the file handle, and the
# explicit UTF-8 encoding keeps reading the Bangla text independent of the
# runtime's default locale encoding.
with open(os.path.join(base, validation_file_name), encoding='utf-8') as f:
    data_val = json.load(f)

# Keep only the list stored under the top-level 'data' key
data_val = data_val['data']

In [19]:
# Flatten the paragraph -> questions structure into parallel lists that hold
# one entry per question.
context_val = []
question_val = []
answer_val = []
answer_type_val = []

for paragraph in data_val:
    qas = paragraph['qas']
    if not qas:
        # nothing to add for a paragraph without questions
        continue

    # The context is shared by every question of the paragraph, so it is
    # normalized once here instead of once per question.
    paragraph_context = normalize(paragraph['context'])

    for qa in qas:
        answers = qa['answers']
        context_val.append(paragraph_context)
        question_val.append(normalize(qa['question_text']))
        # only the first listed answer text / answer type is used
        answer_val.append(normalize(answers['answer_text'][0]))
        answer_type_val.append(normalize(answers['answer_type'][0]))

In [20]:
sample_total = len(context_val)
print('No of sample in Validation set: ', sample_total)

# Running sums of the per-sample scores
f1 = 0.0
em = 0.0

# Inference only, so gradient tracking is switched off
with torch.no_grad():
    samples = zip(context_val, question_val, answer_val, answer_type_val)
    for context_text, question_text, gold_answer, answer_type in samples:
        pred, ids = get_answer(question_text, context_text)

        # 'multiple spans' answers are scored with the span-aligned F1
        score_fn = get_updated_f1 if answer_type == 'multiple spans' else compute_f1
        f1 += score_fn(pred, gold_answer)
        em += compute_exact_match(pred, gold_answer)

print('Validation set EM score: ', em/sample_total)
print('Validation set F1 score: ', f1/sample_total)

No of sample in Validation set:  1484
Validation set EM score:  0.605121293800539
Validation set F1 score:  0.7716536237186012


### BanglaRQA Test

In [21]:
# loading the test dataset for calculating the EM and F1 scores on BanglaRQA's test set

import json
import os

# Read the test file. The context manager closes the file handle, and the
# explicit UTF-8 encoding keeps reading the Bangla text independent of the
# runtime's default locale encoding.
with open(os.path.join(base, test_file_name), encoding='utf-8') as f:
    data_test = json.load(f)

# Keep only the list stored under the top-level 'data' key
data_test = data_test['data']

In [22]:
# Flatten the paragraph -> questions structure into parallel lists that hold
# one entry per question.
context_test = []
question_test = []
answer_test = []
question_type_test = []
answer_type_test = []

for paragraph in data_test:
    qas = paragraph['qas']
    if not qas:
        # nothing to add for a paragraph without questions
        continue

    # The context is shared by every question of the paragraph, so it is
    # normalized once here instead of once per question.
    paragraph_context = normalize(paragraph['context'])

    for qa in qas:
        answers = qa['answers']
        context_test.append(paragraph_context)
        question_test.append(normalize(qa['question_text']))
        question_type_test.append(normalize(qa['question_type']))
        # only the first listed answer text / answer type is used
        answer_test.append(normalize(answers['answer_text'][0]))
        answer_type_test.append(normalize(answers['answer_type'][0]))

In [23]:
# Running totals for the test-set breakdown. Every group (answerability and
# question type) gets three float entries: summed F1 ('<group>_f1'), summed
# exact match ('<group>_em') and the number of samples ('<group>_cnt').
anslysis = {
    f'{group}_{field}': 0.0
    for group in ('answerable', 'unanswerable', 'factoid', 'causal', 'confirmation', 'list')
    for field in ('f1', 'em', 'cnt')
}

In [24]:
sample_total = len(context_test)
print('No of samples in Test Set: ',sample_total)

f1_total = 0.0
em_total = 0.0

# Start every run from zero: without this, executing the cell a second time
# would add another full pass on top of the totals left by the previous run.
for key in anslysis:
    anslysis[key] = 0.0

# Question types that have their own entries in `anslysis`
question_groups = ('factoid', 'causal', 'confirmation', 'list')

# Inference only, so gradient tracking is switched off
with torch.no_grad():
    for i in range(sample_total):
        pred, ids = get_answer(question_test[i], context_test[i])

        # 'multiple spans' answers are scored with the span-aligned F1
        if answer_type_test[i] == 'multiple spans':
            f1 = get_updated_f1(pred, answer_test[i])
        else:
            f1 = compute_f1(pred, answer_test[i])
        em = compute_exact_match(pred, answer_test[i])

        f1_total = f1_total + f1
        em_total = em_total + em

        # Each sample counts as answerable or unanswerable (empty gold answer),
        # and additionally towards its question type when that type is tracked.
        groups = ['unanswerable' if answer_test[i] == '' else 'answerable']
        if question_type_test[i] in question_groups:
            groups.append(question_type_test[i])

        for group in groups:
            anslysis[group + '_cnt'] += 1
            anslysis[group + '_f1'] += f1
            anslysis[group + '_em'] += em

# Averages for the report. A group (or a test set) without samples yields NaN
# instead of aborting the whole report with ZeroDivisionError.
nan = float('nan')
overall_em = em_total / sample_total if sample_total else nan
overall_f1 = f1_total / sample_total if sample_total else nan

averages = {}
for group in ('answerable', 'unanswerable') + question_groups:
    count = anslysis[group + '_cnt']
    for metric in ('em', 'f1'):
        total = anslysis[group + '_' + metric]
        averages[group + '_' + metric] = total / count if count else nan

# NOTE: the labels below are kept exactly as they were so that the printed
# report stays comparable with previously recorded outputs.
print('----------------------------------------------------------------------------------------')
print()
print('Test set EM score: ', overall_em)
print('Test set F1 score: ', overall_f1)
print()
print('----------------------------------------------------------------------------------------')

print('Accuracy based on answerable/unanserable questions: ------------------------------------')
print()
print('Answerabale_em: ', averages['answerable_em'])
print('Answerabale_f1: ', averages['answerable_f1'])
print()
print('Unnswerabale_em: ', averages['unanswerable_em'])
print('Unnswerabale_f1: ', averages['unanswerable_f1'])
print()
print("----------------------------------------------------------------------------------------")

print('Accuracy based on different question type:----------------------------------------------')
print()
print('Factoid_em: ', averages['factoid_em'])
print('Facoid_f1: ', averages['factoid_f1'])
print()
print('causal_em: ', averages['causal_em'])
print('causal_f1: ', averages['causal_f1'])
print()
print('confirmation_em: ', averages['confirmation_em'])
print('confirmation_f1: ', averages['confirmation_f1'])
print()
print('list_em: ', averages['list_em'])
print('list_f1: ', averages['list_f1'])
print()
print('----------------------------------------------------------------------------------------')

No of samples in Test Set:  1493
----------------------------------------------------------------------------------------

Test set EM score:  0.624246483590087
Test set F1 score:  0.7811025268764222

----------------------------------------------------------------------------------------
Accuracy based on answerable/unanserable questions: ------------------------------------

Answerabale_em:  0.548932384341637
Answerabale_f1:  0.7572829827637882

Unnswerabale_em:  0.8536585365853658
Unnswerabale_f1:  0.8536585365853658

----------------------------------------------------------------------------------------
Accuracy based on different question type:----------------------------------------------

Factoid_em:  0.6563706563706564
Facoid_f1:  0.8033830977421288

causal_em:  0.4968152866242038
causal_f1:  0.6940842042959864

confirmation_em:  0.8676470588235294
confirmation_f1:  0.8676470588235294

list_em:  0.34146341463414637
list_f1:  0.6518900200681835

--------------------------------

In [32]:
# Running totals for the breakdown by answer type. Every group gets three float
# entries: summed F1 ('<group>_f1'), summed exact match ('<group>_em') and the
# number of samples ('<group>_cnt').
answer_anslysis = {
    f'{group}_{field}': 0.0
    for group in ('single', 'multi', 'yes')
    for field in ('f1', 'em', 'cnt')
}

In [33]:
sample_total = len(context_test)
print('No of samples in Test set: ', sample_total)

# Start every run from zero: without this, executing the cell a second time
# would add another full pass on top of the totals left by the previous run.
for key in answer_anslysis:
    answer_anslysis[key] = 0.0

# Maps the answer type stored in the dataset to the key prefix in `answer_anslysis`
answer_type_groups = {
    'single span': 'single',
    'multiple spans': 'multi',
    'yes/no': 'yes',
}

# Inference only, so gradient tracking is switched off
with torch.no_grad():
    for i in range(sample_total):
        # Questions with an empty gold answer are left out of this breakdown
        if answer_test[i] == '':
            continue

        pred, ids = get_answer(question_test[i], context_test[i])

        # 'multiple spans' answers are scored with the span-aligned F1
        if answer_type_test[i] == 'multiple spans':
            f1 = get_updated_f1(pred, answer_test[i])
        else:
            f1 = compute_f1(pred, answer_test[i])
        em = compute_exact_match(pred, answer_test[i])

        group = answer_type_groups.get(answer_type_test[i])
        if group is not None:
            answer_anslysis[group + '_cnt'] += 1
            answer_anslysis[group + '_f1'] += f1
            answer_anslysis[group + '_em'] += em

# Averages for the report. An answer type without samples yields NaN instead of
# aborting the whole report with ZeroDivisionError.
answer_averages = {}
for group in answer_type_groups.values():
    count = answer_anslysis[group + '_cnt']
    for metric in ('em', 'f1'):
        total = answer_anslysis[group + '_' + metric]
        answer_averages[group + '_' + metric] = total / count if count else float('nan')

print('Accuracy based on different answer type:-----------------------------------------------')
print()
print('single_span_em: ', answer_averages['single_em'])
print('single_span_f1: ', answer_averages['single_f1'])
print()
print('multiple_spans_em: ', answer_averages['multi_em'])
print('multiple_spans_f1: ', answer_averages['multi_f1'])
print()
print('yes_no_em: ', answer_averages['yes_em'])
print('yes_no_f1: ', answer_averages['yes_f1'])
print()
print('---------------------------------------------------------------------------------------')

No of samples in Test set:  1493
Accuracy based on different answer type:-----------------------------------------------

single_span_em:  0.5672514619883041
single_span_f1:  0.7781555763555685

multiple_spans_em:  0.2077922077922078
multiple_spans_f1:  0.5575523041719999

yes_no_em:  0.8695652173913043
yes_no_f1:  0.8695652173913043

---------------------------------------------------------------------------------------


### bn_squad Test

In [27]:
pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 15.5 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 70.3 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 70.2 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 73.2 MB/s 
Installing collected packages: urllib3, xxhash, responses, multiprocess, datasets
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24

In [28]:
from datasets import load_dataset

In [29]:
# Load the "test" split of the "csebuetnlp/squad_bn" dataset with `datasets.load_dataset`
test_dataset_squad = load_dataset("csebuetnlp/squad_bn", split="test")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading and preparing dataset squad_bn/squad_bn to /root/.cache/huggingface/datasets/csebuetnlp___squad_bn/squad_bn/0.0.1/17a6d6abc976f299afda17ca9b5ce08a022ecafabe24b3362e16a3093c32df4b...


Downloading data:   0%|          | 0.00/8.43M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset squad_bn downloaded and prepared to /root/.cache/huggingface/datasets/csebuetnlp___squad_bn/squad_bn/0.0.1/17a6d6abc976f299afda17ca9b5ce08a022ecafabe24b3362e16a3093c32df4b. Subsequent calls will reuse this data.


In [30]:
# Parallel lists with one entry per example of the squad_bn test split
test_context_squad = []
test_question_squad = []
test_answer_squad = []

for i in range(len(test_dataset_squad)):
    # Look the example up once instead of re-indexing the dataset for every field
    example = test_dataset_squad[i]
    answer_texts = example['answers']['text']

    test_context_squad.append(normalize(example['context']))
    test_question_squad.append(normalize(example['question']))
    # The first listed answer is the reference; an example without any answer
    # text gets the (normalized) empty string.
    test_answer_squad.append(normalize(answer_texts[0] if len(answer_texts) != 0 else ''))

In [34]:
# Running sums of the per-sample scores
f1_total = 0.0
em_total = 0.0

sample_total = len(test_context_squad)
print(sample_total)

# Inference only, so gradient tracking is switched off
with torch.no_grad():
    samples = zip(test_context_squad, test_question_squad, test_answer_squad)
    for context_text, question_text, gold_answer in samples:
        pred, ids = get_answer(question_text, context_text)
        f1_total += compute_f1(pred, gold_answer)
        em_total += compute_exact_match(pred, gold_answer)


print('Accuracy of our model on bn_quad dataset: ---------------------------------------------')
print()
print('EM score: ', em_total/sample_total)
print('F1 score: ', f1_total/sample_total)
print()
print('---------------------------------------------------------------------------------------')

2504
Accuracy of our model on bn_quad dataset: ---------------------------------------------

EM score:  0.694888178913738
F1 score:  0.7556817967683611

---------------------------------------------------------------------------------------
