In [1]:
! pip install transformers 
! pip install datasets 
! pip install huggingface_hub

[0mCollecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.4/325.4 KB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, responses, datasets
Successfully installed datasets-2.1.0 responses-0.18.0 xxhash-3.0.0
[0m

In [2]:
import transformers
# Record the installed transformers version next to the results produced below.
print(transformers.__version__)

4.17.0


### Provide the checkpoint of the fine-tuned model

In [3]:
# All of our fine-tuned models are saved on the Hugging Face model hub.
model_checkpoint = "krinal214/augmented_Squad_Translated" # change this value to evaluate a different model
batch_size = 16  # used as the per-device evaluation batch size

In [4]:
from datasets import load_dataset, load_metric

In [5]:
# Load the "secondary_task" configuration of the tydiqa dataset.
datasets=load_dataset("tydiqa","secondary_task")
# Display the available splits and their sizes.
datasets

Downloading builder script:   0%|          | 0.00/3.49k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading and preparing dataset tydiqa/secondary_task (download: 1.82 GiB, generated: 55.27 MiB, post-processed: Unknown size, total: 1.87 GiB) to /root/.cache/huggingface/datasets/tydiqa/secondary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.73G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/161M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/58.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.62M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/49881 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5077 [00:00<?, ? examples/s]

Dataset tydiqa downloaded and prepared to /root/.cache/huggingface/datasets/tydiqa/secondary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 49881
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 5077
    })
})

In [6]:
import re
def filter_dataset(ds,lang):
    """Return the rows of `ds` whose 'id' matches at least one pattern in `lang`.

    Each entry of `lang` is used as a regular expression and searched for anywhere
    inside the row's 'id' string. An empty `lang` therefore keeps no rows.
    """
    def matches_any_pattern(example_id):
        return any(re.search(pattern, example_id) for pattern in lang)

    return ds.filter(lambda row: matches_any_pattern(row['id']))

In [7]:
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace.

    Steps, in order: lowercase, drop every character found in
    `string.punctuation`, replace the standalone words "a", "an" and "the"
    with a space, then collapse all whitespace runs to single spaces.

    Args:
        s: the answer text to normalize.

    Returns:
        The normalized string (possibly empty).
    """
    # Imported locally so the function does not rely on another notebook cell
    # having imported `string` before it is first called.
    import string

    # Built once per call instead of once per helper invocation.
    punctuation = set(string.punctuation)

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        return ''.join(ch for ch in text if ch not in punctuation)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

### Preparing the validation dataset

In [8]:
# Keep only the validation examples whose id matches 'bengali' or 'telugu'.
datasets["validation"]=filter_dataset(datasets["validation"],['bengali','telugu'])

  0%|          | 0/6 [00:00<?, ?ba/s]

In [9]:
from transformers import AutoTokenizer

# Load the tokenizer stored with the fine-tuned checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Fail early unless this is a fast tokenizer: the feature preparation later in the
# notebook requests offset mappings (return_offsets_mapping=True).
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [10]:
from transformers import AutoModelForQuestionAnswering

# Load the question-answering model from the same checkpoint as the tokenizer.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/841 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/676M [00:00<?, ?B/s]

In [11]:
def prepare_validation_features(examples):
    """Tokenize a batch of question/context pairs into padded features for inference.

    Only the context is truncated; overflowing context tokens are returned as
    additional features (the tokenizer receives the globals `max_length` and
    `doc_stride`). Every returned feature also carries:

    * "example_id": the id of the example the feature was produced from;
    * "offset_mapping": the token offsets, with every position that does not
      belong to the context replaced by None.

    Note: the leading whitespace of each question is stripped in `examples` itself.
    """
    examples["question"] = [question.lstrip() for question in examples["question"]]

    encoded = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # For every feature: the index (within this batch) of the example it came from.
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    feature_count = len(encoded["input_ids"])

    # The question is passed first, so context tokens carry sequence id 1.
    context_sequence_id = 1

    # Remember which example produced each feature.
    encoded["example_id"] = [
        examples["id"][feature_to_example[feature_idx]]
        for feature_idx in range(feature_count)
    ]

    # Blank out the offsets of everything that is not a context token.
    for feature_idx in range(feature_count):
        token_sequence_ids = encoded.sequence_ids(feature_idx)
        context_only_offsets = []
        for token_idx, span in enumerate(encoded["offset_mapping"][feature_idx]):
            if token_sequence_ids[token_idx] == context_sequence_id:
                context_only_offsets.append(span)
            else:
                context_only_offsets.append(None)
        encoded["offset_mapping"][feature_idx] = context_only_offsets

    return encoded

### Evaluating the model

In [12]:
max_answer_length = 50  # longest answer span (counted in tokens) accepted during post-processing
n_best_answers=20  # number of top start / end logits combined per feature during post-processing
max_length=384  # max_length handed to the tokenizer; features are padded to this length
doc_stride=160  # stride handed to the tokenizer when a long context is split into several features

In [13]:
from tqdm.auto import tqdm
import numpy as np

def _token_span(offset_mapping, token_index):
    """Return the character span stored for `token_index`, or None if it is unusable.

    A span is unusable when the index is out of range, or when the stored entry is
    None or has fewer than two elements.
    """
    if token_index >= len(offset_mapping):
        return None
    span = offset_mapping[token_index]
    if span is None or len(span) < 2:
        return None
    return span


def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 50):
    """Turn raw start/end logits into one predicted answer string per example.

    For every example, the `n_best_size` highest start logits and end logits of each
    of its features are combined; the candidate span with the highest summed score
    wins. No null-answer (SQuAD v2 style) scoring is performed.

    Args:
        examples: the untokenized examples. Must support column access
            (`examples["id"]`), `len()` and iteration over rows that have the
            fields "id" and "context".
        features: the tokenized features. Each row needs "example_id" and
            "offset_mapping"; positions outside the context hold None (or an
            empty entry) and are never selected.
        raw_predictions: pair `(all_start_logits, all_end_logits)`, both indexed
            by feature position.
        n_best_size: how many of the top start and end positions to combine.
        max_answer_length: maximum length of an answer, counted in tokens.

    Returns:
        collections.OrderedDict mapping example id to the predicted answer text.
        Examples without any valid candidate get the empty string.
    """
    # Imported locally so the function does not rely on another notebook cell
    # having imported `collections` before it is first called.
    import collections

    all_start_logits, all_end_logits = raw_predictions

    # Build a map from each example to the indices of its features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    for example_index, example in enumerate(tqdm(examples)):
        context = example["context"]
        valid_answers = []

        for feature_index in features_per_example[example_index]:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # Maps token positions back to character spans of the original context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Positions of the `n_best_size` largest logits, best first.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                # Skip starts that are out of bounds or not part of the context.
                start_span = _token_span(offset_mapping, start_index)
                if start_span is None:
                    continue
                for end_index in end_indexes:
                    # Skip answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    end_span = _token_span(offset_mapping, end_index)
                    if end_span is None:
                        continue
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_span[0]: end_span[1]]
                        }
                    )

        if valid_answers:
            # max() keeps the first of several equally scored candidates, like a
            # stable descending sort followed by taking element 0.
            best_answer = max(valid_answers, key=lambda answer: answer["score"])
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions[example["id"]] = best_answer["text"]

    return predictions


In [14]:
from transformers import TrainingArguments, Trainer

# Only the evaluation batch size and an output directory are configured here.
args = TrainingArguments(
    per_device_eval_batch_size=batch_size,
    output_dir = 'validation_results'
)

# This Trainer is the object `evaluate` calls `predict` on.
trainer = Trainer(
    model,
    args,
    tokenizer=tokenizer
)

In [15]:
import collections
def evaluate(ds,wrong_pred,language):
    """Score the global `trainer` on the examples of `ds` selected by `language`.

    Args:
        ds: dataset providing the columns used below ("id", "context",
            "question", "answers").
        wrong_pred: list that is extended in place with one record per example
            whose normalized prediction equals none of its normalized reference
            answers. Each record has the keys "id", "prediction" and "original:".
        language: list of patterns handed to `filter_dataset` to select examples
            by their id.

    Returns:
        The result of the "squad" metric's `compute` for the selected examples.
    """
    val_dataset=filter_dataset(ds,language)
    validation_features = val_dataset.map(
        prepare_validation_features,
        batched=True,
        remove_columns=val_dataset.column_names
    )
    raw_predictions = trainer.predict(validation_features)
    # Re-expose every feature column (keeping the current format type) so that
    # post-processing can read "example_id" and "offset_mapping".
    validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))

    final_predictions = postprocess_qa_predictions(val_dataset, validation_features, raw_predictions.predictions,n_best_answers,max_answer_length)
    metric = load_metric("squad") #reusing the squad evaluation script on tydiqa dataset
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in val_dataset]

    for reference in references:
        example_id = reference["id"]
        # Look the prediction up by id rather than by list position.
        predicted_text = final_predictions[example_id]
        gold_texts = reference["answers"]["text"]

        normalized_prediction = normalize_answer(predicted_text)
        if any(normalized_prediction == normalize_answer(gold) for gold in gold_texts):
            continue

        wrong_pred.append({
            "id": example_id,
            "prediction": predicted_text,
            # The key keeps its trailing colon so previously produced records and
            # any code reading them stay compatible. The last reference answer is
            # reported; an example without reference answers yields "".
            "original:": gold_texts[-1] if gold_texts else "",
        })
    return metric.compute(predictions=formatted_predictions, references=references)

### Results on the validation dataset (F1 and EM scores)

In [16]:
import string
# Languages that are scored separately; each entry is the pattern list given to `evaluate`.
eval_languages = [['bengali'], ['telugu']]
# One list of wrong predictions per language, plus 'all' for a combined run.
wrong_prediction = {bucket: [] for bucket in ('bengali', 'telugu', 'all')}
for lang in eval_languages:
    language_name = lang[0]
    output = evaluate(datasets["validation"], wrong_prediction[language_name], lang)
    print(language_name, ':', output)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: offset_mapping, example_id. If offset_mapping, example_id are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 144
  Batch size = 16


Post-processing 113 example predictions split into 144 features.


  0%|          | 0/113 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

bengali : {'exact_match': 71.68141592920354, 'f1': 79.757307589166}


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: offset_mapping, example_id. If offset_mapping, example_id are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 786
  Batch size = 16


Post-processing 669 example predictions split into 786 features.


  0%|          | 0/669 [00:00<?, ?it/s]

telugu : {'exact_match': 71.30044843049328, 'f1': 84.07107605604045}


In [17]:
# Report how many predictions were collected as wrong for each language.
for language_name in ('bengali', 'telugu'):
    print(f'wrong predictions({language_name})', len(wrong_prediction[language_name]))

wrong predictions(bengali) 32
wrong predictions(telugu) 192


In [18]:
# Score both languages together; mismatches are appended to wrong_prediction['all'].
output = evaluate(datasets["validation"],wrong_prediction['all'],['telugu','bengali'])
print('overall',output)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: offset_mapping, example_id. If offset_mapping, example_id are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 930
  Batch size = 16


Post-processing 782 example predictions split into 930 features.


  0%|          | 0/782 [00:00<?, ?it/s]

overall {'exact_match': 71.35549872122762, 'f1': 83.4477309962491}


### Visualizing the wrong predictions

In [19]:
import random
# Print up to 40 randomly chosen wrong Bengali predictions.
# The index range covers the whole list so the last record can be picked too,
# and the sample size is never negative, so an empty list simply prints nothing.
picks = random.sample(range(len(wrong_prediction['bengali'])), min(40, len(wrong_prediction['bengali'])))
for i in picks:
    print(wrong_prediction['bengali'][i])

{'id': 'bengali-4841554072514664155-0', 'prediction': 'ইন্দিরা গান্ধী', 'original:': 'শেখ মুজিবুর রহমান'}
{'id': 'bengali-4980664758092265474-1', 'prediction': 'ব্রহ্মা', 'original:': 'সাং চিয়েন'}
{'id': 'bengali--6695334746728483406-2', 'prediction': 'ঝাঁসির রানি ঝাঁসির রানির', 'original:': 'ঝাঁসির রানি'}
{'id': 'bengali-4713184652126684859-6', 'prediction': 'মিত্র কৃষ্ণ', 'original:': 'কৃষ্ণ'}
{'id': 'bengali-6608660370621955794-0', 'prediction': 'ইন্দো-আর্য', 'original:': 'খাড়ি বোলি উপভাষা'}
{'id': 'bengali--8422112448971508966-2', 'prediction': '১০৫টি', 'original:': '১০৫'}
{'id': 'bengali--8404986622637348674-3', 'prediction': 'পাকিস্তান বনাম আইসিসির নতুন সদস্য শ্রীলঙ্কার', 'original:': 'ভারত'}
{'id': 'bengali--9113522782624640859-1', 'prediction': 'চট্টগ্রাম বিশ্ববিদ্যালয়', 'original:': 'চট্টগ্রাম'}
{'id': 'bengali--7381837526075378596-1', 'prediction': 'চতুর্থ থেকে দ্বাদশ শতাব্দীর মধ্যে', 'original:': 'চতুর্থ থেকে দ্বাদশ শতাব্দী'}
{'id': 'bengali-4667499823130888007-3', 'predi

In [20]:
# Print up to 40 randomly chosen wrong Telugu predictions.
# The index range covers the whole list so the last record can be picked too,
# and the sample size is never negative, so an empty list simply prints nothing.
picks = random.sample(range(len(wrong_prediction['telugu'])), min(40, len(wrong_prediction['telugu'])))
for i in picks:
    print(wrong_prediction['telugu'][i])

{'id': 'telugu--5347969429403013814-1', 'prediction': '160 చదరపు కిలోమీటర్ల', 'original:': '2 చదరపు మైళ్ళు)'}
{'id': 'telugu--6194068905241325426-0', 'prediction': '361 హెక్టార్లలో', 'original:': '361 హెక్టార్ల'}
{'id': 'telugu-997773189427554722-0', 'prediction': '282 హెక్టార్లలో', 'original:': '282 హెక్టార్ల'}
{'id': 'telugu-4831434226867109360-7', 'prediction': '53 సంవత్సరాల వయసులో 1915 నవంబర్ 30', 'original:': '53'}
{'id': 'telugu-7280221406834583430-1', 'prediction': 'అలి గడ్', 'original:': 'ఢిల్లి'}
{'id': 'telugu-2355058992665135572-0', 'prediction': '198 హెక్టార్లలో', 'original:': '198 హెక్టార్ల'}
{'id': 'telugu-7783818474710184169-13', 'prediction': 'అనంత్, ఆకాష్,\xa0ఒక\xa0కూతురు\xa0ఇషా', 'original:': 'ఇద్దరు\xa0కొడుకులు\xa0అనంత్, ఆకాష్,\xa0ఒక\xa0కూతురు\xa0ఇషా'}
{'id': 'telugu--3316938560622730206-1', 'prediction': 'రాజయ్యశాస్త్రి మరియు సుచేత', 'original:': 'రాజయ్యశాస్త్రి'}
{'id': 'telugu-8243050460142458655-0', 'prediction': '1961 హెక్టార్లలో', 'original:': '1961 హెక్టార్ల'}