In [3]:
import json
import glob
import collections
from itertools import chain
from typing import Any, Dict, Iterator, List, Tuple, Union
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

In [4]:
def read_json_file(file_path):
    with open(file_path) as f:
        data = json.load(f)
    return data

In [5]:
def data_preprocessing(dataset):
    contexts, questions, answers = [], [], []
    for group in dataset:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers

In [6]:
model = AutoModelForQuestionAnswering.from_pretrained("saiful9379/Bangla_Roberta_Question_and_Answer")
tokenizer = AutoTokenizer.from_pretrained("saiful9379/Bangla_Roberta_Question_and_Answer", use_fast=True)

In [9]:
bn_val_data_path = "./bn_dataset/bn/val"
bn_val_files = glob.glob(bn_val_data_path+"/*.json")
bn_val_list = [i["data"] for i in bn_val_files]
bn_val_data_list= list(map(read_json_file, bn_val_files))
bn_val_data = list(chain(* bn_val_data_list))

TypeError: string indices must be integers

In [None]:
test_contexts, test_questions, test_answers = data_preprocessing(bn_val_data)

In [None]:
squad_test = {'answers': test_answers,'context': test_contexts, 'question': test_questions}

In [None]:

for answer, context, question in zip(squad_test["answers"], squad_test["context"], squad_test["question"]):
    QA = pipeline('question-answering', model=model, tokenizer=tokenizer)
    QA_input = {'question': question,'context':context}
    prediction = QA(QA_input)
    gt = answer["text"]
    pt = prediction["answer"]
    
    print("Context : ", context)
    print("Question : ", question)
    print("GT Answer :", gt)
    print("Prediction : ", pt)
    print("="*40)