In [None]:
import os

In [None]:
from datasets import get_dataset_config_names, load_dataset

In [None]:
# Column names of a flattened SQuAD-style question-answering table.
qa_cols = [
    "title",
    "question",
    "answers.text",
    "answers.answer_start",
    "context",
]

In [None]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
# Set before the transformers import below; presumably meant to silence
# TensorFlow's C++ log output in case transformers loads TensorFlow -- TODO confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): the wildcard import hides which transformers names the notebook
# relies on (AutoTokenizer is used further down); prefer explicit imports.
from transformers import *

# Load Test

In [None]:
# Adapted from
# https://www.kaggle.com/raghavendrakotala/fine-tunned-on-roberta-base-as-ner-problem-0-533
#
# Read every file in the test directory into a DataFrame with one row per essay:
# `id` is the file name without the ".txt" suffix, `text` is the file content.
test_dir = '../input/feedback-prize-2021/test'
test_names, test = [], []
for f in os.listdir(test_dir):
    test_names.append(f.replace('.txt', ''))
    # The context manager closes each handle as soon as the essay is read,
    # instead of leaving it open until garbage collection.
    with open(os.path.join(test_dir, f), 'r') as essay_file:
        test.append(essay_file.read())
test = pd.DataFrame({'id': test_names, 'text': test})

In [None]:
# Rename the essay text column to "context", the field name that
# preprocess_examples reads when building question/context pairs.
test.columns = ["id", "context"]

In [None]:
# Discourse element types; each one is later used as the "question" text
# paired with every essay.
labels = [
    'Lead',
    'Position',
    'Evidence',
    'Claim',
    'Concluding Statement',
    'Counterclaim',
    'Rebuttal',
]

In [None]:
# Pair every essay with every discourse label: attach the full label list to
# each row, then explode it so each (essay, label) combination is its own row.
test['question'] = [labels] * len(test)
test = test.explode('question')

# Local fine-tuned checkpoint; the path is used as a directory (its config.json
# lives inside it). The hub checkpoint "distilbert-base-cased-distilled-squad"
# was the alternative tried here.
model_checkpoint = "../input/q-a-pytorch/model.h5"
config_model = "../input/q-a-pytorch/model.h5/config.json"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Discourse label -> integer class id, numbered in listing order.
target_map = {
    name: class_id
    for class_id, name in enumerate((
        'Lead',
        'Position',
        'Evidence',
        'Claim',
        'Concluding Statement',
        'Counterclaim',
        'Rebuttal',
    ))
}


In [None]:
max_length = 512
stride = 128
def preprocess_examples(examples):
    """Tokenize a batch of (question, context) rows into model-ready features.

    Each question is stripped of surrounding whitespace and paired with its
    context. Contexts that do not fit in ``max_length`` tokens are split into
    several overlapping features (``stride`` tokens of overlap), so the output
    can contain more entries than the input batch.

    Args:
        examples: batch mapping with "question", "context" and "id" columns,
            as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output (including "offset_mapping", which is kept so
        predicted token spans can be mapped back to characters) with an extra
        "example_id" entry holding, for every feature, the ``id`` of the row
        it was cut from.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # One entry per produced feature: the index of the input row it came from.
    sample_map = inputs.pop("overflow_to_sample_mapping")
    source_ids = examples["id"]
    inputs["example_id"] = [source_ids[sample_idx] for sample_idx in sample_map]
    return inputs

In [None]:
from datasets import Dataset
# Wrap the exploded (essay, label) DataFrame in a Hugging Face Dataset so it can
# be tokenized in batches with Dataset.map.
dataset = Dataset.from_pandas(test)

In [None]:
# Show the dataset summary as the cell output.
dataset

In [None]:
# Tokenize every (question, context) row. The original columns are removed
# because preprocess_examples can emit several features per input row, so the
# row-aligned source columns would no longer line up.
# NOTE: despite the name, `train_dataset` is built from the test essays.
train_dataset = dataset.map(
    preprocess_examples,
    batched=True,
    remove_columns=dataset.column_names,
)

In [None]:
from datasets import load_metric
# Load the SQuAD metric.
# NOTE(review): no visible cell uses `metric`; `load_metric` may need network
# access and is deprecated in newer `datasets` releases -- confirm whether this
# cell is still required.
metric = load_metric("squad")

In [None]:
import torch
from transformers import AutoModelForQuestionAnswering

# Prefer the GPU when one is available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the fine-tuned question-answering model from the local checkpoint and
# switch it to evaluation mode for inference.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
model.eval()

In [None]:
from transformers import TrainingArguments

# Arguments for the Trainer. In the visible cells the Trainer is only used to
# call `predict`, so the optimisation settings below are not exercised there.
args = TrainingArguments(
    "distilbert-base-cased-distilled-squad",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    # Mixed precision requires a CUDA device; hard-coding True makes this cell
    # fail on a CPU-only kernel, so enable it only when a GPU is present.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

In [None]:
from transformers import Trainer

# Bundle the model, arguments and tokenizer into a Trainer, which is used
# further down to run prediction over the tokenized features.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

In [None]:
def get_predictions(start_logits, end_logits, offsets, context, question):
    """Turn start/end logits into a character-level answer span.

    The most probable start token and the most probable end token are picked
    independently, then mapped back to character positions in ``context``
    through the offset mapping.

    Args:
        start_logits: tensor of start-position logits for one feature.
        end_logits: tensor of end-position logits for the same feature.
        offsets: iterable of offset mappings (one per feature, each a sequence
            of ``(start_char, end_char)`` pairs). Only the first mapping is
            consumed because a single candidate span is produced.
        context: the text the character offsets refer to.
        question: the question (discourse label) the logits were computed for;
            echoed back in the result.

    Returns:
        A list with one dict per candidate containing "answer", "start", "end",
        "score" (a 0-dim tensor: start probability times end probability of the
        selected tokens) and "question". If the selected end offset does not
        lie after the start offset, "answer" is an empty string.
    """
    start_probabilities = torch.nn.functional.softmax(start_logits, dim=-1)
    end_probabilities = torch.nn.functional.softmax(end_logits, dim=-1)

    start_idx = start_probabilities.argmax().item()
    end_idx = end_probabilities.argmax().item()
    # Score the span that is actually returned: the probabilities at the two
    # argmax positions are exactly the maxima of each distribution.
    score = start_probabilities.max() * end_probabilities.max()
    candidates = [(start_idx, end_idx, score)]

    results = []
    for (start_token, end_token, span_score), offset in zip(candidates, offsets):
        start_char, _ = offset[start_token]
        _, end_char = offset[end_token]
        results.append({
            "answer": context[start_char:end_char],
            "start": start_char,
            "end": end_char,
            "score": span_score,
            "question": question,
        })
    return results

In [None]:
def get_predictionstring(result, text=None):
    """Convert a character-level answer span into space-separated word indices.

    Args:
        result: dict with at least "answer" (the predicted text) and "start"
            (the character offset of the answer in the essay).
        text: the essay the offsets refer to. When omitted, the module-level
            ``context`` variable is used, as the original implementation did;
            passing it explicitly avoids depending on notebook state.

    Returns:
        "" for an empty answer; otherwise the index of every whitespace-
        separated word covered by the answer, each followed by one space
        (e.g. "2 3 "). The first index is the number of words in the text
        that precedes the answer.
    """
    pred = result['answer']
    if len(pred) == 0:
        return ""
    source_text = context if text is None else text
    start_index = len(source_text[:result['start']].split())
    end_index = start_index + len(pred.split())
    return "".join(str(i) + " " for i in range(start_index, end_index))

In [None]:
# Run the model over every tokenized feature and collect the raw predictions.
outputs = trainer.predict(train_dataset)

In [None]:
# Inspect the raw predictions; presumably a (start_logits, end_logits) pair,
# which is how the commented-out loop at the end indexes it -- TODO confirm.
outputs.predictions

In [None]:
from torch.utils.data import DataLoader
# from transformers import default_data_collator
# Two loaders with batch size 1: one over the raw (question, context, id) rows
# and one over the tokenized features.
# NOTE(review): a row can yield several features, so the two loaders are not
# guaranteed to have the same length -- verify before zipping them together.
dataloader = DataLoader(dataset, batch_size = 1)
tokenized_loader = DataLoader(train_dataset, batch_size = 1)

In [None]:
# index = 0
# for batch, t_batch in zip(dataloader, tokenized_loader):
#     question = batch['question']
#     context = batch['context']
#     text_id = batch['id']
#     inputs, offset_mapping = t_batch["input_ids"], t_batch["offset_mapping"]
#     start_logits, end_logits = torch.Tensor(outputs.predictions[0][index]), torch.Tensor(outputs.predictions[1][index])
#     results = get_predictions(start_logits, end_logits, inputs, offset_mapping, context)