In [None]:
import os
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from transformers import *
from datasets import get_dataset_config_names, load_dataset
import tqdm.notebook as tqdm
# from torch.utils.data import DataLoader

# Load Test

In [None]:
# https://www.kaggle.com/raghavendrakotala/fine-tunned-on-roberta-base-as-ner-problem-0-533
# Read every file in the test directory into a DataFrame with one row per
# file: `id` is the file name with '.txt' removed, `text` is the file content.
TEST_DIR = '../input/feedback-prize-2021/test'
test_names, test_texts = [], []
for f in os.listdir(TEST_DIR):
    test_names.append(f.replace('.txt', ''))
    # A context manager closes each handle right after reading instead of
    # leaving it open until the garbage collector gets to it.
    with open(os.path.join(TEST_DIR, f), 'r') as fh:
        test_texts.append(fh.read())
test = pd.DataFrame({'id': test_names, 'text': test_texts})
test.head()

In [None]:
# The essay body plays the role of the QA "context", so name the column accordingly.
test = test.rename(columns={"text": "context"})

In [None]:
# The seven discourse classes; each one is later attached to every essay as a "question".
labels = [
    'Lead',
    'Position',
    'Evidence',
    'Claim',
    'Concluding Statement',
    'Counterclaim',
    'Rebuttal',
]

In [None]:
# Give every essay row the full list of labels (exploded into one row per label below).
test['question'] = [labels] * len(test)

In [None]:
# One row per (essay, label) pair.
test = test.explode(column='question')

In [None]:
# Distinct essay ids present in the test frame.
IDS = test.id.unique()
print('There are', len(IDS), 'test texts.')

In [None]:
# Fine-tuned question-answering checkpoint. The stock alternative would be
# "distilbert-base-cased-distilled-squad".
model_checkpoint = "../input/q-a-pytorch/model.h5"
# Path of the config file stored inside the checkpoint location.
config_model = model_checkpoint + "/config.json"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Integer id assigned to each discourse label.
target_map = {
    name: index
    for index, name in enumerate(
        ['Lead', 'Position', 'Evidence', 'Claim', 'Concluding Statement',
         'Counterclaim', 'Rebuttal']
    )
}

In [None]:
# os.mkdir('model')

# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
# tokenizer.save_pretrained('model')

# #     config_model = AutoConfig.from_pretrained(model_checkpoint) 
# #     config_model.num_labels = 15
# #     config_model.save_pretrained('model')

# backbone = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
# backbone.save_pretrained('model')

In [None]:
# Token overlap between consecutive windows when a long context is split.
stride = 64
# Window size used for tokenization and passed to the pipeline as max_seq_len.
# NOTE(review): some tokenizers report a very large sentinel here when the
# checkpoint does not configure a limit — confirm for this checkpoint.
max_length = tokenizer.model_max_length

In [None]:
def preprocess_validation_examples(examples):
    """Tokenize a batch of (question, context) pairs into fixed-length chunks.

    Contexts that do not fit in ``max_length`` tokens are split into several
    overlapping chunks (``stride`` controls the overlap); only the context,
    never the question, is truncated.

    Args:
        examples: batch mapping with ``"question"``, ``"context"`` and ``"id"``
            entries, each a sequence with one item per example.

    Returns:
        The tokenizer output with the ``overflow_to_sample_mapping`` and
        ``offset_mapping`` entries removed and an ``"example_id"`` entry added
        that gives, for every chunk, the id of the example it was cut from.
    """
    # Strip leading and trailing whitespace from the questions.
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # For each chunk, the index of the example it originated from.
    sample_map = inputs.pop("overflow_to_sample_mapping")
    # Offsets are not part of the returned encoding; drop them.
    inputs.pop("offset_mapping")

    inputs["example_id"] = [examples["id"][sample_idx] for sample_idx in sample_map]

    return inputs

In [None]:
from datasets import Dataset
# Convert the (essay, label) DataFrame into a Hugging Face Dataset for iteration.
dataset = Dataset.from_pandas(test)

In [None]:
# Display which columns the Dataset ended up with.
dataset.column_names

# Build Model
We load a fine-tuned extractive question-answering model (`AutoModelForQuestionAnswering`) from the checkpoint above and wrap it in a `QuestionAnsweringPipeline`. Each of the 7 discourse labels is used as the "question" and the essay text as the "context"; the answer spans returned by the model are taken as the predicted segments for that label.

In [None]:
import torch
from transformers import AutoModelForQuestionAnswering

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the question-answering model from the same checkpoint the tokenizer came from.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

In [None]:
# test_set

In [None]:
# Run inference on the first GPU when CUDA is available (device index 0) and
# on the CPU otherwise (-1); the model is never moved to a device elsewhere,
# so without this argument inference runs on the CPU.
pipe = QuestionAnsweringPipeline(
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
def preprocess(question, context, text_id):
    """Encode one question/context pair as PyTorch tensors, chunking long contexts.

    ``text_id`` is accepted for interface compatibility but is not used.

    Returns:
        An ``(encoding, offsets)`` pair: the tokenizer output with its
        ``overflow_to_sample_mapping`` and ``offset_mapping`` entries removed,
        and the removed offset mapping.
    """
    tokenizer_options = dict(
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        return_tensors='pt',
    )
    encoding = tokenizer(question, context, **tokenizer_options)

    # Neither mapping stays inside the returned encoding.
    encoding.pop("overflow_to_sample_mapping")
    offsets = encoding.pop("offset_mapping")

    return encoding, offsets

In [None]:
def get_predictions(outputs, inputs, offsets, context):
    """Decode the highest-scoring answer span from each chunk of a QA model output.

    Args:
        outputs: model output exposing ``start_logits`` and ``end_logits``
            (assumed to be ``(num_chunks, seq_len)`` tensors — TODO confirm).
            The tensors are not modified.
        inputs: tokenizer encoding providing ``sequence_ids()`` and an
            ``"attention_mask"`` tensor.
        offsets: for each chunk, a sequence of ``(start_char, end_char)`` pairs,
            one per token.
        context: the text that the character offsets index into.

    Returns:
        One dict per chunk with keys ``"answer"`` (the slice of ``context``),
        ``"start"`` and ``"end"`` (character positions as plain ints) and
        ``"score"`` (start probability times end probability of the span).
    """
    sequence_ids = inputs.sequence_ids()
    # Mask everything apart from the tokens of the context
    mask = [i != 1 for i in sequence_ids]
    # Unmask the [CLS] token
    mask[0] = False
    # Mask all the [PAD] tokens
    mask = torch.logical_or(torch.tensor(mask)[None], (inputs["attention_mask"] == 0))

    # masked_fill builds new tensors, so the caller's logits stay untouched.
    start_logits = outputs.start_logits.masked_fill(mask, -10000)
    end_logits = outputs.end_logits.masked_fill(mask, -10000)
    start_probabilities = torch.nn.functional.softmax(start_logits, dim=-1)
    end_probabilities = torch.nn.functional.softmax(end_logits, dim=-1)

    results = []
    for start_probs, end_probs, offset in zip(start_probabilities, end_probabilities, offsets):
        # scores[s, e] is the probability of the span starting at s and ending at e.
        scores = start_probs[:, None] * end_probs[None, :]
        # The upper triangle keeps only spans whose start is not after their end.
        idx = torch.triu(scores).argmax().item()
        start_idx, end_idx = divmod(idx, scores.shape[1])
        score = scores[start_idx, end_idx].item()

        # int() turns tensor offsets into plain Python ints (a no-op for ints).
        start_char = int(offset[start_idx][0])
        end_char = int(offset[end_idx][1])
        results.append({
            "answer": context[start_char:end_char],
            "start": start_char,
            "end": end_char,
            "score": score,
        })
    return results

In [None]:
def get_predictionstring(result, text=None):
    """Convert a character-level answer span into space-separated word indices.

    Args:
        result: dict with ``'answer'`` (the extracted text) and ``'start'``
            (its character offset in the source text).
        text: the source text the span was taken from. When omitted, the
            module-level ``context`` variable is used, which is what this
            function has always read.

    Returns:
        The indices of the whitespace-separated words covered by the answer,
        joined by single spaces with no trailing space; ``""`` when the answer
        is empty or contains no words.
    """
    pred = result['answer']
    if len(pred) == 0:
        return ""
    if text is None:
        text = context
    start = result['start']
    start_index = len(text[:start].split())
    # When the span begins in the middle of a word, the last token of the
    # prefix is the first part of that same word, so it must not be counted
    # as a separate preceding word.
    starts_mid_word = (
        0 < start <= len(text)
        and not text[start - 1].isspace()
        and not pred[0].isspace()
    )
    if starts_mid_word:
        start_index -= 1
    end_index = start_index + len(pred.split())
    return " ".join(str(i) for i in range(start_index, end_index))

In [None]:
# Start every (essay, label) row with an empty prediction string.
test = test.assign(predictionstring="")

In [None]:
# Column-oriented accumulator for the submission rows.
predictions_dict = {
    'id': [],
    'class': [],
    'predictionstring': [],
}

In [None]:
# For every (essay, label) row, ask the QA pipeline for up to two answer spans
# and record each non-empty one as a submission row.
for item in dataset:
    question = item['question']
    text_id = item['id']
    # NOTE: get_predictionstring reads this module-level `context` variable.
    context = item['context']
    with torch.no_grad():
        results = pipe(question=question, context=context, top_k=2,
                       max_seq_len=max_length, max_answer_len=max_length)
    # The pipeline returns a bare dict rather than a list when it produces a
    # single answer; wrap it so the loop below never iterates over dict keys.
    if isinstance(results, dict):
        results = [results]
    for result in results:
        predictionstring = get_predictionstring(result)
        if len(predictionstring) == 0:
            continue
        predictions_dict['id'].append(text_id)
        predictions_dict['class'].append(question)
        predictions_dict['predictionstring'].append(predictionstring)
            

In [None]:
# Assemble the collected predictions into the submission table.
sub = pd.DataFrame(predictions_dict)

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv(path_or_buf="submission.csv", index=False)