In [None]:
import os

Mostly inspired by Chris Deotte — please upvote and check out his notebook: https://www.kaggle.com/code/cdeotte/tensorflow-longformer-ner-cv-0-633

In [None]:
from datasets import get_dataset_config_names, load_dataset

In [None]:
# Column names of a flattened SQuAD-style question-answering table.
qa_cols = [
    "title",
    "question",
    "answers.text",
    "answers.answer_start",
    "context",
]

# How To Submit TensorFlow Without Internet
Many people ask me: how do I submit TensorFlow models without internet? With HuggingFace Transformers, it's easy. Just download the following 3 things: (1) the model weights, (2) the tokenizer files, and (3) the config file, and upload them to a Kaggle dataset. The code below shows how to get the files from HuggingFace for AllenAI's model `longformer-base`, but the same code can download any transformer, for example `roberta-base`.

In [None]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
# Set in the environment before `transformers` is imported on the next line.
# NOTE(review): presumably meant to silence TensorFlow's C++ logging — confirm TensorFlow is actually used.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from transformers import *

# Load Train

In [None]:
# Read the question-answering training table and show its size and first rows.
train_csv_path = '../input/qatrain/qatrain_df.csv'
train = pd.read_csv(train_csv_path)
print(train.shape)
train.head()

In [None]:
# Rename all six columns positionally to the names used by the cells below.
train.columns = [
    "id",
    "question",
    "answer",
    "answer_start",
    "answer_end",
    "context",
]

In [None]:
# The seven label names, in class-id order.
labels = [
    'Lead',
    'Position',
    'Evidence',
    'Claim',
    'Concluding Statement',
    'Counterclaim',
    'Rebuttal',
]

In [None]:
# Distinct ids present in the training table, in order of first appearance.
IDS = train["id"].unique()
print('There are', len(IDS), 'train texts.')

In [None]:
# Hugging Face checkpoint name from which the tokenizer is loaded.
model_checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Label name -> integer class id.
target_map = {
    'Lead': 0,
    'Position': 1,
    'Evidence': 2,
    'Claim': 3,
    'Concluding Statement': 4,
    'Counterclaim': 5,
    'Rebuttal': 6,
}

# Tokenize Train
The following code converts Kaggle's train dataset into a NER token array that we can use to train a NER transformer. I have made it very clear which targets belong to which class. This allows us to very easily convert this code to `Question Answer formulation` if we want. Just change the 14 NER arrays to be 14 arrays of `start position` and `end position` for each of the 7 classes. (You will need to think creatively what to do if a single text has multiple of one class).

In [None]:
# Take the first id and build the path of a text file named after it.
# NOTE(review): the ids come from the QA training CSV while the path points at the
# feedback-prize-2021 dataset — confirm that such a file actually exists.
n = IDS[0]
name = f'../input/feedback-prize-2021/train/{n}.txt'
name

In [None]:
# Read the whole file into memory; the context manager closes the handle
# deterministically instead of leaving it to the garbage collector.
with open(name, 'r') as text_file:
    txt = text_file.read()

In [None]:
# Preview the first five whitespace-separated tokens of the loaded text.
txt.split()[:5]

In [None]:
# Show the first three questions of the training table.
train.question[0:3]

In [None]:
# Tokenize the first three (question, context) pairs. Only the context may be
# truncated; whatever does not fit is emitted as additional overlapping windows.
question = train["question"].iloc[:3].tolist()
context = train["context"].iloc[:3].tolist()
windowing_options = dict(
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)
inputs = tokenizer(question, context, **windowing_options)

In [None]:
# List the fields returned by the tokenizer call.
inputs.keys()

In [None]:
# Report how many tokenized features were produced and, for each feature,
# the index of the example it was cut from.
print(f"The 3 examples gave {len(inputs['input_ids'])} features.")
print(f"Here is where each comes from: {inputs['overflow_to_sample_mapping']}.")

In [None]:
# Label every tokenized window of the three demo samples with the token span
# of its answer, or (0, 0) when the window does not contain the whole answer.
answers = train.answer[0:3].to_list()
# Character offsets are taken from the same three rows that were tokenized so
# they can be looked up per *sample*; indexing them by window number would read
# the wrong row as soon as one sample overflows into several windows.
# NOTE(review): assumes `answer_end` is the exclusive end offset — confirm.
answer_starts = train.answer_start[0:3].to_list()
answer_ends = train.answer_end[0:3].to_list()
start_positions = []
end_positions = []

for i, offset in enumerate(inputs["offset_mapping"]):
    # Map this window back to the sample it was cut from.
    sample_idx = inputs["overflow_to_sample_mapping"][i]
    answer = answers[sample_idx]
    start_char = answer_starts[sample_idx]
    end_char = answer_ends[sample_idx]
    sequence_ids = inputs.sequence_ids(i)

    # Find the first and last context token (sequence id 1) of this window.
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    # The bounds check keeps the scan safe if no token follows the context.
    while idx < len(sequence_ids) and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # If the answer is not fully inside the context, label is (0, 0).
    # The window must start at or before the answer start AND end at or after
    # the answer end; a partial overlap is not enough.
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
        start_positions.append(0)
        end_positions.append(0)
    else:
        # Otherwise it's the start and end token positions.
        # Last context token that begins at or before the answer start.
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # First context token (scanning backwards) that ends at or after the answer end.
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

In [None]:
# Sanity check on feature 0: decode the token span given by the computed labels
# and print it next to the ground-truth answer text of the originating sample.
idx = 0
sample_idx = inputs["overflow_to_sample_mapping"][idx]
# NOTE(review): label-based lookup; presumably `train` has a default 0..n-1 index — confirm.
answer = train.answer[sample_idx]

start = start_positions[idx]
end = end_positions[idx]
# `end` is inclusive, hence the + 1 in the slice.
labeled_answer = tokenizer.decode(inputs["input_ids"][idx][start : end + 1])

print(f"Theoretical answer: \n{answer} \nlabels give: \n{labeled_answer}")

In [None]:
# Inspect feature 4: decode the entire window and print it next to the
# ground-truth answer of the sample it was cut from.
# NOTE(review): assumes the tokenizer produced at least five features — confirm.
idx = 4
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = train.answer[sample_idx]

decoded_example = tokenizer.decode(inputs["input_ids"][idx])
print(f"Theoretical answer: \n{answer} \ndecoded example: \n{decoded_example}")

In [None]:
# Display the tokenizer's `model_max_length` attribute.
tokenizer.model_max_length

In [None]:
# Module-level settings read by `preprocess_training_examples`.
# Length every feature is truncated/padded to.
max_length = tokenizer.model_max_length
# Passed as the tokenizer's `stride` argument (token overlap between consecutive windows).
stride = 128


def _find_answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Locate the answer inside one tokenized window.

    Args:
        offsets: per-token ``(char_start, char_end)`` pairs of the window.
        sequence_ids: per-token sequence id; context tokens carry ``1``.
        start_char: character offset where the answer starts in the context.
        end_char: character offset where the answer ends in the context.

    Returns:
        ``(start_token, end_token)`` (both inclusive) when the window's context
        fully contains the answer, otherwise ``(0, 0)``.
    """
    sequence_ids = list(sequence_ids)

    # Find the first and last context token of this window.
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    # The bounds check keeps the scan safe if no token follows the context.
    while idx < len(sequence_ids) and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # The answer must lie entirely inside the context span; a partial overlap
    # would otherwise be labelled with a token outside the answer.
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Last context token that begins at or before the answer start.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    # First context token (scanning backwards) that ends at or after the answer end.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token


def preprocess_training_examples(examples):
    """Tokenize a batch of QA examples and attach answer token positions.

    Reads the module-level ``tokenizer``, ``max_length`` and ``stride``. Each
    (question, context) pair is split into one or more windows padded to
    ``max_length``; only the context is ever truncated.

    Args:
        examples: batch mapping with the keys ``"question"``, ``"context"``,
            ``"answer_start"`` and ``"answer_end"``, each a list with one entry
            per example.
            NOTE(review): ``answer_end`` is assumed to be an exclusive
            character offset — confirm against the CSV.

    Returns:
        The tokenizer output without ``offset_mapping`` and
        ``overflow_to_sample_mapping``, extended with ``start_positions`` and
        ``end_positions`` (one entry per window; ``0``/``0`` when the window
        does not fully contain the answer).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # These two fields are only needed to compute the labels.
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    starts = examples["answer_start"]
    ends = examples["answer_end"]
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        # One example can produce several windows; look its answer up by sample.
        sample_idx = sample_map[i]
        start_token, end_token = _find_answer_token_span(
            offsets,
            inputs.sequence_ids(i),
            starts[sample_idx],
            ends[sample_idx],
        )
        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
from datasets import Dataset
# Wrap the pandas training frame in a `datasets.Dataset` so it can be processed with `.map`.
dataset = Dataset.from_pandas(train)

In [None]:
# Display the dataset summary.
dataset

In [None]:
# Tokenize and label the whole dataset in batches. All original columns are
# removed, so only the fields returned by `preprocess_training_examples` remain.
train_dataset = dataset.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=dataset.column_names,
)

In [None]:
from datasets import load_metric

# Load the SQuAD metric object.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases (the `evaluate`
# package replaces it) and `metric` is not used in the cells visible here — confirm it is needed.
metric = load_metric("squad")

In [None]:
# Display the summary of the tokenized training dataset.
train_dataset

In [None]:
# Keep only the rows whose id equals the id selected earlier (`n`).
df = train[train["id"] == n]

In [None]:
# Show the first rows of the filtered frame.
df.head()

In [None]:
import wandb

# Log in to Weights & Biases with the key stored in Kaggle Secrets under the
# label "WANDB"; fall back to an anonymous login when that is not possible.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("WANDB")
    wandb.login(key=api_key)
    anonymous = None
# Catch ordinary failures only (missing package, missing secret, no network);
# a bare `except:` would also swallow KeyboardInterrupt and SystemExit.
except Exception:
    wandb.login(anonymous='must')
    print('To use your W&B account,\nGo to Add-ons -> Secrets and provide your W&B access token. Use the Label name as WANDB. \nGet your W&B access token from here: https://wandb.ai/authorize')

# Build Model
We will use LongFormer backbone and add our own NER head using one hidden layer of size 256 and one final layer with softmax. We use 15 classes because we have a `B` class and `I` class for each of 7 labels. And we have an additional class (called `O` class) for tokens that do not belong to one of the 14 classes.

In [None]:
import torch
from transformers import AutoModelForQuestionAnswering

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load a question-answering model from the same checkpoint name used for the tokenizer.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

In [None]:
from transformers import TrainingArguments

# Training configuration: no evaluation during training, one checkpoint saved per epoch.
args = TrainingArguments(
    "distilbert-base-cased-distilled-squad",  # output directory for checkpoints
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    # Mixed precision is only enabled when a CUDA device is present, so the
    # notebook still runs (in full precision) on a CPU-only runtime instead of
    # failing while the arguments are constructed.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

In [None]:
from transformers import Trainer

# Build the trainer from the model, training arguments, tokenized training set
# and tokenizer, then run fine-tuning. No evaluation dataset is passed.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)
trainer.train()

In [None]:
# Persist the fine-tuned model.
# NOTE(review): `Trainer.save_model` takes an output directory, so "model.h5" presumably
# becomes a folder rather than a single HDF5 file — confirm this is what downstream code expects.
trainer.save_model("model.h5")