## Question Answering (Machine Reading Comprehension, MRC)

In [None]:
import os
import json
import logging
import collections
import numpy as np

from tqdm.auto import tqdm
from typing import Dict, List, Optional, Tuple
from torch.utils.data import Dataset
from transformers import Trainer
from transformers.trainer_utils import PredictionOutput, EvalPrediction
from datasets import Dataset, load_metric


'''
origin source: huggingface repository(https://github.com/huggingface/transformers/tree/master/examples/pytorch/)
'''

# Module-level logger; `postprocess_qa_predictions` sets its level from its `log_level` argument.
logger = logging.getLogger(__name__)


class QuestionAnsweringTrainer(Trainer):
    """`Trainer` variant that post-processes raw QA outputs before scoring them.

    The evaluation/prediction loop is run with `compute_metrics` temporarily
    disabled. Afterwards `post_process_function` converts the raw model outputs
    and its result is handed to `compute_metrics`.

    Args:
        eval_examples: Default examples used by `evaluate` when none are passed.
        post_process_function: Callable invoked as
            `post_process_function(examples, dataset, predictions)` by `evaluate`
            and with an extra `"predict"` positional argument by `predict`.
    """

    def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.eval_examples = eval_examples
        self.post_process_function = post_process_function
        print('[init] eval_examples:',eval_examples)

    def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"):
        """Run the evaluation loop, post-process the outputs and log prefixed metrics.

        Args:
            eval_dataset: Dataset to evaluate; defaults to `self.eval_dataset`.
            eval_examples: Examples matching `eval_dataset`; defaults to the
                examples given to the constructor.
            ignore_keys: Forwarded to the underlying evaluation loop.
            metric_key_prefix: Prefix applied to every metric key that does not
                already carry it.

        Returns:
            The metrics dict (empty when no post-processing function or no
            `compute_metrics` is configured).
        """
        eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset
        eval_dataloader = self.get_eval_dataloader(eval_dataset)
        # An explicitly supplied `eval_examples` must win over the constructor
        # default; test against None so an empty dataset is not treated as missing.
        eval_examples = self.eval_examples if eval_examples is None else eval_examples

        # Temporarily disable metric computation, we will do it in the loop here.
        compute_metrics = self.compute_metrics
        self.compute_metrics = None
        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
        try:
            output = eval_loop(
                eval_dataloader,
                description="Evaluation",
                # No point gathering the predictions if there are no metrics, otherwise we defer to
                # self.args.prediction_loss_only
                prediction_loss_only=True if compute_metrics is None else None,
                ignore_keys=ignore_keys,
            )
        finally:
            # Always restore the metric function, even if the loop raised.
            self.compute_metrics = compute_metrics

        if self.post_process_function is not None and self.compute_metrics is not None:
            eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions)
            metrics = self.compute_metrics(eval_preds)

            # Prefix all keys with metric_key_prefix + '_'
            for key in list(metrics.keys()):
                if not key.startswith(f"{metric_key_prefix}_"):
                    metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

            self.log(metrics)
        else:
            metrics = {}

        if self.args.tpu_metrics_debug or self.args.debug:
            # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
            # NOTE(review): `xm` and `met` are not defined in the visible part of this file;
            # they must be in scope (presumably the torch_xla modules) for this branch to work.
            xm.master_print(met.metrics_report())

        self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics)
        return metrics

    def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"):
        """Run the prediction loop and, when possible, post-process and score the outputs.

        Args:
            predict_dataset: Features to run the model on.
            predict_examples: Examples matching `predict_dataset`.
            ignore_keys: Forwarded to the underlying prediction loop.
            metric_key_prefix: Prefix applied to every metric key that does not
                already carry it.

        Returns:
            The raw loop output when no post-processing function or no
            `compute_metrics` is configured; otherwise a `PredictionOutput`
            built from the post-processed predictions and the prefixed metrics.
        """
        predict_dataloader = self.get_test_dataloader(predict_dataset)

        # Temporarily disable metric computation, we will do it in the loop here.
        compute_metrics = self.compute_metrics
        self.compute_metrics = None
        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
        try:
            output = eval_loop(
                predict_dataloader,
                description="Prediction",
                # No point gathering the predictions if there are no metrics, otherwise we defer to
                # self.args.prediction_loss_only
                prediction_loss_only=True if compute_metrics is None else None,
                ignore_keys=ignore_keys,
            )
        finally:
            # Always restore the metric function, even if the loop raised.
            self.compute_metrics = compute_metrics

        if self.post_process_function is None or self.compute_metrics is None:
            return output

        predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict")
        metrics = self.compute_metrics(predictions)

        # Prefix all keys with metric_key_prefix + '_'
        for key in list(metrics.keys()):
            if not key.startswith(f"{metric_key_prefix}_"):
                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

        return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics)

In [None]:
def postprocess_qa_predictions(
    examples,
    features,
    predictions: Tuple[np.ndarray, np.ndarray],
    version_2_with_negative: bool = False,
    n_best_size: int = 20,
    start_n_top: int = 5,
    end_n_top: int = 5,
    max_answer_length: int = 30,
    null_score_diff_threshold: float = 0.0,
    output_dir: Optional[str] = None,
    prefix: Optional[str] = None,
    log_level: Optional[int] = logging.WARNING,
    id_column_name:str="id",
    eid_column_name:str="example_id"
):
    """Turn start/end logits into answer substrings of each example's text.

    Args:
        examples: Original examples; must expose `id_column_name` and a `"text"`
            field (the context column name is hard-coded to `"text"` here).
        features: Tokenized features; each must expose `eid_column_name` and
            `"offset_mapping"` (optionally `"token_is_max_context"`).
        predictions: Pair `(start_logits, end_logits)`, one row per feature.
        version_2_with_negative: Whether an empty ("no answer") prediction is allowed.
        n_best_size: Number of top start/end indices combined, and number of
            candidates kept per example.
        start_n_top: Accepted for interface compatibility; not used by this function.
        end_n_top: Accepted for interface compatibility; not used by this function.
        max_answer_length: Maximum answer length in tokens.
        null_score_diff_threshold: Threshold on (null score - best non-null score)
            above which the empty answer is selected.
        output_dir: If given, predictions / n-best / null-odds JSON files are written there.
        prefix: Optional file-name prefix for the written files.
        log_level: Level applied to the module logger.
        id_column_name: Column of `examples` holding the example id.
        eid_column_name: Column of `features` holding the owning example's id.

    Returns:
        Tuple `(all_predictions, None)` where `all_predictions` maps example id
        to the chosen answer text.

    Raises:
        ValueError: If `predictions` is not a pair or its length differs from `features`.
        EnvironmentError: If `output_dir` is given but is not a directory.
    """
    if len(predictions) != 2:
        raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).")
    all_start_logits, all_end_logits = predictions

    if len(predictions[0]) != len(features):
        raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.")

    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples[id_column_name])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature[eid_column_name]]].append(i)

    # The dictionaries we have to fill.
    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    if version_2_with_negative:
        scores_diff_json = collections.OrderedDict()

    # Logging.
    logger.setLevel(log_level)
    logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_prediction = None
        prelim_predictions = []

        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]
            # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
            # available in the current feature.
            token_is_max_context = features[feature_index].get("token_is_max_context", None)

            # Update minimum null prediction.
            feature_null_score = start_logits[0] + end_logits[0]
            if min_null_prediction is None or min_null_prediction["score"] > feature_null_score:
                min_null_prediction = {
                    "offsets": (0, 0),
                    "score": feature_null_score,
                    "start_logit": start_logits[0],
                    "end_logit": end_logits[0],
                }
            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    # Don't consider answer that don't have the maximum context available (if such information is
                    # provided).
                    if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
                        continue
                    # Extra guard against empty offset entries (None was already excluded above).
                    if offset_mapping[start_index] and offset_mapping[end_index]:
                        prelim_predictions.append(
                            {
                                "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
                                "score": start_logits[start_index] + end_logits[end_index],
                                "start_logit": start_logits[start_index],
                                "end_logit": end_logits[end_index],
                            }
                        )

        if version_2_with_negative:
            # Add the minimum null prediction
            # Routed through the logger instead of stdout so it does not spam once per example.
            logger.debug("MIN_NULL: %s", min_null_prediction)
            prelim_predictions.append(min_null_prediction)
            null_score = min_null_prediction["score"]

        # Only keep the best `n_best_size` predictions.
        # (Named `nbest` so the `predictions` argument is not shadowed inside the loop.)
        nbest = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]

        # Add back the minimum null prediction if it was removed because of its low score.
        if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in nbest):
            nbest.append(min_null_prediction)

        # Use the offsets to gather the answer text in the original context.
        context = example["text"]
        for pred in nbest:
            offsets = pred.pop("offsets")
            pred["text"] = context[offsets[0] : offsets[1]]

        # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
        # failure.
        if len(nbest) == 0 or (len(nbest) == 1 and nbest[0]["text"] == ""):
            nbest.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0})

        # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
        # the LogSumExp trick).
        scores = np.array([pred.pop("score") for pred in nbest])
        exp_scores = np.exp(scores - np.max(scores))
        probs = exp_scores / exp_scores.sum()

        # Include the probabilities in our predictions.
        for prob, pred in zip(probs, nbest):
            pred["probability"] = prob

        # Pick the best prediction. If the null answer is not possible, this is easy.
        if not version_2_with_negative:
            all_predictions[example[id_column_name]] = nbest[0]["text"]
        else:
            # Otherwise we first need to find the best non-empty prediction.
            i = 0
            while nbest[i]["text"] == "":
                i += 1
            best_non_null_pred = nbest[i]

            # Then we compare to the null prediction using the threshold.
            score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"]
            scores_diff_json[example[id_column_name]] = float(score_diff)  # To be JSON-serializable.
            if score_diff > null_score_diff_threshold:
                all_predictions[example[id_column_name]] = ""
            else:
                all_predictions[example[id_column_name]] = best_non_null_pred["text"]

        # Make `predictions` JSON-serializable by casting np.float back to float.
        all_nbest_json[example[id_column_name]] = [
            {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()}
            for pred in nbest
        ]

    # If we have an output_dir, let's save all those dicts.
    if output_dir is not None:
        if not os.path.isdir(output_dir):
            raise EnvironmentError(f"{output_dir} is not a directory.")

        prediction_file = os.path.join(
            output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json"
        )
        nbest_file = os.path.join(
            output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json"
        )
        if version_2_with_negative:
            null_odds_file = os.path.join(
                output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
            )

        logger.info(f"Saving predictions to {prediction_file}.")
        with open(prediction_file, "w", encoding="utf-8") as writer:
            writer.write(json.dumps(all_predictions, indent=4) + "\n")
        logger.info(f"Saving nbest_preds to {nbest_file}.")
        with open(nbest_file, "w", encoding="utf-8") as writer:
            writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
        if version_2_with_negative:
            logger.info(f"Saving null_odds to {null_odds_file}.")
            with open(null_odds_file, "w", encoding="utf-8") as writer:
                writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

    return all_predictions, None


def get_train_example_fn(tokenizer, question_column_name='question', context_column_name='text', answer_column_name='answers', pad_on_right=True, max_seq_length=256, doc_stride=128, pad_to_max_length=True, **kwargs):
    """Build the batched function that converts QA examples into training features.

    The returned function tokenizes question/context pairs (truncated and padded
    to `max_seq_length`) and adds `start_positions` / `end_positions` token labels.
    Features whose span does not contain the answer, and examples without an
    answer, are labelled with the CLS token index.

    Note: the `pad_on_right` argument is overwritten from `tokenizer.padding_side`,
    and `doc_stride`, `pad_to_max_length` and `**kwargs` are accepted but not used
    (the tokenizer is called without `stride` and always with `padding="max_length"`).
    """
    pad_on_right = tokenizer.padding_side == "right"

    def prepare_train_features(examples):
        # Leading whitespace in questions wastes token budget, so it is stripped
        # (this rewrites the question column of the incoming batch in place).
        examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]

        # Question goes first when padding on the right, context first otherwise.
        if pad_on_right:
            first_column, second_column = question_column_name, context_column_name
        else:
            first_column, second_column = context_column_name, question_column_name

        # Overflowing tokens are kept, so one long example may yield several features.
        tokenized_examples = tokenizer(
            examples[first_column],
            examples[second_column],
            truncation=True,
            max_length=max_seq_length,
            #stride=doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )
        # feature index -> index of the example it was cut from
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # token -> (start_char, end_char) in the original text
        offset_mapping = tokenized_examples.pop("offset_mapping")

        # Sequence id carried by context tokens in the encoded pair.
        context_seq_id = 1 if pad_on_right else 0

        start_positions = []
        end_positions = []

        for feature_idx, offsets in enumerate(offset_mapping):
            input_ids = tokenized_examples["input_ids"][feature_idx]
            # Impossible answers are labelled with the CLS token position.
            cls_index = input_ids.index(tokenizer.cls_token_id)
            sequence_ids = tokenized_examples.sequence_ids(feature_idx)
            answers = examples[answer_column_name][sample_mapping[feature_idx]]

            # No answer given: point both labels at CLS.
            if len(answers["answer_start"]) == 0:
                start_positions.append(cls_index)
                end_positions.append(cls_index)
                continue

            # Character span of the (first) answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # First context token of this feature.
            left = 0
            while sequence_ids[left] != context_seq_id:
                left += 1

            # Last context token of this feature.
            right = len(input_ids) - 1
            while sequence_ids[right] != context_seq_id:
                right -= 1

            # Answer not fully inside this feature's span: label with CLS.
            answer_inside = offsets[left][0] <= start_char and offsets[right][1] >= end_char
            if not answer_inside:
                start_positions.append(cls_index)
                end_positions.append(cls_index)
                continue

            # Walk both cursors inwards to the answer boundaries.
            # Note: `left` may run past the last offset when the answer is the last word.
            while left < len(offsets) and offsets[left][0] <= start_char:
                left += 1
            start_positions.append(left - 1)
            while offsets[right][1] >= end_char:
                right -= 1
            end_positions.append(right + 1)

        tokenized_examples["start_positions"] = start_positions
        tokenized_examples["end_positions"] = end_positions
        return tokenized_examples

    return prepare_train_features


def get_val_example_fn(tokenizer, question_column_name='question', context_column_name='context',  answer_column_name='answers', id_column_name="id", eid_column_name='example_id', pad_on_right=True, max_seq_length=256, doc_stride=128, pad_to_max_length=True):
    """Build the batched function that converts QA examples into validation features.

    The returned function tokenizes question/context pairs, records for every
    feature the id of the example it came from (under `eid_column_name`) and
    replaces the offset mapping of non-context tokens with `None`.

    Note: the `pad_on_right` argument is overwritten from `tokenizer.padding_side`,
    and `answer_column_name`, `doc_stride` and `pad_to_max_length` are accepted but
    not used (the tokenizer is called without `stride` and always with
    `padding="max_length"`).
    """
    pad_on_right = tokenizer.padding_side == "right"

    def prepare_validation_features(examples):
        # Leading whitespace in questions wastes token budget, so it is stripped
        # (this rewrites the question column of the incoming batch in place).
        examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]

        # Question goes first when padding on the right, context first otherwise.
        if pad_on_right:
            first_column, second_column = question_column_name, context_column_name
        else:
            first_column, second_column = context_column_name, question_column_name

        # Overflowing tokens are kept, so one long example may yield several features.
        tokenized_examples = tokenizer(
            examples[first_column],
            examples[second_column],
            truncation=True,
            max_length=max_seq_length,
            #stride=doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )

        # feature index -> index of the example it was cut from
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # Post-processing needs to know which example each feature belongs to.
        tokenized_examples[eid_column_name] = []

        # Sequence id carried by context tokens in the encoded pair.
        context_index = 1 if pad_on_right else 0

        feature_count = len(tokenized_examples["input_ids"])
        for feature_idx in range(feature_count):
            sequence_ids = tokenized_examples.sequence_ids(feature_idx)

            owner = sample_mapping[feature_idx]
            tokenized_examples[eid_column_name].append(examples[id_column_name][owner])

            # Keep offsets only for context tokens; everything else becomes None so
            # it is easy to tell later whether a token position is inside the context.
            raw_offsets = tokenized_examples["offset_mapping"][feature_idx]
            masked_offsets = []
            for token_pos, span in enumerate(raw_offsets):
                if sequence_ids[token_pos] == context_index:
                    masked_offsets.append(span)
                else:
                    masked_offsets.append(None)
            tokenized_examples["offset_mapping"][feature_idx] = masked_offsets

        return tokenized_examples

    return prepare_validation_features


def get_post_process_fn(
        answer_column_name:str='answers',
        version_2_with_negative:bool=False,
        n_best_size:int=20,
        start_n_top: int = 5,
        end_n_top: int = 5,
        max_answer_length:int=30,
        null_score_diff_threshold:float=0.0,
        output_dir:str='./',
        log_level=logging.WARNING,
        prefix: str='eval',
        id_column_name:str="id",
        eid_column_name:str="example_id",
        do_beam_search:bool=False
):
    """Build the post-processing callable used to score QA predictions.

    The returned function calls `postprocess_qa_predictions` with the settings
    captured here and wraps the result in an `EvalPrediction` whose
    `predictions` are `{"id", "prediction_text"}` dicts (plus
    `"no_answer_probability": 0.0` when `version_2_with_negative` is set) and
    whose `label_ids` are `{"id", "answers"}` reference dicts.

    Args:
        answer_column_name: Column of the examples holding the reference answers.
        start_n_top, end_n_top: Forwarded to `postprocess_qa_predictions`.
        output_dir: Directory the prediction files are written to (defaults to `'./'`).
        prefix: Default file-name prefix / stage name; can be overridden per call
            through the returned function's `stage` argument.
        do_beam_search: Accepted but not used.
        Remaining arguments are forwarded unchanged to `postprocess_qa_predictions`.
    """
    def post_processing_function(examples, features, predictions, stage=prefix):
        # Post-processing: we match the start logits and end logits to answers in the original context.
        postprocess_func = postprocess_qa_predictions
        predictions, _ = postprocess_func(
            examples=examples,
            features=features,
            predictions=predictions,
            version_2_with_negative=version_2_with_negative,
            n_best_size=n_best_size,
            max_answer_length=max_answer_length,
            null_score_diff_threshold=null_score_diff_threshold,
            output_dir=output_dir,
            log_level=log_level,
            # Forward the configured values instead of hard-coded 5s.
            start_n_top=start_n_top,
            end_n_top=end_n_top,
            prefix=stage,
            id_column_name=id_column_name,
            eid_column_name=eid_column_name,
        )
        # Format the result to the format the metric expects.
        if version_2_with_negative:
            formatted_predictions = [
                {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
            ]
        else:
            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]

        references = [{"id": ex[id_column_name], "answers": ex[answer_column_name]} for ex in examples]
        return EvalPrediction(predictions=formatted_predictions, label_ids=references)

    return post_processing_function

In [None]:
from sklearn.metrics import accuracy_score

def get_extQA_dataset(df_data):
    """Convert the discourse annotation frame into an extractive-QA `Dataset`.

    For every row the essay text is loaded from `TRAIN_DIR`, the answer is
    rebuilt from the word indices in `predictionstring`, and a question of the
    form "What is the <order> '<discourse_type>' statement?" is generated.

    Args:
        df_data: DataFrame with `id`, `discourse_id`, `predictionstring`
            (space-separated word indices), `discourse_type` and
            `discourse_type_num` columns. It is modified in place (`text`,
            `answer`, `order`, ... columns are added and `predictionstring` is
            replaced by lists of ints).

    Returns:
        `Dataset` with columns `id`, `discourse_id`, `text`, `answers`, `question`.
    """
    # Local import: at file level `re` is only imported in a later cell.
    import re

    # NOTE(review): "forth" (sic) is kept byte-for-byte because it ends up in the
    # question text; presumably an existing checkpoint was trained on it — confirm
    # before correcting the spelling.
    map_dict={1:"first", 2:"second", 3:"third", 4:"forth", 5:"fifth"}
    df_data['text'] = df_data['id'].apply(lambda _id: load_text(os.path.join(TRAIN_DIR, f'{_id}.txt')))
    df_data['predictionstring'] = df_data['predictionstring'].apply(lambda s: list(map(int,s.split(" "))))
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    df_data['text'] = df_data['text'].apply(lambda x: re.sub(r'\s',' ',x))

    # Rebuild each answer from the selected word indices; a set makes the
    # membership test O(1) per word instead of scanning the index list.
    answer_texts = []
    for text, word_ids in zip(df_data['text'], df_data['predictionstring']):
        wanted = set(word_ids)
        answer_texts.append(" ".join(t for i, t in enumerate(text.split()) if i in wanted))
    df_data['answer'] = answer_texts

    # NOTE(review): `find` returns the first occurrence, and -1 when the re-joined
    # answer does not occur verbatim (e.g. several consecutive spaces in the text).
    starts = [context.find(answer) for context, answer in zip(df_data['text'], df_data['answer'])]
    ends = [ sidx+len(answer) for sidx,answer in zip(starts, df_data['answer'])]
    df_data['start_positions'],df_data['end_positions'] = starts,ends

    df_data['order'] = df_data['discourse_type_num'].apply(lambda s:int(s.split()[-1]))
    df_data['order'] = df_data['order'].apply(lambda i: map_dict[i] if i in map_dict else f"{i}th")
    df_data['question'] = [ f"What is the {order} '{d_type}' statement?" for order,d_type in zip(df_data['order'], df_data['discourse_type'])]
    # Explicit copy so the column assignments below act on an independent frame
    # rather than on a selection of the caller's frame (avoids SettingWithCopy).
    df_data = df_data[['discourse_id', 'text', 'question', 'answer', 'start_positions', 'end_positions']].copy()
    df_data['answers'] = [{"answer_start":[sp],"text":[ans]} for sp, ans in zip(df_data["start_positions"],df_data['answer'])]
    df_data['id'] = df_data.index
    dataset = Dataset.from_pandas(df_data[["id", "discourse_id", "text", "answers", "question"]])
    return dataset
    

def qa_metric(p):
    """Return the accuracy of `preds` against `labels` for a `(preds, labels)` pair.

    Array predictions are reduced first: a single-column 2-D array is flattened
    to its only column, and an array with one more dimension than `labels` is
    arg-maxed over its last axis. Tuple/list predictions are scored as given.
    """
    preds, labels = p
    if not isinstance(preds, (tuple, list)):
        pred_rank = len(preds.shape)
        if pred_rank == 2 and preds.shape[1] == 1:
            preds = preds[:, 0]
        elif pred_rank - len(labels.shape) == 1:
            preds = np.argmax(preds, axis=-1)
    return accuracy_score(labels, preds)

## Token Classification

In [None]:
import os
import re
import numpy as np
from tqdm.auto import tqdm
from collections import defaultdict, Counter
from datasets import Dataset, load_metric
from sklearn.metrics import classification_report


def load_text(path):
    """Read the file at `path` as UTF-8 and return its full contents."""
    with open(path, mode='r', encoding='utf-8') as handle:
        return handle.read()


def processing_text(text, do_lower_case=False, remove_marks=False):
    """Normalise a piece of text; non-string input yields an empty string.

    Steps, in order:
      1. every `" ."` is replaced by `". "`;
      2. with `remove_marks`, every `",."` that directly follows an ASCII
         letter is removed;
      3. with `do_lower_case`, the text is split on whitespace, each token
         whose characters after the first are already lower case is
         lower-cased (tokens such as 'NASA' are kept), and the tokens are
         re-joined with single spaces.
    """
    if not isinstance(text, str):
        return ""
    text = text.replace(" .", ". ")
    if remove_marks:
        # Raw string: '\.' in a normal literal is an invalid escape sequence.
        # NOTE(review): the pattern only matches a comma immediately followed by a
        # period; if the intent was "comma or period", it should be `[,.]` — confirm.
        text = re.sub(r"(?<=[a-zA-Z]),\.", "", text)
    if do_lower_case:
        # Lower-case a word only when everything after its first character is
        # already lower case, so all-caps proper nouns like 'NASA' survive.
        text = " ".join(
            token.lower() if token[1:] == token[1:].lower() else token
            for token in text.split()
        )
    return text


def get_train_data(df_data, text_dir, label_to_ids, do_lower_case=False, remove_marks=False, do_data_manuplating=False):
    """Build a token-classification `Dataset` with one row per essay id.

    For each `id` group the essay `<text_dir>/<id>.txt` is split on whitespace,
    every word starts with the "O" label, and for each annotation the first
    listed word index gets `B-<discourse_type>` and the remaining indices get
    `I-<discourse_type>`.

    Args:
        df_data: DataFrame with `id`, `discourse_type` and `predictions`
            columns (`predictions` is indexed like a sequence of word indices).
        text_dir: Directory containing the `<id>.txt` essay files.
        label_to_ids: Mapping from label name ("O", "B-...", "I-...") to label id.
        do_lower_case, remove_marks, do_data_manuplating: Accepted but not used;
            `processing_text` is called with its defaults.

    Returns:
        `Dataset` with `id`, `ner_tags` and `tokens` columns.
    """
    data = defaultdict(list)
    for _id, df in tqdm(df_data.groupby('id')):
        origin_text = load_text(os.path.join(text_dir,f'{_id}.txt'))
        tokens = origin_text.split()
        # Every word starts as "outside"; annotated spans overwrite it below.
        labels = np.full(len(tokens), fill_value=label_to_ids["O"])
        for _type, ids in zip(df['discourse_type'], df['predictions']):
            begin_id, inside_id = label_to_ids[f"B-{_type}"], label_to_ids[f"I-{_type}"]
            labels[ids[0]] = begin_id
            labels[ids[1:]] = inside_id

        data['id'].append(_id)
        data['ner_tags'].append(labels)
        data['tokens'].append([processing_text(token) for token in tokens])

    return Dataset.from_dict(data)


def get_example_fn(tokenizer, max_seq_len=1024,truncation=True, padding="max_length", ignore_ids=-100):
    """Build the batched function that tokenizes word lists and aligns word labels.

    The returned function tokenizes `examples["tokens"]` (pre-split words). When
    `examples` has an `"ner_tags"` entry it also adds a `"labels"` entry in which
    special tokens (word id `None`) get `ignore_ids` and every other sub-token
    gets the label of the word it belongs to.

    Args:
        tokenizer: Tokenizer called with `is_split_into_words=True`; its output
            must provide `word_ids(batch_index=...)`.
        max_seq_len: Passed as `max_length`.
        truncation: Passed as `truncation`.
        padding: Passed as `padding`.
        ignore_ids: Label assigned to tokens that do not belong to any word.
    """
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(examples["tokens"], truncation=truncation, max_length=max_seq_len, padding=padding, is_split_into_words=True)
        if "ner_tags" not in examples:
            return tokenized_inputs
        labels = []
        for i, label in enumerate(examples["ner_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            # Every sub-token of a word receives that word's label. The former
            # `previous_word_idx` comparison was never updated and therefore
            # always true, so it is dropped without changing the labels produced.
            labels.append(
                [ignore_ids if word_idx is None else label[word_idx] for word_idx in word_ids]
            )

        tokenized_inputs["labels"] = labels
        return tokenized_inputs
    return tokenize_and_align_labels


def sub2word(y_preds, dataset, tokenizer, skip_special_tokens=True, sub_prefix="▁"):
    """Collapse sub-token predictions to one prediction per word.

    For every sequence the ids in `dataset['input_ids']` are converted back to
    token strings, and the prediction paired with the first sub-token of each
    word (a token starting with `sub_prefix`) is kept.

    Args:
        y_preds: Iterable of per-token prediction sequences, one per row.
        dataset: Mapping-like object with an `'input_ids'` entry.
        tokenizer: Object providing `convert_ids_to_tokens(ids, skip_special_tokens=...)`.
        skip_special_tokens: Forwarded to `convert_ids_to_tokens`.
        sub_prefix: Marker that starts a new word.

    Returns:
        A 2-D array when every row yields the same number of words; otherwise a
        1-D object array holding one array per row.

    NOTE(review): predictions are zipped positionally with the *filtered*
    tokens, so this presumably expects `y_preds` to already exclude
    special-token positions when `skip_special_tokens` is True — confirm.
    """
    outputs = []
    for y_pred, input_ids in zip(y_preds, dataset['input_ids']):
        filtered_tokens = tokenizer.convert_ids_to_tokens(input_ids, skip_special_tokens=skip_special_tokens)
        word_preds = []
        for pred, token in zip(y_pred, filtered_tokens):
            # A prefixed token opens a new word. The very first token also opens
            # one, so a sequence whose first token lacks the prefix no longer
            # raises IndexError.
            if token.startswith(sub_prefix) or not word_preds:
                word_preds.append(pred)
        outputs.append(np.array(word_preds))

    # Equal-length rows stack into a regular 2-D array as before.
    if len({len(row) for row in outputs}) <= 1:
        return np.array(outputs)

    # Ragged rows: build the object array explicitly, because np.array() on
    # ragged input raises ValueError on NumPy >= 1.24.
    ragged = np.empty(len(outputs), dtype=object)
    for row_idx, row in enumerate(outputs):
        ragged[row_idx] = row
    return ragged


def get_predictions(y_preds, ids, label_to_ids):
    """Locate, per sequence and per class, the positions predicted as that class.

    Label names are grouped by the part after the last '-' (so "B-Lead" and
    "I-Lead" both belong to "Lead"). For each `(id, prediction)` pair the result
    holds one dict `{id: [(class_name, positions), ...]}` where `positions` are
    the indices whose predicted label id belongs to that class.
    """
    ids_by_class = defaultdict(list)
    for label_name, label_id in label_to_ids.items():
        class_name = label_name.split('-')[-1]
        ids_by_class[class_name].append(label_id)

    results = []
    for _id, pred in zip(ids, y_preds):
        per_class = []
        for class_name, class_ids in ids_by_class.items():
            positions = np.where(np.isin(pred, class_ids))[0].tolist()
            per_class.append((class_name, positions))
        results.append({_id: per_class})
    return results


def get_continuous_series(ids, tol=2):
    """Split a sequence of indices into runs of (nearly) consecutive values.

    A value `v` extends the current run when the run's last element `last`
    satisfies `v - tol <= last < v + tol`; otherwise `v` starts a new run.
    Within that window, a value smaller than `last` is dropped, while an equal
    value is appended.

    Args:
        ids: Iterable of integer indices. It is not modified.
        tol: Maximum forward gap that still counts as the same run.

    Returns:
        List of runs (lists); an empty input yields an empty list.
    """
    # Work on a copy so the caller's list is left untouched.
    values = list(ids)
    if not values:
        return []

    outputs = [[values[0]]]
    for _id in values[1:]:
        last = outputs[-1][-1]
        if not (_id - tol <= last < _id + tol):
            # Too far from the current run: open a new one.
            outputs.append([_id])
        elif last <= _id:
            outputs[-1].append(_id)
        # else: value steps backwards inside the window -> skipped.
    return outputs
    
    
def pred2sub(predictions):
    """Convert per-class position lists into a submission-style DataFrame.

    Args:
        predictions: Iterable of dicts `{id: [(class_name, positions), ...]}`,
            i.e. the structure returned by `get_predictions`.

    Returns:
        DataFrame with `id`, `class` and `predictionstring` columns, one row per
        run of positions, sorted by id and by the first position of each run.
        The frame is empty (but has those columns) when nothing was predicted.
    """
    sub = defaultdict(list)
    for pred in predictions:
        for _id,cat_ids in pred.items():
            for cat,ids in cat_ids:
                if (not ids) or (cat=="O"): # skip empty classes and the outside ("O") class
                    continue
                # Pass a copy so the caller's position lists are never altered.
                for seq in get_continuous_series(list(ids)):
                    sub['id'].append(_id)
                    sub['class'].append(cat)
                    sub['predictionstring'].append(
                        " ".join([str(i) for i in range(seq[0], seq[-1]+1)]))
    # Fixing the columns keeps the code below working when no row was collected
    # (previously `sub['predictionstring']` raised KeyError on an empty frame).
    sub = pd.DataFrame(dict(sub), columns=['id', 'class', 'predictionstring'])
#    sub = sub[sub['predictionstring'].apply(lambda x: len(x.split())) > 4]

    # Ordering for checking the final prediction outputs (whether correctly predicted or not; if "Lead" outcomes last position, it is incorrect.)
    sub['order'] = sub['predictionstring'].apply(lambda s: int(s.split()[0]))
    sub = sub.sort_values(by=['id','order'])
    return sub.reset_index().drop(['index', 'order'], axis=1)


def get_all_path(path):
    """Recursively collect the paths of all non-directory entries below `path`."""
    collected = []
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if not os.path.isdir(full_path):
            collected.append(full_path)
            continue
        # Descend into sub-directories and splice their files in place.
        collected.extend(get_all_path(full_path))
    return collected


def get_test_dataset(path, do_lower_case=False, remove_marks=False):
    """Build a `Dataset` with `id` and `tokens` columns from every file below `path`.

    Each file is split on whitespace; its id is the file name with ".txt"
    removed. `do_lower_case` and `remove_marks` are accepted but not used:
    `processing_text` is applied to each word with its defaults.
    """
    data = defaultdict(list)
    for file_path in get_all_path(path):
        words = load_text(file_path).split()
        essay_id = os.path.basename(file_path).replace(".txt", "")
        data['id'].append(essay_id)
        data['tokens'].append([processing_text(word) for word in words])
    return Dataset.from_dict(data)


def token_cls_metric(p):
    """Average per-sequence classification-report scores over a `(predictions, labels)` pair.

    Predictions are arg-maxed over the last axis, positions whose label is -100
    are dropped, and label ids are mapped to names through the module-level
    `label_list`. A classification report is computed for every sequence and
    the accuracy and macro F1 / recall / precision are averaged across sequences.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=-1)

    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        # Keep only positions that carry a real label.
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(prediction, label) if gold_id != -100]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    outputs = defaultdict(list)
    for real, pred in zip(true_labels, true_predictions):
        report = classification_report(real, pred, output_dict=True)
        macro = report['macro avg']
        outputs['accuracy'].append(report['accuracy'])
        outputs['f1-score(macro)'].append(macro['f1-score'])
        outputs['recall(macro)'].append(macro['recall'])
        outputs['precision(macro)'].append(macro['precision'])

    averaged = {}
    for name, values in outputs.items():
        averaged[name] = sum(values)/len(values)
    return averaged

## Pipelines

In [None]:
import os
import pandas as pd
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, AutoModelForQuestionAnswering,TrainingArguments, Trainer, EarlyStoppingCallback
import torch


# Per-device batch size; used for both training and evaluation in the TrainingArguments below.
BATCH = 4
# Maximum sequence length handed to get_example_fn (as max_seq_len).
MAX_SEQ_LEN = 1024

# Competition input locations: TRAIN_DIR is passed to get_train_data, TRAIN is read
# with pd.read_csv, and TEST is passed to get_test_dataset.
TRAIN_DIR = '/kaggle/input/feedback-prize-2021/train'
TRAIN = '/kaggle/input/feedback-prize-2021/train.csv'
TEST = '/kaggle/input/feedback-prize-2021/test'
# NOTE(review): unlike TRAIN this path has no ".csv" extension and it is not used
# in the cells visible here — confirm it is correct before relying on it.
SUB = '/kaggle/input/feedback-prize-2021/sample_submission'

# Checkpoint directory; the tokenizer and the token-classification model are both loaded from it.
# Presumably a Longformer already fine-tuned on the MRC task (going by the directory name) — TODO confirm.
MODEL = '../input/longformermrctrained'

# NOTE(review): add_prefix_space=True is presumably required because the essays are
# fed to the tokenizer as pre-split word lists — confirm against get_example_fn.
tokenizer = AutoTokenizer.from_pretrained(MODEL, add_prefix_space=True)

# Disabled stage: the triple-quoted string below holds the extractive-QA (MRC)
# fine-tuning pipeline. It is a bare string expression, so none of it runs;
# in particular the `MODEL = './mrc-trained/'` reassignment at its end does not
# take effect and MODEL keeps the value assigned above.
'''
df_train = pd.read_csv(TRAIN)

# MRC
torch.cuda.empty_cache()

dataset = get_extQA_dataset(df_train)
dataset = dataset.train_test_split(0.1)

qa_train_fn = get_train_example_fn(
    tokenizer=tokenizer,
    question_column_name="question",
    context_column_name="text",
    answer_column_name="answers",
    max_seq_length=1024,
    doc_stride=None,
)

qa_eval_fn = get_val_example_fn(
    tokenizer=tokenizer,
    question_column_name="question",
    context_column_name="text",
    answer_column_name="answers",
    id_column_name='id',
    eid_column_name='discourse_id',
    max_seq_length=1024,
    doc_stride=-1,
)

import random 

train_dataset = dataset['train'].map(qa_train_fn, batched=True, remove_columns=dataset['train'].column_names)
eval_dataset = dataset['test'].map(qa_eval_fn, batched=True, remove_columns=dataset['test'].column_names).select(random.sample(range(dataset['test'].num_rows), 1000))

model = AutoModelForQuestionAnswering.from_pretrained(MODEL)

training_args = TrainingArguments(
    output_dir='./mrc',
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",
    report_to='tensorboard',
    learning_rate=1e-5,
    logging_steps=3000,
    eval_steps=3000,
    save_steps=3000,
    save_total_limit=3,
    num_train_epochs=3,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    weight_decay=0.01,
    metric_for_best_model="f1",
    load_best_model_at_end=True
)

process_fn = get_post_process_fn(
        answer_column_name='answers',
        version_2_with_negative=False,
        n_best_size=20,
        start_n_top = 5,
        end_n_top = 5,
        max_answer_length = 30,
        null_score_diff_threshold=0.0,
        output_dir='./',
        log_level=logging.WARNING,
        prefix='eval',
        id_column_name="id",
        eid_column_name="discourse_id",
        do_beam_search=False
)


trainer = QuestionAnsweringTrainer(
    post_process_function=process_fn,
    model=model,
    args=training_args,
    compute_metrics=qa_metric,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    eval_examples=dataset['test'],
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


trainer.train()
trainer.save_model('./mrc-trained/')

MODEL = './mrc-trained/'
'''
# Token Classification
# Fine-tunes a token-classification head on the training essays, saves the
# resulting model to "best/" and runs a final evaluation on the held-out split.
torch.cuda.empty_cache()

df_train = pd.read_csv(TRAIN)
# Parse each whitespace-separated `predictionstring` into a list of ints.
df_train['predictions']  = df_train['predictionstring'].apply(lambda s:list(map(int, s.split())))

# Label vocabulary: "O" first, then a "B-<type>" / "I-<type>" pair for every
# discourse type, with the types taken in sorted order.
label_list = ["O"]+[f"{prefix}-{name}" for name in sorted(df_train.discourse_type.unique()) for prefix in ("B", "I")]
# Inverse lookup: label name -> integer id (its index in label_list).
label_to_ids = {label:i for i,label in enumerate(label_list)}

dataset = get_train_data(df_train, TRAIN_DIR, label_to_ids, do_lower_case=False, remove_marks=False)

# Per-batch preprocessing function; ignore_ids=-100 matches the label value
# that token_cls_metric filters out.
example_func = get_example_fn(
    tokenizer=tokenizer, 
    max_seq_len=MAX_SEQ_LEN,
    truncation=True, 
    padding="max_length", 
    ignore_ids=-100
)

dataset = dataset.map(example_func, batched=True)

# NOTE(review): the flags passed here (True/True) differ from the training call
# above (False/False). get_test_dataset as defined in this file never reads
# either parameter, so they currently have no effect — confirm the intent.
test_dataset = get_test_dataset(TEST, do_lower_case=True, remove_marks=True)
test_dataset = test_dataset.map(example_func, batched=True)

# Hold out a 0.1 fraction as the "test" (evaluation) split; no seed is passed here.
dataset = dataset.train_test_split(0.1)
data_collator = DataCollatorForTokenClassification(tokenizer)


model = AutoModelForTokenClassification.from_pretrained(MODEL, num_labels=len(label_list))

# NOTE(review): max_steps=100 is set together with num_train_epochs=5, and the
# evaluation/save/logging interval (3000 steps) is larger than max_steps, so the
# step-based evaluation and checkpointing presumably never trigger during
# training — confirm this is the intended "submission" configuration.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",
    report_to='tensorboard',
    learning_rate=1e-5,
    logging_steps=3000,
    max_steps=100, # For submission
    eval_steps=3000,
    save_steps=3000,
    save_total_limit=4,
    num_train_epochs=5, # setting for inference from checkpoint
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    weight_decay=0.01,
    # Key produced by token_cls_metric.
    metric_for_best_model="f1-score(macro)",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=token_cls_metric,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()
trainer.save_model("best/")
# Final evaluation on dataset["test"]; as the last statement before the next
# cell its return value is not stored.
trainer.evaluate()

## Predictions

In [None]:
# Run the trained model over the test set and keep, for every position, the
# id of the highest-scoring label.
preds = np.argmax(trainer.predict(test_dataset).predictions, axis=-1)
# sub2word's sub_prefix is the tokenizer's sub-token prefix marker:
# "▁" for bigbird, "Ġ" for longformer.
# Presumably sub2word folds sub-token predictions back to word level and
# get_predictions groups them per essay id — confirm against their definitions.
preds = get_predictions(
    sub2word(preds, test_dataset, tokenizer, skip_special_tokens=True, sub_prefix="Ġ"),
    test_dataset['id'],
    label_to_ids,
)
# Convert to the submission frame and write it next to the notebook.
df_sub = pred2sub(preds)
df_sub.to_csv('submission.csv', index=False)
# Last expression of the cell: displays the frame in the notebook.
df_sub