In [None]:
import pandas as pd
import gc
gc.enable()
from tqdm import tqdm
import os
# Read every essay in the test folder into memory.
#   test_names      -> essay ids (file name without the '.txt' suffix)
#   test_texts      -> raw essay text
#   test_texts_list -> essay text split on whitespace (one entry per word)
TEST_DIR = '../input/feedback-prize-2021/test'
test_names, test_texts = [], []
for f in tqdm(list(os.listdir(TEST_DIR))):
    test_names.append(f.replace('.txt', ''))
    # `with` closes the handle deterministically instead of leaking one open file per essay.
    with open(os.path.join(TEST_DIR, f), 'r', encoding='utf-8') as essay_file:
        test_texts.append(essay_file.read())
test_texts_list = [t.split() for t in test_texts]
test_text_df = pd.DataFrame({'id': test_names, 'text_list': test_texts_list})

In [None]:
from datasets import Dataset

# Wrap the essay frame (columns 'id' and 'text_list') in a `datasets.Dataset` so it can be tokenized with `.map`.
test_datasets = Dataset.from_pandas(test_text_df)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer stored next to the fold-0 checkpoint; the same tokenizer is reused for every fold.
# NOTE(review): `add_prefix_space=True` is presumably needed because the essays are passed as
# pre-split word lists (`is_split_into_words=True`) — confirm against the tokenizer documentation.
tokenizer = AutoTokenizer.from_pretrained('../input/longformerlstm/best_model/longformer-large-LSTM_fold0', add_prefix_space=True)
# Side channel filled by `preparing_test_dataset`: for every tokenized essay it stores the
# token -> word index mapping ('word_ids') and the essay id ('id'), in processing order.
eval_datasets = {'word_ids':[], 'id':[]}
def preparing_test_dataset(examples):
    """Tokenize a batch of pre-split essays and record their word alignment.

    Args:
        examples: batch mapping with 'text_list' (one list of words per essay)
            and 'id' (one essay id per essay).

    Returns:
        The tokenizer encoding for the batch (truncated to 4096 tokens, unpadded).

    Side effects:
        Appends, for every essay in the batch, its token -> word index list to
        ``eval_datasets['word_ids']`` and its id to ``eval_datasets['id']``.
    """
    encoding = tokenizer(
        examples['text_list'],
        truncation=True,
        padding=False,
        max_length = 4096,
        is_split_into_words=True,
    )
    for row, _ in enumerate(encoding['input_ids']):
        # word_ids() maps each token of row `row` to the index of the word it came from
        # (None for special tokens).
        eval_datasets['word_ids'].append(encoding.word_ids(batch_index=row))
        eval_datasets['id'].append(examples['id'][row])
    return encoding

# Tokenize all essays in batches. The original columns are removed so only the tokenizer outputs
# remain; essay ids and word alignments are kept in `eval_datasets` by the mapped function.
tokenized_test_datasets = test_datasets.map(preparing_test_dataset, batched=True, remove_columns=test_datasets.column_names)

In [None]:
from transformers import DataCollatorForTokenClassification
from dataclasses import dataclass
import torch

@dataclass
class CustomDatacollator(DataCollatorForTokenClassification):
    """Collator that pads `input_ids`, `attention_mask` and (if present) labels to the batch maximum.

    Only these keys are kept; any other feature column is dropped from the batch.
    Padding is applied on the side given by ``self.tokenizer.padding_side``.
    """

    def torch_call(self, features):
        """Build a padded batch of int64 tensors from a list of feature dicts.

        Args:
            features: list of dicts holding 'input_ids' and 'attention_mask' lists,
                and optionally 'label' or 'labels'.

        Returns:
            Dict of `torch.long` tensors of shape (batch, longest sequence in batch).
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None

        input_ids = [feature['input_ids'] for feature in features]
        attention_mask = [feature['attention_mask'] for feature in features]

        batch_max = max(len(ids) for ids in input_ids)
        pad_right = self.tokenizer.padding_side == "right"

        def pad(sequence, fill_value):
            # Extend one sequence to `batch_max` on the tokenizer's padding side.
            sequence = list(sequence)
            filler = [fill_value] * (batch_max - len(sequence))
            return sequence + filler if pad_right else filler + sequence

        batch = {
            'input_ids': [pad(ids, self.tokenizer.pad_token_id) for ids in input_ids],
            'attention_mask': [pad(mask, 0) for mask in attention_mask],
        }
        if labels is not None:
            batch[label_name] = [pad(label, self.label_pad_token_id) for label in labels]

        # int64 matches what the stock Hugging Face collators emit; int32 label tensors are
        # rejected by CrossEntropyLoss, which expects long class indices.
        return {k: torch.tensor(v, dtype=torch.long) for k, v in batch.items()}

In [None]:
from transformers import LongformerPreTrainedModel, LongformerModel
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from collections import OrderedDict
from typing import Optional, Tuple, Any
from dataclasses import dataclass, fields
import numpy as np


_CHECKPOINT_FOR_DOC = "allenai/longformer-base-4096"
_CONFIG_FOR_DOC = "LongformerConfig"
_TOKENIZER_FOR_DOC = "LongformerTokenizer"


def is_tensor(x):
    """Return True when `x` is a `torch.Tensor` or a `numpy.ndarray`, False otherwise."""
    return isinstance(x, (torch.Tensor, np.ndarray))

class ModelOutput(OrderedDict):
    """
    Base class for all model outputs as dataclass.

    Has a `__getitem__` that allows indexing by integer or slice (like a tuple) or by string (like a
    dictionary); both views skip attributes that are `None`. Otherwise it behaves like a regular python
    dictionary, except that `__delitem__`, `setdefault`, `pop` and `update` are disabled.

    Note: you can't unpack a `ModelOutput` directly. Use the `to_tuple` method to convert it to a tuple first.
    """

    def __post_init__(self):
        """Validate the dataclass fields and mirror every non-None field into the dict storage."""
        class_fields = fields(self)

        # Safety and consistency checks
        if not len(class_fields):
            raise ValueError(f"{self.__class__.__name__} has no fields.")
        if not all(field.default is None for field in class_fields[1:]):
            raise ValueError(f"{self.__class__.__name__} should not have more than one required field.")

        first_field = getattr(self, class_fields[0].name)
        other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:])

        # Only the first field was given and it is not a tensor/array: it may be a dict or an
        # iterable of (name, value) pairs that should be unpacked into the individual fields.
        if other_fields_are_none and not is_tensor(first_field):
            if isinstance(first_field, dict):
                iterator = first_field.items()
                first_field_iterator = True
            else:
                try:
                    iterator = iter(first_field)
                    first_field_iterator = True
                except TypeError:
                    first_field_iterator = False

            # if we provided an iterator as first field and the iterator is a (key, value) iterator
            # set the associated fields
            if first_field_iterator:
                for element in iterator:
                    # Stop at the first element that is not a (str, value) pair.
                    if (
                        not isinstance(element, (list, tuple))
                        or not len(element) == 2
                        or not isinstance(element[0], str)
                    ):
                        break
                    setattr(self, element[0], element[1])
                    if element[1] is not None:
                        self[element[0]] = element[1]
            elif first_field is not None:
                self[class_fields[0].name] = first_field
        else:
            # Regular construction: store every field that has a value.
            for field in class_fields:
                v = getattr(self, field.name)
                if v is not None:
                    self[field.name] = v

    def __delitem__(self, *args, **kwargs):
        """Disabled: always raises."""
        raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")

    def setdefault(self, *args, **kwargs):
        """Disabled: always raises."""
        raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")

    def pop(self, *args, **kwargs):
        """Disabled: always raises."""
        raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")

    def update(self, *args, **kwargs):
        """Disabled: always raises."""
        raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")

    def __getitem__(self, k):
        """Look up by key when `k` is a string, otherwise index into `to_tuple()`."""
        if isinstance(k, str):
            inner_dict = {k: v for (k, v) in self.items()}
            return inner_dict[k]
        else:
            return self.to_tuple()[k]

    def __setattr__(self, name, value):
        """Set the attribute and, if `name` is already a stored key and `value` is not None, the dict entry too."""
        if name in self.keys() and value is not None:
            # Don't call self.__setitem__ to avoid recursion errors
            super().__setitem__(name, value)
        super().__setattr__(name, value)

    def __setitem__(self, key, value):
        """Set the dict entry and the attribute of the same name."""
        # Will raise a KeyException if needed
        super().__setitem__(key, value)
        # Don't call self.__setattr__ to avoid recursion errors
        super().__setattr__(key, value)

    def to_tuple(self) -> Tuple[Any]:
        """
        Convert self to a tuple containing all the attributes/keys that are not `None`.
        """
        return tuple(self[k] for k in self.keys())

@dataclass
class LongformerTokenClassifierOutput(ModelOutput):
    """
    Base class for outputs of token classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed
            or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or
            when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            x + attention_window + 1)`, where `x` is the number of tokens with global attention mask.

            Local attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads. Those are the attention weights from every token in the sequence to every token
            with global attention (first `x` values) and to every token in the attention window (remaining
            `attention_window + 1` values). Note that the first `x` values refer to tokens with fixed positions in
            the text, but the remaining `attention_window + 1` values refer to tokens with relative positions: the
            attention weight of a token to itself is located at index `x + attention_window / 2` and the
            `attention_window / 2` preceding (succeeding) values are the attention weights to the
            `attention_window / 2` preceding (succeeding) tokens. If the attention window contains a token with
            global attention, the attention weight at the corresponding index is set to 0; the value should be
            accessed from the first `x` attention weights. If a token has global attention, the attention weights
            to all other tokens in `attentions` are set to 0; the values should be accessed from
            `global_attentions`.
        global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed
            or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            x)`, where `x` is the number of tokens with global attention mask.

            Global attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads. Those are the attention weights from every token with global attention to every
            token in the sequence.
    """

    # Every field defaults to None; fields left as None are not stored in the mapping by ModelOutput.
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    global_attentions: Optional[Tuple[torch.FloatTensor]] = None

class CustomLongformerForTokenClassification(LongformerPreTrainedModel):
    """Longformer encoder followed by a 2-layer bidirectional LSTM, LayerNorm and a linear token classifier.

    The forward pass never computes a loss: `labels` is accepted for signature
    compatibility only and the returned `loss` is always `None`.
    """

    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
        hidden = config.hidden_size
        self.num_labels = config.num_labels

        # Attribute names and creation order are kept exactly as in the saved checkpoints.
        self.longformer = LongformerModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.lstm = nn.LSTM(
            input_size=hidden,
            hidden_size=hidden,
            num_layers=2,
            dropout=0.1,
            batch_first=True,
            bidirectional=True,
        )
        # The bidirectional LSTM doubles the feature size.
        self.LayerNorm_LSTM = nn.LayerNorm(2 * hidden, eps=config.layer_norm_eps)
        self.dropout_lstm = nn.Dropout(config.hidden_dropout_prob)
        # dropout1 .. dropout5 with p = 0.1 .. 0.5; none of the dropout layers is applied in `forward`.
        for step in range(1, 6):
            setattr(self, f"dropout{step}", nn.Dropout(step / 10))
        self.classifier = nn.Linear(2 * hidden, config.num_labels)

        # `self.post_init()` is intentionally not called here (it was commented out in the original).

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        global_attention_mask=None,
        head_mask=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Run encoder -> BiLSTM -> LayerNorm -> classifier and return per-token logits.

        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Accepted but unused; no loss is computed.

        Returns a `LongformerTokenClassifierOutput` when `return_dict` resolves to true, otherwise the tuple
        `(logits,) + encoder_outputs[2:]`.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_outputs = self.longformer(
            input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
            head_mask=head_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # encoder_outputs[0] is the last hidden state of the encoder.
        lstm_output, _ = self.lstm(encoder_outputs[0])
        logits = self.classifier(self.LayerNorm_LSTM(lstm_output))

        if return_dict:
            return LongformerTokenClassifierOutput(
                loss=None,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
                global_attentions=encoder_outputs.global_attentions,
            )

        return (logits,) + encoder_outputs[2:]

In [None]:
from transformers import AutoModelForTokenClassification, AutoConfig, Trainer, DataCollatorForTokenClassification, TrainingArguments
import numpy as np
import torch
# Average the token logits of all fold checkpoints.
N_FOLDS = 5
data_collator = CustomDatacollator(tokenizer)
training_args = TrainingArguments(per_device_eval_batch_size=4, output_dir = '../input')
os.environ["TOKENIZERS_PARALLELISM"] = "false"
all_predictions = None  # running sum of (fold logits / N_FOLDS), shape of `predictions`
for fold in range(N_FOLDS):
    model_path = '../input/longformerlstm/best_model/longformer-large-LSTM_fold'+str(fold)+'/'
    config = AutoConfig.from_pretrained(model_path)
    model = CustomLongformerForTokenClassification.from_pretrained(model_path, config = config)
    trainer = Trainer(
        model = model,
        args = training_args,
        train_dataset=None,
        eval_dataset=tokenized_test_datasets,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    predictions, _, _ = trainer.predict(test_dataset = tokenized_test_datasets)
    print("shape of prediciton", predictions.shape)
    # Each fold contributes 1/N_FOLDS of its logits; the sum over folds is the ensemble mean.
    fold_share = predictions.astype(np.float32) / N_FOLDS
    all_predictions = fold_share if all_predictions is None else all_predictions + fold_share
    # Drop every reference to this fold's model before the next one is built; otherwise two
    # checkpoints are alive at once. Collect garbage first so the CUDA cache can really be released.
    del trainer, model, predictions, fold_share
    gc.collect()
    torch.cuda.empty_cache()
all_predictions = np.array(all_predictions)

In [None]:
from collections import defaultdict

def label_dict():
    """Build the tag <-> index mappings from the discourse types found in train.csv.

    For the classes in order of first appearance, 'B-<class>' gets index i and
    'I-<class>' gets index i + number_of_classes; 'O' gets 2 * number_of_classes
    and 'Special' gets -100.

    Returns:
        (i2l, l2i, N_LABELS): index -> tag dict, tag -> index dict, and the number
        of tags excluding the -100 'Special' entry.
    """
    classes = pd.read_csv('../input/feedback-prize-2021/train.csv').discourse_type.unique().tolist()
    n_classes = len(classes)

    l2i = {}
    for position, name in enumerate(classes):
        l2i[f'B-{name}'] = position
        l2i[f'I-{name}'] = position + n_classes
    l2i['O'] = 2 * n_classes
    l2i['Special'] = -100

    i2l = {index: tag for tag, index in l2i.items()}
    # -100 is not a real label, so it is left out of the count.
    return i2l, l2i, len(i2l) - 1

In [None]:
def link_evidence(oof):
    """Merge neighbouring 'Evidence' spans of the same essay into longer spans.

    Args:
        oof: DataFrame with columns 'id', 'class' and 'predictionstring'
            (space-separated word indices).

    Returns:
        DataFrame with the same columns: the re-segmented Evidence rows
        outer-merged with all non-Evidence rows. An empty `oof` is returned unchanged.

    Two consecutive Evidence spans are kept separate when the next span starts more
    than `thresh` indices after the previous one ends, or when its first word lies more
    than `thresh2` indices after the start of the span being built; otherwise they are joined.
    """
    if not len(oof):
        return oof

    def jn(pst, start, end):
        # The -1 entries are only separators between source spans; they must never
        # end up in the emitted predictionstring.
        return " ".join(str(x) for x in pst[start:end] if x != -1)

    thresh = 1
    thresh2 = 26
    eoof = oof[oof['class'] == "Evidence"]
    neoof = oof[oof['class'] != "Evidence"]

    retval = []
    for idv in tqdm(oof['id'].unique(), desc='link_evidence', leave=False):
        q = eoof[eoof['id'] == idv]
        if len(q) == 0:
            continue
        # Flatten all Evidence spans of this essay into one list, each span preceded by -1.
        pst = []
        for r in q.itertuples():
            pst.append(-1)
            pst.extend(int(x) for x in r.predictionstring.split())
        start = 1  # index in `pst` of the first word of the span being built
        end = 1
        for i in range(2, len(pst)):
            end = i
            if pst[i] != -1:
                continue
            gap_too_large = pst[i + 1] > pst[i - 1] + thresh
            span_too_long = pst[i + 1] - pst[start] > thresh2
            if gap_too_large or span_too_long:
                retval.append((idv, 'Evidence', jn(pst, start, end)))
                start = i + 1
        # Flush the last (possibly merged) span.
        retval.append((idv, 'Evidence', jn(pst, start, end + 1)))

    roof = pd.DataFrame(retval, columns=['id', 'class', 'predictionstring'])
    return roof.merge(neoof, how='outer')

In [None]:
def find_max_label(word_prediction_score):
    """Return the label index whose score, summed over all tokens of one word, is largest.

    Args:
        word_prediction_score: sequence of per-token score vectors belonging to one word.
    """
    summed_scores = np.sum(word_prediction_score, axis=0)
    return np.argmax(summed_scores, axis=-1)

In [None]:
def _append_word_prediction(token_scores, i2l, labels_out, scores_out):
    """Reduce the token scores of one word to a single label and append it to the outputs.

    Does nothing when `token_scores` is empty, so a document without word tokens
    yields no prediction instead of a spurious label with an empty score list.
    """
    if not token_scores:
        return
    # Label with the highest score summed over all tokens of the word.
    max_label = find_max_label(token_scores)
    scores_out.append([scores[max_label] for scores in token_scores])
    labels_out.append(i2l[max_label])


def token_to_word(pred_score, word_id_list, i2l):
    """Convert token-level scores into word-level labels.

    Args:
        pred_score: per document, a sequence of per-token score vectors (indexed by label).
        word_id_list: per document, the token -> word index list; `None` marks tokens
            that belong to no word and are skipped.
        i2l: mapping from label index to label name.

    Returns:
        (all_prediction, all_pred_score): per document, the list of word labels and,
        aligned with it, the list of per-token scores of the chosen label for each word.
    """
    all_prediction = []
    all_pred_score = []
    for k, label_pred_score in tqdm(enumerate(pred_score), desc = "post-processing"):
        each_prediction = []
        each_prediction_score = []
        previous_word_idx = -1
        word_prediction_score = []
        for idx, word_idx in enumerate(word_id_list[k]):
            if word_idx is None:
                continue
            if word_idx != previous_word_idx:
                # A new word starts: emit the word collected so far.
                _append_word_prediction(word_prediction_score, i2l, each_prediction, each_prediction_score)
                previous_word_idx = word_idx
                word_prediction_score = []
            word_prediction_score.append(label_pred_score[idx])
        # Emit the last word of the document (no-op when there was none).
        _append_word_prediction(word_prediction_score, i2l, each_prediction, each_prediction_score)
        all_prediction.append(each_prediction)
        all_pred_score.append(each_prediction_score)
    return all_prediction, all_pred_score

In [None]:
def postprocess_fb_predictions2(
    eval_datasets,
    predictions,
):
    """Turn token logits into a submission frame of discourse spans.

    Args:
        eval_datasets: dict with 'id' (essay ids) and 'word_ids' (token -> word index
            lists), aligned with the rows of `predictions`.
        predictions: array of token logits, one row per essay.

    Returns:
        DataFrame with columns 'id', 'class', 'predictionstring' after Evidence linking.
        The frame is empty (but has these columns) when no span passes the thresholds.

    A span is a maximal run of words that starts with a 'B-<class>' or 'I-<class>' label and
    continues with 'I-<class>' labels. It is kept when the mean score of its tokens reaches
    `proba_thresh[class]` and it has at least `min_thresh[class]` words.
    """
    proba_thresh = {
        "Lead": 0.687,
        "Position": 0.537,
        "Evidence": 0.637,
        "Claim": 0.537,
        "Concluding Statement": 0.687,
        "Counterclaim": 0.537,
        "Rebuttal": 0.537,
    }
    #discourse length threshold
    min_thresh = {
        "Lead": 9,
        "Position": 5,
        "Evidence": 14,
        "Claim": 3,
        "Concluding Statement": 11,
        "Counterclaim": 6,
        "Rebuttal": 4,
    }
    print(predictions.shape)
    softmax = torch.nn.Softmax(dim=-1)
    pred_score = softmax(torch.tensor(predictions)).numpy()
    i2l, _, _ = label_dict()
    word_id_list = list(eval_datasets['word_ids'])
    all_prediction, all_pred_score = token_to_word(pred_score, word_id_list, i2l)

    final_pred = []
    for doc_id, pred, word_scores in zip(eval_datasets['id'], all_prediction, all_pred_score):
        j = 0
        while j < len(pred):
            label = pred[j]
            if label == 'O':
                # Move on and re-examine the next word; previously the word right after an
                # isolated 'O' was skipped, so a span starting there lost its first word.
                j += 1
                continue
            # A span may open with 'B-x'; every following word of the span is labelled 'I-x'.
            cls = 'I-' + label[2:] if label.startswith('B-') else label
            discourse = cls[2:] if cls.startswith('I-') else cls
            end = j + 1
            while end < len(pred) and pred[end] == cls:
                end += 1
            # Flatten the per-word token scores of the span.
            span_scores = [score for word in word_scores[j:end] for score in word]
            if (
                cls != ''
                and span_scores
                and sum(span_scores) / len(span_scores) >= proba_thresh[discourse]
                and end - j >= min_thresh[discourse]
            ):
                final_pred.append((doc_id, discourse, ' '.join(map(str, range(j, end)))))
            j = end

    # Passing `columns` to the constructor also works when no span was kept.
    oof = pd.DataFrame(final_pred, columns=['id', 'class', 'predictionstring'])

    oof = link_evidence(oof)
    return oof

In [None]:
# Convert the fold-averaged logits into discourse spans (one row per predicted span).
oof = postprocess_fb_predictions2(eval_datasets=eval_datasets, predictions=all_predictions)

In [None]:
# Write the predicted spans to `submission.csv`, omitting the DataFrame index.
oof.to_csv("submission.csv", index=False)