The following cell is only needed if you want to use the fast tokenizer for DeBERTa-v2.

In [None]:
import shutil
from pathlib import Path

# Where the patched tokenizer sources come from and where the installed
# transformers package lives.
transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")
input_dir = Path("../input/debertav2xlfasttokenizer")
deberta_v2_path = transformers_path / "models" / "deberta_v2"

# Swap in the slow->fast conversion module: remove the installed copy (if
# any), then copy the replacement into the package directory.
convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name
if conversion_path.exists():
    conversion_path.unlink()
shutil.copy(convert_file, transformers_path)

# Same treatment for the two DeBERTa-v2 tokenizer modules.
for filename in ("tokenization_deberta_v2.py", "tokenization_deberta_v2_fast.py"):
    filepath = deberta_v2_path / filename
    if filepath.exists():
        filepath.unlink()
    shutil.copy(input_dir / filename, filepath)

In [None]:
def get_DebertaV2TokenizerFast(model_name):
    """Load a ``DebertaV2TokenizerFast`` for ``model_name``.

    The tokenizer module is imported inside the function, so the import only
    happens when the function is called.
    """
    from transformers.models.deberta_v2 import tokenization_deberta_v2_fast as fast_module

    tokenizer_cls = fast_module.DebertaV2TokenizerFast
    return tokenizer_cls.from_pretrained(model_name)



In [None]:
import os
import json
import math
from pathlib import Path
import gc
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm import tqdm
import copy
import torch.cuda.amp as AMP
from transformers import (
    AutoModel,
    AutoTokenizer,
    AutoConfig,
)

In [None]:
# Discourse classes in label-id order: class k owns id 2*k for its "B-" tag
# and id 2*k + 1 for its "I-" tag.
_DISCOURSE_TYPES = (
    "Lead",
    "Position",
    "Evidence",
    "Claim",
    "Concluding Statement",
    "Counterclaim",
    "Rebuttal",
)

# Tag name -> label id.
target_id_map = {
    prefix + discourse_type: 2 * rank + offset
    for rank, discourse_type in enumerate(_DISCOURSE_TYPES)
    for offset, prefix in enumerate(("B-", "I-"))
}
# Outside-of-span tag and the padding id.
target_id_map.update({"O": 14, "PAD": -100})

# Label id -> tag name (inverse of target_id_map).
id_target_map = dict(zip(target_id_map.values(), target_id_map.keys()))

In [None]:
class FeedbackDatasetValid:
    """Inference-time dataset over already tokenized samples.

    Indexing returns the sample's ``input_ids`` with the tokenizer's CLS id
    in front, cut to ``max_len - 1`` ids, followed by the SEP id, plus an
    attention mask of ones of the same length.
    """

    def __init__(self, samples, max_len, tokenizer):
        self.samples = samples
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.length = len(samples)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        raw_ids = self.samples[idx]["input_ids"]
        tok = self.tokenizer

        # Slicing to max_len - 1 (CLS included) leaves one slot for SEP; the
        # slice is a no-op for sequences that are already short enough.
        framed = ([tok.cls_token_id] + raw_ids)[: self.max_len - 1]
        framed = framed + [tok.sep_token_id]

        return {"ids": framed, "mask": [1] * len(framed)}


class Collate:
    """Batch collator that pads ``ids`` / ``mask`` lists into long tensors.

    Args:
        tokenizer: object providing ``padding_side`` and ``pad_token_id``.
        fix_length: upper bound on the padded length, or -1 for no bound.
            Sequences longer than the bound are truncated to it.
    """

    def __init__(self, tokenizer, fix_length=-1):
        self.tokenizer = tokenizer
        self.fix_length = fix_length

    def __call__(self, batch):
        """Return ``{"ids": LongTensor, "mask": LongTensor}`` for ``batch``.

        ``batch`` is a list of dicts with ``"ids"`` and ``"mask"`` lists.
        Every row is padded to the longest sequence in the batch, capped at
        ``fix_length`` when that is not -1.
        """
        ids = [sample["ids"] for sample in batch]
        mask = [sample["mask"] for sample in batch]

        # Target length: longest sequence in this batch, optionally capped.
        batch_max = max(len(s) for s in ids)
        if self.fix_length != -1:
            batch_max = min(batch_max, self.fix_length)
            # Without truncation, rows longer than the cap keep their full
            # length (a negative pad count adds nothing), the rows become
            # ragged and torch.tensor() raises.
            ids = [s[:batch_max] for s in ids]
            mask = [s[:batch_max] for s in mask]

        pad_id = self.tokenizer.pad_token_id
        if self.tokenizer.padding_side == "right":
            ids = [s + (batch_max - len(s)) * [pad_id] for s in ids]
            mask = [s + (batch_max - len(s)) * [0] for s in mask]
        else:
            ids = [(batch_max - len(s)) * [pad_id] + s for s in ids]
            mask = [(batch_max - len(s)) * [0] + s for s in mask]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
        }


In [None]:
class TrainerConfig:
    """Inference-time settings read by ``Predicter``."""

    def __init__(self):
        # Checkpoint path; ``Predicter.model_init`` loads it when it is set.
        self.model_load = None
        self.model_name = None
        # Batch size of the DataLoader built in ``Predicter.predict``.
        self.valid_batch_size = 4
        # Length every prediction batch is padded to in ``Predicter.predict``.
        self.fix_length = 2048


class ModelConfig:
    """Hyper-parameters handed to ``FeedBackModel``."""

    def __init__(self):
        # Directory / name passed to ``AutoConfig.from_pretrained``.
        self.pretrain_path = None
        # Written into the backbone config.
        self.hidden_dropout_prob = 0.1
        self.layer_norm_eps = 1e-7
        self.num_labels = 15
        # CUDA when available, CPU otherwise.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Dropout applied in front of the classification head.
        self.dropout = 0.1

In [None]:
class Predicter(object):
    """Inference wrapper: builds a ``FeedBackModel``, loads weights, predicts."""

    def __init__(self, args):
        # ``args`` is accepted but not read.
        self.trainer_config = TrainerConfig()
        self.model_config = ModelConfig()
        cuda_ok = torch.cuda.is_available()
        self.device = torch.device('cuda') if cuda_ok else torch.device('cpu')

    def set_pretrain(self, pretrain_path):
        """Record the backbone path on the model config."""
        self.model_config.pretrain_path = pretrain_path

    def build_model(self):
        """Instantiate the model from the current model config."""
        self.model = FeedBackModel(self.model_config)

    def model_init(self):
        """Build the model, load ``trainer_config.model_load`` if set, switch to eval."""
        self.build_model()
        checkpoint = self.trainer_config.model_load
        if checkpoint:
            state_dict = torch.load(checkpoint, map_location=torch.device('cpu'))
            self.model.load_state_dict(state_dict)
        self.model.to(self.device)
        self.model.eval()

    def get_logits(self, batch, return_loss=False):
        """Run the model on one collated batch.

        Returns the model's first output, or ``(output, loss)`` when
        ``return_loss`` is true (``batch["targets"]`` is then required).
        """
        input_ids = batch["ids"].to(self.device)
        attention_mask = batch["mask"].to(self.device)
        if not return_loss:
            logits, _ = self.model(input_ids, attention_mask)
            return logits
        labels = batch["targets"].to(self.device)
        logits, loss = self.model(input_ids, attention_mask, labels=labels)
        return logits, loss

    def model_load(self, path):
        """Load a state dict from ``path`` into the existing model and switch to eval."""
        state_dict = torch.load(path, map_location=torch.device('cpu'))
        self.model.load_state_dict(state_dict)
        self.model.to(self.device)
        self.model.eval()

    @torch.no_grad()
    def predict(self, valid_datasets, collate):
        """Predict every batch of ``valid_datasets``.

        Each batch output is padded along dim 1 up to
        ``trainer_config.fix_length`` with a constant row, multiplied by 255
        and stored as a uint8 numpy array.  Returns the list of those arrays.
        """
        self.model.eval()
        loader = torch.utils.data.DataLoader(
            valid_datasets,
            batch_size=self.trainer_config.valid_batch_size,
            collate_fn=collate,
        )
        target_len = self.trainer_config.fix_length
        # Row used for positions past the end of a batch: 14 zeros, then 0.1.
        PAD = torch.tensor([0.0] * 14 + [0.1], dtype=torch.float).unsqueeze(0)
        outputs = []
        for batch in tqdm(loader):
            with AMP.autocast(enabled=True):
                probs = self.get_logits(batch).cpu()
                n_rows, n_tokens, _ = probs.shape
                pad_step = torch.cat([PAD] * n_rows, dim=0).unsqueeze(1)
                # One pad step per missing position (none if already long enough).
                pieces = [probs]
                pieces.extend([pad_step] * (target_len - n_tokens))
                probs = torch.cat(pieces, dim=1)
                outputs.append((probs * 255).byte().data.cpu().numpy())
        del loader
        gc.collect()
        return outputs

In [None]:
class FeedBackModel(nn.Module):
    """Transformer backbone with a per-token linear classification head.

    The backbone is created with ``AutoModel.from_config``, i.e. from the
    configuration only; no pretrained weights are loaded here.

    Args:
        args: config object providing ``pretrain_path``,
            ``hidden_dropout_prob``, ``layer_norm_eps``, ``num_labels`` and
            ``dropout``.
    """

    def __init__(self, args):
        super().__init__()
        config = AutoConfig.from_pretrained(args.pretrain_path)
        config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": args.hidden_dropout_prob,
                "layer_norm_eps": args.layer_norm_eps,
                "add_pooling_layer": False,
                "num_labels": args.num_labels,
            }
        )
        self.num_labels = args.num_labels
        self.transformer = AutoModel.from_config(config)
        self.dropout = nn.Dropout(args.dropout)
        self.output = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        """Return ``(probabilities, loss)`` for a batch.

        ``probabilities`` is the softmax over the last dimension of the head
        output.  ``loss`` is always 0; ``labels`` is accepted but not used.
        """
        # Compare against None: the truth value of a tensor with more than
        # one element is ambiguous and raises.
        if token_type_ids is not None:
            transformer_out = self.transformer(input_ids, attention_mask, token_type_ids)
        else:
            transformer_out = self.transformer(input_ids, attention_mask)
        sequence_output = self.dropout(transformer_out.last_hidden_state)
        logits = self.output(sequence_output)
        logits_out = torch.softmax(logits, dim=-1)
        loss = 0
        return logits_out, loss

In [None]:
from joblib import Parallel, delayed


def _prepare_test_data_helper(tokenizer, ids):
    """Read and tokenize the test essays named by ``ids``.

    For every id the file ``../input/feedback-prize-2021/test/<id>.txt`` is
    read and encoded without special tokens.  Returns one dict per id with
    the keys ``id``, ``input_ids``, ``text`` and ``offset_mapping``.
    """
    collected = []
    for essay_id in ids:
        essay_path = os.path.join("../input/feedback-prize-2021", "test", essay_id + ".txt")
        with open(essay_path, "r") as handle:
            essay_text = handle.read()

        encoding = tokenizer.encode_plus(
            essay_text,
            add_special_tokens=False,
            return_offsets_mapping=True,
        )

        collected.append(
            {
                "id": essay_id,
                "input_ids": encoding["input_ids"],
                "text": essay_text,
                "offset_mapping": encoding["offset_mapping"],
            }
        )
    return collected


def prepare_test_data(df, tokenizer):
    """Tokenize all test essays listed in ``df["id"]`` using 4 worker processes.

    The unique ids are split into 4 chunks, each chunk is handled by
    ``_prepare_test_data_helper`` in its own job, and the per-chunk sample
    lists are concatenated in chunk order.
    """
    id_chunks = np.array_split(df["id"].unique(), 4)

    pool = Parallel(n_jobs=4, backend="multiprocessing")
    per_chunk = pool(
        delayed(_prepare_test_data_helper)(tokenizer, chunk) for chunk in id_chunks
    )

    return [sample for chunk_samples in per_chunk for sample in chunk_samples]


def prepare_test_data_v2(df, tokenizer):
    """Single-process variant: tokenize every unique essay id in ``df["id"]``.

    Reads ``../input/feedback-prize-2021/test/<id>.txt`` for each id and
    returns a list of dicts with the keys ``id``, ``input_ids``, ``text``
    and ``offset_mapping``.
    """
    collected = []
    for essay_id in df["id"].unique():
        essay_path = os.path.join("../input/feedback-prize-2021", "test", essay_id + ".txt")
        with open(essay_path, "r") as handle:
            essay_text = handle.read()

        encoding = tokenizer.encode_plus(
            essay_text,
            add_special_tokens=False,
            return_offsets_mapping=True,
        )

        collected.append(
            {
                "id": essay_id,
                "input_ids": encoding["input_ids"],
                "text": essay_text,
                "offset_mapping": encoding["offset_mapping"],
            }
        )
    return collected

In [None]:
def text_to_word(text):
    """Split ``text`` on whitespace and locate every word in the original string.

    Returns:
        ``(word, word_offset)`` where ``word`` is ``text.split()`` and
        ``word_offset[i]`` is the ``(start, end)`` character span of
        ``word[i]``, so that ``text[start:end] == word[i]``.

    Raises:
        NotImplementedError: if a word cannot be found after the previous
            one.
    """
    word = text.split()
    word_offset = []

    cursor = 0
    for w in word:
        # Search from the cursor in place; slicing text[cursor:] first would
        # copy the remaining text once per word.
        start = text.find(w, cursor)
        if start == -1:
            raise NotImplementedError
        cursor = start + len(w)
        word_offset.append((start, cursor))

    return word, word_offset


def word_probability_to_predict_df(text_to_word_probability, id):
    """Decode per-word class probabilities into a frame of discourse spans.

    The label of each word is the argmax over the last axis and its score
    the max.  A span opens at any word whose label is neither "O" nor "PAD"
    and runs while the following words carry the matching I- label.

    Args:
        text_to_word_probability: per-word class scores; presumably a numpy
            array of shape (n_words, n_classes) -- TODO confirm with caller.
        id: value written to the ``id`` column of every row.

    Returns:
        pandas.DataFrame with columns ``id``, ``class``, ``predictionstring``
        (word indices joined by spaces) and ``score`` (``str()`` of the
        per-word score list).  Inputs with fewer than two rows produce an
        empty frame.
    """
    columns = ['id', 'class', 'predictionstring', 'score']
    len_word = len(text_to_word_probability)
    # The scan below indexes word_predict[t] before any bounds check and only
    # stops when t reaches len_word - 1, so with fewer than two words it
    # would run past the end.  Two-word inputs never emit a span either, so
    # an empty result is the consistent answer.
    if len_word < 2:
        return pd.DataFrame([], columns=columns)

    word_predict = text_to_word_probability.argmax(-1)
    word_score   = text_to_word_probability.max(-1)
    # NOTE: a disabled experiment forced labels 10-13 (Counterclaim /
    # Rebuttal) whenever their probability was >= 0.40.
    predict_df = []

    t = 0
    while 1:
        if word_predict[t] not in [
            target_id_map['O'],
            target_id_map['PAD'],
        ]:
            # Any B-/I- label opens a span here.
            start = t
            b_marker_label = word_predict[t]
        else:
            t = t+1
            if t== len_word-1: break
            continue

        t = t+1
        # Reaching the last index here ends the scan without emitting the
        # span that was just opened.
        if t== len_word-1: break

        # Label that continues the span: the I- label of the opening class.
        if id_target_map[b_marker_label][0]=='B':
            i_marker_label = b_marker_label+1
        elif id_target_map[b_marker_label][0]=='I':
            i_marker_label = b_marker_label
        else:
            raise NotImplementedError

        while 1:
            if (word_predict[t] != i_marker_label) or (t ==len_word-1):
                # ``end`` is exclusive, so the word at the last index is
                # never part of a span.
                end = t
                prediction_string = ' '.join([str(i) for i in range(start,end)])
                discourse_type = id_target_map[b_marker_label][2:]
                discourse_score = word_score[start:end].tolist()
                predict_df.append((id, discourse_type, prediction_string, str(discourse_score)))
                break
            else:
                t = t+1
                continue
        if t== len_word-1: break

    predict_df = pd.DataFrame(predict_df, columns=columns)
    return predict_df

def word_probability_to_prediction_string(text_to_word_probability, text_id, word):
    """Decode per-word class probabilities into discourse spans for one text.

    The label id and score of each word are the argmax / max over the last
    axis of ``text_to_word_probability``.  A span opens at any word whose
    label id is 0-13 (every B-/I- label in ``id_target_map``) and runs while
    the following words carry an accepted label: the I- label of the opening
    class, plus its B- label for Lead, Position, Concluding Statement and
    Rebuttal.  Afterwards multiple Lead spans and multiple Concluding
    Statement spans are each merged into a single span.

    Args:
        text_to_word_probability: per-word class scores; presumably a numpy
            array of shape (n_words, n_classes) -- TODO confirm with caller.
        text_id: value written to the ``id`` column of every row.
        word: sequence of the text's words (strings), indexed like the rows
            of ``text_to_word_probability``.

    Returns:
        pandas.DataFrame with columns ``id``, ``class``, ``predict_text``
        (words joined by spaces), ``predictionstring`` (word indices joined
        by spaces) and ``score`` (``str()`` of the per-word score list).
    """
    # Per-class length table; it is not read anywhere in this function.
    length_threshold = {
        "Lead": 9,
        "Position": 5,
        "Evidence": 14,
        "Claim": 3,
        "Concluding Statement": 11,
        "Counterclaim": 6,
        "Rebuttal": 4,
    }
    # Best label id and its probability for every word.
    word_predict = text_to_word_probability.argmax(-1)
    word_score = text_to_word_probability.max(-1)
    predict_df = []

    t = 0
    while 1:
        # Whole text consumed.
        if t == len(word):
            break

        if word_predict[t] in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]:
            # Any B-/I- label opens a span at this word.
            start = t
            b_marker_label = word_predict[t]
        else:
            # Any other label (14 is "O"): skip the word.  The scan stops as
            # soon as t lands on the last word index, so that word is not
            # examined on this path.
            t = t + 1
            if t == len(word) - 1:
                break
            continue

        t = t + 1
        # NOTE(review): breaking here drops the span that was just opened at
        # ``start`` (nothing has been appended yet) -- confirm intended.
        if t == len(word) - 1:
            break

        # Labels that continue the span: even ids are B- labels and the next
        # odd id is the matching I- label.
        i_marker_label = [b_marker_label +
                          1] if b_marker_label % 2 == 0 else [b_marker_label]
        marker_text = id_target_map[i_marker_label[0]]

        # "modified1": for these classes a new B- label of the same class
        # also continues the span instead of starting another one.
        consecutive_list = ['Lead', 'Position', 'Concluding', 'Rebuttal']
        if any([x in marker_text for x in consecutive_list]):
            i_marker_label.append(i_marker_label[0] - 1)

        total_others_count = 0
        cur_others_count = 0
        # With tolerance == 0 the first branch of the loop below can never
        # run, so cur_others_count stays 0.
        tolerance = 0
        while 1:
            if t < len(word) and word_predict[t] not in i_marker_label and total_others_count < tolerance:
                # Tolerated foreign label inside a span.
                total_others_count += 1
                cur_others_count += 1
                t += 1
            elif t == len(word) or (word_predict[t] not in i_marker_label):
                # Span ends: rewind over trailing tolerated words; ``end`` is
                # exclusive.
                t -= cur_others_count
                end = t
                # (original author's note: "have bug here")
                # Spans of 8-19 words are extended by one extra word, unless
                # the span already reaches the end of the text.
                if 20 > end - start > 7:
                    prediction_string = [i for i in range(start, end + 1 if end != len(word) else end)]
                    prediction_text = [word[i] for i in range(start, end + 1 if end != len(word) else end)]
                else:
                    prediction_string = [i for i in range(start, end)]
                    prediction_text = [word[i] for i in range(start, end)]
                discourse_type = id_target_map[b_marker_label][2:]
                # Disabled experiment ("modified 5"): replace the last score
                # of a span by the span's mean score.

                # Scores cover [start, end) only, i.e. not the extra word
                # added by the extension above.
                discourse_score = word_score[start: end].tolist()
                # Disabled experiment: extend Concluding to the last word of
                # the text.

                predict_df.append(
                    (text_id, discourse_type, prediction_text, prediction_string, discourse_score))
                break
            else:
                # Accepted label: reset the run of tolerated words and go on.
                cur_others_count = 0
                t = t + 1
                continue
        if t == len(word) - 1:
            break

    # "modified 3": keep a single Lead.  When there are several Lead spans,
    # take the one with the highest mean score plus every other one whose
    # mean exceeds min_lead_score, and fuse them into one span running from
    # the smallest first index to the largest last index.
    filtered_predict_df = list(
        filter(lambda x: 'Lead' not in x[1], predict_df))
    lead_df = list(filter(lambda x: 'Lead' in x[1], predict_df))
    min_lead_score = 0.95
    if len(lead_df) > 1:
        lead_df = sorted(lead_df, key=lambda x: np.mean(x[4]), reverse=True)
        lead_df = [lead_df[0]] + \
                  list(filter(lambda x: np.mean(x[4]) > min_lead_score, lead_df[1:]))
        begin = min([x[3][0] for x in lead_df])
        end = max([x[3][-1] for x in lead_df])
        lead_df = [(lead_df[0][0], lead_df[0][1], [word[i] for i in range(
            begin, end + 1)], [i for i in range(begin, end + 1)], word_score[begin:end + 1].tolist())]
        predict_df = lead_df + filtered_predict_df

    # "modified4": keep a single Concluding Statement.  The two spans with
    # the highest mean score are fused into one span covering both.
    # min_con_score is not read (the filter that used it is disabled).
    filtered_predict_df = list(
        filter(lambda x: 'Concluding' not in x[1], predict_df))
    con_df = list(filter(lambda x: 'Concluding' in x[1], predict_df))
    min_con_score = 0.7
    if len(con_df) > 1:
        con_df = sorted(con_df, key=lambda x: np.mean(x[4]), reverse=True)
        con_df = con_df[:2]
        begin = min(con_df[0][3][0], con_df[1][3][0])
        end = max(con_df[0][3][-1], con_df[1][3][-1])
        # NOTE(review): the score slice is word_score[begin + 1:end], which
        # leaves out both end words of the fused span, unlike the Lead merge
        # above (word_score[begin:end + 1]) -- confirm intended.
        con_df = [(con_df[0][0], con_df[0][1], [word[i] for i in range(
            begin, end + 1)], [i for i in range(begin, end + 1)], word_score[begin + 1:end].tolist())]
        predict_df = filtered_predict_df + con_df

    # Disabled experiment ("modified6"): keep only the best Position spans
    # (longer than length_threshold['Position'], mean score above 0.9).

    # Flatten every span to strings for the output frame.
    for i in range(len(predict_df)):
        predict_df[i] = (predict_df[i][0], predict_df[i][1], ' '.join(
            predict_df[i][2]), ' '.join(str(x) for x in predict_df[i][3]), str(predict_df[i][4]))
    predict_df = pd.DataFrame(predict_df, columns=[
        'id', 'class', 'predict_text', 'predictionstring', 'score'])
    return predict_df


def word_probability_to_prediction_string_v4(text_to_word_probability, text_id, word):
    """Decode per-word class probabilities into discourse spans (variant v4).

    Same scan as ``word_probability_to_prediction_string``: a span opens at
    any word labelled 0-13 and runs while the following words carry the I-
    label of the opening class (or also its B- label for Lead, Position,
    Concluding Statement and Rebuttal).  In this variant spans of 6-19 words
    get one extra trailing word, and the tolerance for foreign labels is
    split into a per-run and a total limit (both 0 here).  Multiple Lead and
    multiple Concluding Statement spans are merged afterwards.

    Args:
        text_to_word_probability: per-word class scores; presumably a numpy
            array of shape (n_words, n_classes) -- TODO confirm with caller.
        text_id: value written to the ``id`` column of every row.
        word: sequence of the text's words (strings), indexed like the rows
            of ``text_to_word_probability``.

    Returns:
        pandas.DataFrame with columns ``id``, ``class``, ``predict_text``,
        ``predictionstring`` and ``score`` (all values are strings except
        ``id``, which is ``text_id`` as given).
    """
    # Per-class length table; it is not read anywhere in this function.
    length_threshold = {
        "Lead": 9,
        "Position": 5,
        "Evidence": 14,
        "Claim": 3,
        "Concluding Statement": 11,
        "Counterclaim": 6,
        "Rebuttal": 4,
    }
    # Best label id and its probability for every word.
    word_predict = text_to_word_probability.argmax(-1)
    word_score = text_to_word_probability.max(-1)
    predict_df = []

    t = 0
    while 1:
        # Whole text consumed.
        if t == len(word):
            break

        if word_predict[t] in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]:
            # Any B-/I- label opens a span at this word.
            start = t
            b_marker_label = word_predict[t]
        else:
            # Any other label (14 is "O"): skip the word; the scan stops
            # once t lands on the last word index.
            t = t + 1
            if t == len(word) - 1:
                break
            continue

        t = t + 1
        # NOTE(review): breaking here drops the span that was just opened at
        # ``start`` (nothing has been appended yet) -- confirm intended.
        if t == len(word) - 1:
            break

        # Labels that continue the span: even ids are B- labels and the next
        # odd id is the matching I- label.
        i_marker_label = [b_marker_label +
                          1] if b_marker_label % 2 == 0 else [b_marker_label]
        marker_text = id_target_map[i_marker_label[0]]

        total_others_count = 0
        cur_others_count = 0
        # Both limits are 0, so the first branch of the loop below can never
        # run and cur_others_count stays 0.
        tolerance_cur = 0
        tolerance_total = 0
        # "modified1": for these classes a new B- label of the same class
        # also continues the span.
        consecutive_list = ['Lead', 'Position', 'Concluding', 'Rebuttal']
        if any([x in marker_text for x in consecutive_list]):
            i_marker_label.append(i_marker_label[0] - 1)
            # Disabled experiment ("modified 7"): for Rebuttal use
            # tolerance_cur = 7 and tolerance_total = 15.

        while 1:
            if t < len(word) and word_predict[t] not in i_marker_label and total_others_count < tolerance_total and cur_others_count < tolerance_cur:
                # Tolerated foreign label inside a span.
                total_others_count += 1
                cur_others_count += 1
                t += 1
            elif t == len(word) or (word_predict[t] not in i_marker_label):
                # Span ends: rewind over trailing tolerated words; ``end`` is
                # exclusive.
                t -= cur_others_count
                end = t

                # "modified 6": spans of 6-19 words are extended by one extra
                # word, unless the span already reaches the end of the text.
                if 20 > end - start > 5:
                    prediction_string = [i for i in range(start, end + 1 if end != len(word) else end)]
                    prediction_text = [word[i] for i in range(start, end + 1 if end != len(word) else end)]
                else:
                    prediction_string = [i for i in range(start, end)]
                    prediction_text = [word[i] for i in range(start, end)]

                discourse_type = id_target_map[b_marker_label][2:]
                # Disabled experiment ("modified 5"): replace the last score
                # of a span by the span's mean score.

                # Scores cover [start, end) only, i.e. not the extra word
                # added by the extension above.
                discourse_score = word_score[start: end].tolist()
                # (original note: extend Concluding to the last word)

                predict_df.append(
                    (text_id, discourse_type, prediction_text, prediction_string, discourse_score))
                break
            else:
                # Accepted label: reset the run of tolerated words and go on.
                cur_others_count = 0
                t = t + 1
                continue
        if t == len(word) - 1:
            break

    # "modified 3": keep a single Lead.  When there are several Lead spans,
    # take the one with the highest mean score plus every other one whose
    # mean exceeds min_lead_score, and fuse them into one span running from
    # the smallest first index to the largest last index.
    filtered_predict_df = list(
        filter(lambda x: 'Lead' not in x[1], predict_df))
    lead_df = list(filter(lambda x: 'Lead' in x[1], predict_df))
    min_lead_score = 0.95
    if len(lead_df) > 1:
        lead_df = sorted(lead_df, key=lambda x: np.mean(x[4]), reverse=True)
        lead_df = [lead_df[0]] + \
                  list(filter(lambda x: np.mean(x[4]) > min_lead_score, lead_df[1:]))
        begin = min([x[3][0] for x in lead_df])
        end = max([x[3][-1] for x in lead_df])
        lead_df = [(lead_df[0][0], lead_df[0][1], [word[i] for i in range(
            begin, end + 1)], [i for i in range(begin, end + 1)], word_score[begin:end + 1].tolist())]
        predict_df = lead_df + filtered_predict_df

    # "modified4": keep a single Concluding Statement.  The two spans with
    # the highest mean score are fused into one span covering both.
    # min_con_score is not read (the filter that used it is disabled).
    filtered_predict_df = list(
        filter(lambda x: 'Concluding' not in x[1], predict_df))
    con_df = list(filter(lambda x: 'Concluding' in x[1], predict_df))
    min_con_score = 0.7
    if len(con_df) > 1:
        con_df = sorted(con_df, key=lambda x: np.mean(x[4]), reverse=True)
        con_df = con_df[:2]
        begin = min(con_df[0][3][0], con_df[1][3][0])
        end = max(con_df[0][3][-1], con_df[1][3][-1])
        # NOTE(review): the score slice is word_score[begin + 1:end], which
        # leaves out both end words of the fused span, unlike the Lead merge
        # above (word_score[begin:end + 1]) -- confirm intended.
        con_df = [(con_df[0][0], con_df[0][1], [word[i] for i in range(
            begin, end + 1)], [i for i in range(begin, end + 1)], word_score[begin + 1:end].tolist())]
        predict_df = filtered_predict_df + con_df

    # Disabled experiment ("modified 8"): keep only the best Position spans
    # (longer than length_threshold['Position'], mean score above 0.9).

    # Flatten every span to strings for the output frame.
    for i in range(len(predict_df)):
        predict_df[i] = (predict_df[i][0], predict_df[i][1], ' '.join(
            predict_df[i][2]), ' '.join(str(x) for x in predict_df[i][3]), str(predict_df[i][4]))
    predict_df = pd.DataFrame(predict_df, columns=[
        'id', 'class', 'predict_text', 'predictionstring', 'score'])
    return predict_df


def word_probability_to_prediction_string_v5(text_to_word_probability, text_id, word, convert_table):
    """Decode per-word class probabilities into discourse spans (variant v5).

    The span scan and the Lead / Concluding Statement merges are the same as
    in ``word_probability_to_prediction_string_v4``.  In addition, after the
    spans are sorted by their first word index, the scores of a span are
    rescaled by ``convert_table[previous_class][its_class]`` whenever the
    preceding span is confident enough (long enough and mean score above a
    per-class threshold).

    Args:
        text_to_word_probability: per-word class scores; presumably a numpy
            array of shape (n_words, n_classes) -- TODO confirm with caller.
        text_id: value written to the ``id`` column of every row.
        word: sequence of the text's words (strings), indexed like the rows
            of ``text_to_word_probability``.
        convert_table: nested mapping ``{class: {next_class: multiplier}}``.
            Classes missing from the outer mapping trigger no rescaling; a
            next class missing from an inner mapping raises ``KeyError``.

    Returns:
        pandas.DataFrame with columns ``id``, ``class``, ``predict_text``,
        ``predictionstring`` and ``score`` (all values are strings except
        ``id``, which is ``text_id`` as given).
    """

    # Best label id and its probability for every word.
    word_predict = text_to_word_probability.argmax(-1)
    word_score = text_to_word_probability.max(-1)
    predict_df = []

    t = 0
    while 1:
        # Whole text consumed.
        if t == len(word):
            break

        if word_predict[t] in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]:
            # Any B-/I- label opens a span at this word.
            start = t
            b_marker_label = word_predict[t]
        else:
            # Any other label (14 is "O"): skip the word; the scan stops
            # once t lands on the last word index.
            t = t + 1
            if t == len(word) - 1:
                break
            continue

        t = t + 1
        # NOTE(review): breaking here drops the span that was just opened at
        # ``start`` (nothing has been appended yet) -- confirm intended.
        if t == len(word) - 1:
            break

        # Labels that continue the span: even ids are B- labels and the next
        # odd id is the matching I- label.
        i_marker_label = [b_marker_label + 1] if b_marker_label % 2 == 0 else [b_marker_label]
        marker_text = id_target_map[i_marker_label[0]]

        total_others_count = 0
        cur_others_count = 0
        # Both limits are 0, so the first branch of the loop below can never
        # run and cur_others_count stays 0.
        tolerance_cur = 0
        tolerance_total = 0
        # "modified1": for these classes a new B- label of the same class
        # also continues the span.
        consecutive_list = ['Lead', 'Position', 'Concluding', 'Rebuttal']
        if any([x in marker_text for x in consecutive_list]):
            i_marker_label.append(i_marker_label[0] - 1)
            # Disabled experiment ("modified 7"): Rebuttal tolerance
            # (tolerance_cur = 6, tolerance_total = 15).  Recorded results:
            #   no tolerance: 'Rebuttal': ('0.5308', '0.6101', '0.4697')
            #   [7, 15]       'Rebuttal': ('0.5249', '0.6109', '0.4600')
            #   [4, 15]       'Rebuttal': ('0.5241', '0.6090', '0.4600')
            #   [6, 15]       (no result recorded)

        while 1:
            if t < len(word) and word_predict[t] not in i_marker_label and total_others_count < tolerance_total and cur_others_count < tolerance_cur:
                # Tolerated foreign label inside a span.
                total_others_count += 1
                cur_others_count += 1
                t += 1
            elif t == len(word) or (word_predict[t] not in i_marker_label):
                # Span ends: rewind over trailing tolerated words; ``end`` is
                # exclusive.
                t -= cur_others_count
                end = t

                # "modified 6": spans of 6-19 words are extended by one extra
                # word, unless the span already reaches the end of the text.
                if 20 > end - start > 5:
                    prediction_string = [i for i in range(start, end + 1 if end != len(word) else end)]
                    prediction_text = [word[i] for i in range(start, end + 1 if end != len(word) else end)]
                else:
                    prediction_string = [i for i in range(start, end)]
                    prediction_text = [word[i] for i in range(start, end)]

                discourse_type = id_target_map[b_marker_label][2:]

                # Scores cover [start, end) only, i.e. not the extra word
                # added by the extension above.
                discourse_score = word_score[start: end].tolist()
                # (original note: extend Concluding to the last word)

                predict_df.append((text_id, discourse_type, prediction_text, prediction_string, discourse_score))
                break
            else:
                # Accepted label: reset the run of tolerated words and go on.
                cur_others_count = 0
                t = t + 1
                continue
        if t == len(word) - 1:
            break

    # "modified 3": keep a single Lead.  When there are several Lead spans,
    # take the one with the highest mean score plus every other one whose
    # mean exceeds min_lead_score, and fuse them into one span running from
    # the smallest first index to the largest last index.
    filtered_predict_df = list(
        filter(lambda x: 'Lead' not in x[1], predict_df))
    lead_df = list(filter(lambda x: 'Lead' in x[1], predict_df))
    min_lead_score = 0.95
    if len(lead_df) > 1:
        lead_df = sorted(lead_df, key=lambda x: np.mean(x[4]), reverse=True)
        lead_df = [lead_df[0]] + \
                  list(filter(lambda x: np.mean(x[4]) > min_lead_score, lead_df[1:]))
        begin = min([x[3][0] for x in lead_df])
        end = max([x[3][-1] for x in lead_df])
        lead_df = [(lead_df[0][0], lead_df[0][1], [word[i] for i in range(
            begin, end + 1)], [i for i in range(begin, end + 1)], word_score[begin:end + 1].tolist())]
        predict_df = lead_df + filtered_predict_df

    # "modified4": keep a single Concluding Statement.  The two spans with
    # the highest mean score are fused into one span covering both.
    # min_con_score is not read (the filter that used it is disabled).
    filtered_predict_df = list(
        filter(lambda x: 'Concluding' not in x[1], predict_df))
    con_df = list(filter(lambda x: 'Concluding' in x[1], predict_df))
    min_con_score = 0.7
    if len(con_df) > 1:
        con_df = sorted(con_df, key=lambda x: np.mean(x[4]), reverse=True)
        con_df = con_df[:2]
        begin = min(con_df[0][3][0], con_df[1][3][0])
        end = max(con_df[0][3][-1], con_df[1][3][-1])
        # NOTE(review): the score slice is word_score[begin + 1:end], which
        # leaves out both end words of the fused span, unlike the Lead merge
        # above (word_score[begin:end + 1]) -- confirm intended.
        con_df = [(con_df[0][0], con_df[0][1], [word[i] for i in range(
            begin, end + 1)], [i for i in range(begin, end + 1)], word_score[begin + 1:end].tolist())]
        predict_df = filtered_predict_df + con_df

    # Example of the expected convert_table layout (from an earlier run):
    # convert_table = {
    #     'Position': {'Lead': 0.9, 'Position': 0.9, 'Evidence': 1., 'Claim': 1.1, 'Concluding Statement': 0.95, 'Counterclaim': 0.94, 'Rebuttal': 0.9,},
    # }
    # Earlier threshold sets, kept for reference:
    #     min_thresh = {"Lead": 9, "Position": 5, "Evidence": 14, "Claim": 3,
    #                   "Concluding Statement": 11, "Counterclaim": 6, "Rebuttal": 4}
    #     proba_thresh = {
    #         "Lead": 0.617628220048235, # 0.7
    #         "Position": 0.5404662917593531, # 0.55
    #         "Evidence": 0.5792470568116815, # 0.65
    #         "Claim": 0.5385829262728876, # 0.55
    #         "Concluding Statement": 0.6235012425556871, # 0.7
    #         "Counterclaim": 0.4975126082187205, # 0.5
    #         "Rebuttal": 0.5444709754299981, # 0.55
    #     }
    # Minimum span length (in words) for a span to influence its successor;
    # the trailing comments are the earlier values.
    min_thresh = {
        "Lead": 8, # 9
        "Position": 4, # 5
        "Evidence": 8, # 14
        "Claim": 1, # 3
        "Concluding Statement": 10, # 11
        "Counterclaim": 9,
        "Rebuttal": 2, # 4
    }
    # Minimum mean score for a span to influence its successor.
    proba_thresh = {
        "Lead": 0.5647964444385352,
        "Position": 0.6211823905704472,
        "Evidence": 0.6037415312070282,
        "Claim": 0.5655015619409717,
        "Concluding Statement": 0.5605916604200145,
        "Counterclaim": 0.5589241228663976,
        "Rebuttal": 0.6279143972926252
    }
    # Order spans by their first word index so "next span" means the next
    # one in the text.
    predict_df = sorted(predict_df, key=lambda x: x[3][0])
    # The loop iterates over a slice copy taken up front, so ``predict``
    # always carries a span's scores from before any rescaling done here.
    for i, predict in enumerate(predict_df[:-1]):
        cur_class = predict[1]
        if cur_class in convert_table.keys() and len(predict[3]) >= min_thresh[cur_class] and np.mean(predict[-1]) > proba_thresh[cur_class]:
            # Scale every per-word score of the following span.
            new_score = (np.array(predict_df[i+1][-1]) * convert_table[cur_class][predict_df[i+1][1]]).tolist()
            predict_df[i+1] = (predict_df[i+1][0], predict_df[i+1][1], predict_df[i+1][2], predict_df[i+1][3], new_score)

    # Flatten every span to strings for the output frame.
    for i in range(len(predict_df)):
        predict_df[i] = (predict_df[i][0], predict_df[i][1], ' '.join(
            predict_df[i][2]), ' '.join(str(x) for x in predict_df[i][3]), str(predict_df[i][4]))
    predict_df = pd.DataFrame(predict_df, columns=[
        'id', 'class', 'predict_text', 'predictionstring', 'score'])
    return predict_df


# {'min_Lead': 7, 'min_Position': 5, 'min_Evidence': 15, 'min_Claim': 3, 'min_Concluding': 6, 
# 'min_Counterclaim': 6, 'min_Rebuttal': 7, 'proba_Lead': 0.5525631492348035, 
# 'proba_Position': 0.5115443109231739, 'proba_Evidence': 0.5950038120901334, 
# 'proba_Claim': 0.5412112109901169, 'proba_Concluding': 0.6327829537768497, 
# 'proba_Counterclaim': 0.5032089227738017, 'proba_Rebuttal': 0.5479240354533439}
def do_threshold(submit_df, use=('length', 'probability')):
    """Drop predicted spans that are too short or too low-confidence.

    Args:
        submit_df: DataFrame with the columns ``id``, ``class`` and
            ``predictionstring`` (space-separated word indices).  When the
            probability filter is active it must also have ``score``, the
            string form of a list of per-word scores.
        use: Container naming the filters to apply: ``'length'`` and/or
            ``'probability'``.

    Returns:
        A new DataFrame restricted to ``id``, ``class`` and
        ``predictionstring`` holding the surviving rows (original index
        labels are kept).  ``submit_df`` itself is not modified.  Rows whose
        class has no entry in the threshold tables are never dropped.
    """
    df = submit_df.copy()
    df = df.fillna('')
    # Minimum number of words a span of each class must contain.
    min_thresh = {
        "Lead": 8, # 9
        "Position": 4, # 5
        "Evidence": 8, # 14
        "Claim": 1, # 3
        "Concluding Statement": 10, # 11
        "Counterclaim": 9,
        "Rebuttal": 2, # 4
    }
    # Minimum mean score a span of each class must reach.
    proba_thresh = {
        "Lead": 0.5647964444385352,
        "Position": 0.6211823905704472,
        "Evidence": 0.6037415312070282,
        "Claim": 0.5655015619409717,
        "Concluding Statement": 0.5605916604200145,
        "Counterclaim": 0.5589241228663976,
        "Rebuttal": 0.6279143972926252
    }
    if 'length' in use:
        n_words = df['predictionstring'].apply(lambda x: len(x.split()))
        # Classes without a threshold map to NaN; NaN comparisons are False,
        # so those rows are kept.
        limit = df['class'].map(min_thresh)
        df = df[~(n_words < limit)]

    if 'probability' in use:
        # NOTE(review): eval() executes the score string as Python code.  It
        # is tolerable only because the strings are produced by this
        # notebook's own span-extraction step; switch to a real parser if
        # scores can ever come from an external source.
        mean_score = df['score'].apply(lambda x: np.mean(eval(x)))
        # Compare against the full-precision threshold (formatting it with
        # '%f' would silently round it to six decimals).
        limit = df['class'].map(proba_thresh)
        df = df[~(mean_score < limit)]

    return df[['id', 'class', 'predictionstring']]

In [None]:
class Config:
    """Static settings shared by the inference pipeline."""

    # CSV whose ``id`` column lists the essays to predict.  Despite the
    # attribute name, the path points at the sample submission file.
    train_path = "../input/feedback-prize-2021/sample_submission.csv"

    # Sequence length handed to the validation dataset and the collate helper.
    fix_length = 2048

    # Number of label columns in the per-character / per-word probability arrays.
    num_labels = 15
    

In [None]:
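# Reference per-family weights (these match the commented-out flat `model_list`
# variants further down, not the active weights in the list below):
# funnel_large: 18, deberta_v3_large: 62, bigbird_roberta_large: 0, deberta_large: 97, longformer_large_4096: 35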
# Ensemble definition.  Each entry of `model_list` has the shape
#     ([(checkpoint_path, weight), ...], pretrained_dir)
# Every active checkpoint of a family receives an equal share of the family's
# total weight (second item of each tuple below).
model_list = [
    ([(ckpt, family_weight / len(ckpts)) for ckpt in ckpts], pretrained_dir)
    for ckpts, family_weight, pretrained_dir in [
        # deberta_xlarge
        (
            [
                "../input/fb-dxlarge/dx01.bin",
                "../input/fb-dxlarge/dx20.bin",
                "../input/fb-dxlarge/dx10_5fold.bin",
                "../input/fb-dxlarge/dx20_fold5.bin",
                "../input/fb-dxlarge/dx30_5fold.bin",
                "../input/fb-dxlarge/dx40_5fold.bin",
                # disabled: "../input/fb-dxlarge/dx30.bin" (was weight 78)
            ],
            91,
            "../input/deberta-xlarge",
        ),
        # funnel
        (
            [
                "../input/fb-model/f03.bin",
                "../input/fb-model/f10.bin",
            ],
            27,
            "../input/funneltransformerlarge",
        ),
        # deberta_v3
        (
            [
                "../input/fb-model/d10.bin",
                "../input/fb-model/d04.bin",
                # disabled (were weight 62/5 each): d50.bin, d70.bin, d90.bin
            ],
            39,
            "../input/deberta-v3-large/deberta-v3-large",
        ),
        # longformer
        (
            [
                "../input/tez-fb-large/model_0.bin",
                "../input/tez-fb-large/model_1.bin",
                "../input/tez-fb-large/model_2.bin",
                "../input/fblongformerlarge1536/model_3.bin",
                "../input/fblongformerlarge1536/model_4.bin",
                "../input/fb-model/l00.bin",
            ],
            55,
            "../input/longformerlarge4096/longformer-large-4096",
        ),
        # deberta
        (
            [
                "../input/fb-model/dl00.bin",
                "../input/fb-model/dl20.bin",
                # disabled (were weight 97/7 each): dl01.bin, dl21.bin,
                # dl40.bin, dl60.bin, dl80.bin
            ],
            45,
            "../input/deberta/large",
        ),
    ]
]



# model_list = [
#     ("../input/fb-model/f03.bin", "../input/funneltransformerlarge", 18),    # 0.6794
#     ("../input/fb-model/f10.bin", "../input/funneltransformerlarge", 18),
#     ("../input/fb-model/d10.bin", "../input/deberta-v3-large/deberta-v3-large", 62),
#     ("../input/fb-model/d04.bin", "../input/deberta-v3-large/deberta-v3-large", 62),
#     ("../input/tez-fb-large/model_0.bin", "../input/longformerlarge4096/longformer-large-4096", 35),
#     ("../input/tez-fb-large/model_1.bin", "../input/longformerlarge4096/longformer-large-4096", 35),
#     ("../input/fb-model/l00.bin", "../input/longformerlarge4096/longformer-large-4096", 35),
#     ("../input/fb-model/dl00.bin", "../input/deberta/large", 97),
#     ("../input/fb-model/dl20.bin", "../input/deberta/large", 97),
#     ("../input/fb-model/dl01.bin", "../input/deberta/large", 97),
#     ("../input/fb-model/dl21.bin", "../input/deberta/large", 97),
#     ("../input/fb-model/dl40.bin", "../input/deberta/large", 97),
#     ("../input/fb-model/dl60.bin", "../input/deberta/large", 97),
#     ("../input/fb-model/dl80.bin", "../input/deberta/large", 97),
# ]

# model_list = [
#     ("../input/fb-model/f03.bin", "../input/funneltransformerlarge", 1),    # 0.6794
#     ("../input/fb-model/f10.bin", "../input/funneltransformerlarge", 1),
#     ("../input/fb-model/d10.bin", "../input/deberta-v3-large/deberta-v3-large", 1),
#     ("../input/fb-model/d04.bin", "../input/deberta-v3-large/deberta-v3-large", 1),
#     ("../input/tez-fb-large/model_0.bin", "../input/longformerlarge4096/longformer-large-4096", 1),
#     ("../input/tez-fb-large/model_1.bin", "../input/longformerlarge4096/longformer-large-4096", 1),
#     ("../input/fb-model/l00.bin", "../input/longformerlarge4096/longformer-large-4096", 1),
#     ("../input/fb-model/dl00.bin", "../input/deberta/large", 1),
#     ("../input/fb-model/dl20.bin", "../input/deberta/large", 1),
# ]

In [None]:
# model_list = [
#     ("../input/fb-model/f03.bin", "../input/funneltransformerlarge", 18),    # 0.6794
#     ("../input/fb-model/d04.bin", "../input/deberta-v3-large/deberta-v3-large", 62),
#     ("../input/fb-model/l00.bin", "../input/longformerlarge4096/longformer-large-4096", 35),
#     ("../input/fb-model/dl00.bin", "../input/deberta/large", 97),
#     ("../input/fb-model/b00.bin","../input/bigbirdrobertalarge/bigbird-roberta-large", 35),
# ]

In [None]:
# Inference setup: gather the essays to score and initialise the ensemble
# bookkeeping used by the rest of this cell.
args = Config()
num_net = len(model_list)

# The CSV at args.train_path only supplies the essay ids; each essay's text is
# read from the matching .txt file in the test directory.  A comprehension is
# used so the builtin `id` is not rebound here and no file handle or scratch
# variable is left behind at module scope.
valid_id = pd.read_csv(args.train_path)["id"].unique()
df_text = pd.DataFrame(
    [
        (essay_id, Path(f"../input/feedback-prize-2021/test/{essay_id}.txt").read_text())
        for essay_id in valid_id
    ],
    columns=["id", "text"],
)

# Order the essays from shortest to longest text (by character count).
df_text['text_len'] = df_text['text'].apply(len)
df_text = df_text.sort_values('text_len').reset_index(drop=True)

# From here on `df` / `valid_id` follow the sorted order.
df = df_text
valid_id = df["id"].unique()
num_valid = len(valid_id)
print(f"num_valid: {num_valid}")

# Filled by the model loop: one result dict per model family, and one weight
# per checkpoint together with the running total of those weights.
results = []
weights = []
weight_sum = 0.0
# Multipliers applied to the averaged word-level probabilities right before
# span extraction: one factor per label column (15 values).
# NOTE(review): presumably ordered like `target_id_map` ids 0..14 - confirm.
magic_params = np.array([
    1.0151040677346772,
    0.9030902168873051,
    1.150794726450591,
    1.1071525118845413,
    0.8923895084086448,
    0.9156323986648087,
    0.8376600826469922,
    1.0863350013753394,
    0.8034240506162517,
    0.8838929237772057,
    1.167908517815809,
    1.0758516272886205,
    0.921439773646722,
    1.175718027875091,
    1.073194555851536,
])

# convert_table[current_class][next_class] is the factor by which the scores
# of the following span are multiplied once the current span passes its own
# length / probability thresholds.
convert_table = {
    'Concluding Statement': {
        'Lead': 0.6180546139059481,
        'Position': 1.106376532299193,
        'Evidence': 0.8308079139611104,
        'Claim': 0.9263323897452017,
        'Concluding Statement': 0.7427996729640948,
        'Counterclaim': 1.1255410558775625,
        'Rebuttal': 0.9443065711964579,
    },
    # 'Evidence': {'Lead': 0.6935170533407621, 'Position': 1.1604061898464963, 'Evidence': 1.053929298774972, 'Claim': 1.0154215894307475, 'Concluding Statement': 1.101286190449023, 'Counterclaim': 0.9458791358573785, 'Rebuttal': 1.0100877841954223},
    'Evidence': {
        'Lead': 0.7393941341170962,
        'Position': 1.0724463092198406,
        'Evidence': 1.0058653210627613,
        'Claim': 1.022313944728398,
        'Concluding Statement': 1.052367501549448,
        'Counterclaim': 1.0090474774696865,
        'Rebuttal': 1.0333100430656466,
    },
    'Rebuttal': {
        'Lead': 0.5399304201316384,
        'Position': 1.08856196786258,
        'Evidence': 1.0103291548166256,
        'Claim': 1.0847765835179881,
        'Concluding Statement': 0.9549480752091022,
        'Counterclaim': 1.092666136606834,
        'Rebuttal': 0.7110649743074405,
    },
    'Counterclaim': {
        'Lead': 0.9050794118735646,
        'Position': 0.9160850157706609,
        'Evidence': 1.1418506967909068,
        'Claim': 1.049570707730134,
        'Concluding Statement': 1.1146750148380493,
        'Counterclaim': 0.9746304751663061,
        'Rebuttal': 1.1594205982055437,
    },
    'Position': {
        'Lead': 0.8393628836469293,
        'Position': 0.792198366631391,
        'Evidence': 1.0359996942895344,
        'Claim': 1.0163778318063577,
        'Concluding Statement': 1.0834964446969553,
        'Counterclaim': 1.1085107194393697,
        'Rebuttal': 0.6460859598497333,
    },
}
# Run every checkpoint of every model family over the essays.
# Each `model` is (list of (checkpoint_path, weight), pretrained_dir).
for idx, model in enumerate(model_list):
#     weights.append(model[2])
#     weight_sum += model[2]
    # deberta-v3 goes through the dedicated fast-tokenizer helper; every other
    # family uses AutoTokenizer.
    if "deberta-v3" in model[1]:
        tokenizer = get_DebertaV2TokenizerFast(model[1])
    else:
        tokenizer = AutoTokenizer.from_pretrained(model[1])
    # Tokenisation is shared by all checkpoints of the same family.
    valid_samples = prepare_test_data_v2(df, tokenizer)
    valid_datasets = FeedbackDatasetValid(valid_samples, args.fix_length, tokenizer)
    collate = Collate(tokenizer, fix_length=args.fix_length)
    preds_list = []
    for idxx, model_path in enumerate(model[0]):
        # Weights are appended in iteration order; the ensembling loop later
        # walks them with a running counter, so this order must not change.
        weights.append(model_path[1])
        weight_sum += model_path[1]
        predicter = Predicter(args)
        predicter.set_pretrain(model[1])
        predicter.model_init()
        predicter.model_load(model_path[0])
        pred = predicter.predict(valid_datasets, collate)
        pred = np.concatenate(pred)
        # Store the predictions on disk and keep only the file name
        # (presumably to keep host memory low while other models run).
        np.save(f"tmp_{idx}_{idxx}.npy", pred)
        preds_list.append(f"tmp_{idx}_{idxx}.npy")
        # Release the model and predictions before the next checkpoint loads.
        del pred
        del predicter.model
        del predicter
        gc.collect()
        torch.cuda.empty_cache()
    # One result entry per family: prediction file names plus a copy of each
    # sample's token offset mapping.
    results.append({
        "probability": preds_list,
        "token_offset": [sample["offset_mapping"].copy() for sample in valid_samples],
    })
    del valid_samples
    del valid_datasets
    del tokenizer
    del collate
    gc.collect()
# Blend all checkpoint predictions per essay, turn them into spans, filter
# the spans and write the submission file.
submit_df = []
# Replace the stored file names with the prediction arrays themselves.
for i in range(len(results)):
    results[i]["probability"] = [np.load(item) for item in results[i]["probability"]]
time_list = [0,0,0,0,0]  # not used anywhere in the visible code
for i in tqdm(range(num_valid)):
    d = df_text.iloc[i]
    id =  d.id
    text = d.text
    word, word_offset = text_to_word(text)
    # Weighted sum of label scores for every character of the essay.
    token_to_text_probability = np.full((len(text),args.num_labels),0, np.float32)
    # `cnt` indexes `weights` in the same family/checkpoint order in which the
    # weights were appended during inference.
    cnt = 0
    for j in range(num_net):
        for k in range(len(results[j]["probability"])):
            # Drop the first row and rescale by 1/255
            # (presumably a leading special token and 0-255 encoded scores - confirm).
            p = results[j]['probability'][k][i][1:]/255
            # logging.info(p.shape)
            for t, (start, end) in enumerate(results[j]["token_offset"][i]):
                # Stop at fix_length - 1 tokens (presumably the number of rows
                # left in `p` after dropping the first one - confirm).
                if t==args.fix_length-1: break
                # Spread the token's weighted scores over its character span.
                token_to_text_probability[start: end] += p[t] * weights[cnt]
#             print(weights[cnt])
            cnt += 1
    # Normalise by the total checkpoint weight to get a weighted average.
    token_to_text_probability = token_to_text_probability / weight_sum

    # A word's scores are the mean of the scores of its characters.
    text_to_word_probability = np.full((len(word),args.num_labels),0, np.float32)
    for t,(start,end) in enumerate(word_offset):
        text_to_word_probability[t]=token_to_text_probability[start:end].mean(0)
#     predict_df = word_probability_to_predict_df(text_to_word_probability, id)
#     predict_df = word_probability_to_prediction_string(text_to_word_probability, id, word)
    # Rescale each label column by its multiplier before span extraction.
    text_to_word_probability = text_to_word_probability * magic_params
#     predict_df = word_probability_to_prediction_string_v4(text_to_word_probability, id, word)
    predict_df = word_probability_to_prediction_string_v5(text_to_word_probability, id, word, convert_table)
    submit_df.append(predict_df)
#     del token_to_text_probability, text_to_word_probability
#     gc.collect()
# Merge the per-essay frames, apply the length / probability filters and save.
submit_df = pd.concat(submit_df).reset_index(drop=True)
submit_df = do_threshold(submit_df, use=['length', 'probability'])
submit_df.to_csv("submission.csv", index=False)
# f1 = Utils.score_feedback_comp_micro

In [None]:
# Preview the first rows of the filtered submission frame.
submit_df.head()