In [None]:
# Patch the installed `transformers` package with the fast DeBERTa v2/v3
# tokenizer files shipped in the `deberta-v2-3-fast-tokenizer` input dataset.
# This must run before `transformers` is imported, otherwise the already
# imported (unpatched) modules stay in use.
import importlib.util
import shutil
from pathlib import Path

# Locate the package on disk instead of assuming a fixed interpreter version.
# `find_spec` on a top-level name does not import the package, so the
# "patch before import" requirement still holds. The original hard-coded
# location is kept as a fallback when the package cannot be located.
try:
    _transformers_spec = importlib.util.find_spec("transformers")
except (ImportError, ValueError):
    _transformers_spec = None

if _transformers_spec is not None and _transformers_spec.origin:
    transformers_path = Path(_transformers_spec.origin).parent
else:
    transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")

input_dir = Path("../input/deberta-v2-3-fast-tokenizer")

# Replace the slow -> fast tokenizer conversion module.
convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path/convert_file.name

if conversion_path.exists():
    conversion_path.unlink()

shutil.copy(convert_file, transformers_path)
deberta_v2_path = transformers_path / "models" / "deberta_v2"

# Replace both DeBERTa v2 tokenizer modules (slow and fast implementation).
for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']:
    filepath = deberta_v2_path/filename
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir/filename, filepath)

In [None]:
import os
import gc
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
from torch.cuda.amp import autocast
from joblib import Parallel, delayed
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast

In [None]:
# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Label -> class index. Every discourse type has a "B-" (begin) and an "I-"
# (inside) tag; "O" marks tokens outside any discourse element and "PAD"
# carries the sentinel index -100.
target_id_map = {
    label: index
    for index, label in enumerate(
        (
            "B-Lead",
            "I-Lead",
            "B-Position",
            "I-Position",
            "B-Evidence",
            "I-Evidence",
            "B-Claim",
            "I-Claim",
            "B-Concluding Statement",
            "I-Concluding Statement",
            "B-Counterclaim",
            "I-Counterclaim",
            "B-Rebuttal",
            "I-Rebuttal",
            "O",
        )
    )
}
target_id_map["PAD"] = -100

# Reverse lookup: class index -> label.
id_target_map = dict(zip(target_id_map.values(), target_id_map.keys()))

In [None]:
class FeedbackDatasetTest:
    """Index-able dataset over pre-tokenized test samples.

    Each item is the sample's token ids wrapped in the tokenizer's CLS/SEP
    ids and cut so that the result never exceeds ``max_len`` tokens,
    together with an all-ones attention mask of the same length.
    """

    def __init__(self, samples, max_len, tokenizer):
        self.samples, self.max_len, self.tokenizer = samples, max_len, tokenizer
        self.length = len(samples)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        raw_ids = self.samples[idx]["input_ids"]

        # CLS goes first; keep at most max_len - 1 tokens so that the
        # closing SEP still fits inside max_len.
        with_cls = [self.tokenizer.cls_token_id] + raw_ids
        room = self.max_len - 1
        with_cls = with_cls[:room] if len(with_cls) > room else with_cls

        full_ids = with_cls + [self.tokenizer.sep_token_id]
        return {"ids": full_ids, "mask": [1] * len(full_ids)}

In [None]:
class FeedbackModel(nn.Module):
    """Transformer backbone followed by a per-token linear classifier.

    The backbone is built from the configuration found at ``model_name``;
    trained weights are expected to be loaded separately (``inference``
    calls ``load_state_dict``).
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.model_name = model_name
        self.num_labels = num_labels

        backbone_config = AutoConfig.from_pretrained(model_name)
        # Overrides applied on top of the stored configuration.
        overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.2,
            "layer_norm_eps": 17589e-7,
            "add_pooling_layer": False,
        }
        backbone_config.update(overrides)

        self.transformer = AutoModel.from_config(backbone_config)
        self.output = nn.Linear(backbone_config.hidden_size, self.num_labels)

    def forward(self, ids, mask):
        """Return per-token class probabilities (softmax over the last axis)."""
        hidden_states = self.transformer(ids, mask).last_hidden_state
        scores = self.output(hidden_states)
        return torch.softmax(scores, dim=-1)

In [None]:
def _prepare_test_data_helper(args, tokenizer, ids):
    test_samples = []
    for idx in ids:
        filename = os.path.join("../input/feedback-prize-2021", "test", idx + ".txt")
        with open(filename, "r") as f:
            text = f.read()

        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=False,
            return_offsets_mapping=True
        )
        input_ids = encoded_text["input_ids"]
        offset_mapping = encoded_text["offset_mapping"]

        sample = {
            "id": idx,
            "input_ids": input_ids,
            "text": text,
            "offset_mapping": offset_mapping,
        }

        test_samples.append(sample)
    return test_samples


def prepare_test_data(df_ids, tokenizer, args):
    """Tokenize all test essays using four worker processes.

    ``df_ids`` is split into four chunks, every chunk is handed to
    ``_prepare_test_data_helper`` in its own process, and the per-chunk
    results are concatenated in chunk order into a single list.
    """
    id_chunks = np.array_split(df_ids, 4)
    run_in_pool = Parallel(n_jobs=4, backend="multiprocessing")
    job = delayed(_prepare_test_data_helper)

    per_chunk = run_in_pool(job(args, tokenizer, chunk) for chunk in id_chunks)

    return [sample for chunk_samples in per_chunk for sample in chunk_samples]

In [None]:
class Collate:
    """Batch collator that pads ``ids``/``mask`` to a common width.

    Args:
        tokenizer: Provides ``padding_side`` ("right" pads at the end, any
            other value pads at the front) and ``pad_token_id``.
        pad_to: Minimum padded width. Defaults to 2048, the width that was
            previously hard-coded, so every batch keeps the same fixed shape
            as before for sequences up to that length.
    """

    def __init__(self, tokenizer, pad_to=2048):
        self.tokenizer = tokenizer
        self.pad_to = pad_to

    def __call__(self, batch):
        """Return ``{"ids": LongTensor, "mask": LongTensor}`` for ``batch``.

        ``batch`` is a list of dicts with list-valued ``"ids"`` and ``"mask"``.
        """
        ids = [sample["ids"] for sample in batch]
        masks = [sample["mask"] for sample in batch]

        # Pad to the fixed width, but never below the longest sequence of the
        # batch: with a fixed width only, a longer sequence was left unpadded
        # and the resulting ragged lists could not be converted to a tensor.
        longest = max((len(s) for s in ids), default=0)
        batch_max = max(self.pad_to, longest)

        pad_id = self.tokenizer.pad_token_id
        if self.tokenizer.padding_side == "right":
            ids = [s + (batch_max - len(s)) * [pad_id] for s in ids]
            masks = [s + (batch_max - len(s)) * [0] for s in masks]
        else:
            ids = [(batch_max - len(s)) * [pad_id] + s for s in ids]
            masks = [(batch_max - len(s)) * [0] + s for s in masks]

        # convert to tensors
        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(masks, dtype=torch.long),
        }

In [None]:
def jn(pst, start, end):
    """Join ``pst[start:end]`` into a space-separated prediction string.

    ``-1`` entries are skipped: ``link_evidence`` uses ``-1`` as a separator
    between consecutive predictions, and a separator that lies inside a
    merged span must not show up as a word index in the output.
    """
    return " ".join(str(x) for x in pst[start:end] if x != -1)


def link_evidence(oof, max_gap=1, max_span=26):
    """Merge neighbouring "Evidence" predictions of the same essay.

    Args:
        oof: DataFrame with the columns ``id``, ``class`` and
            ``predictionstring`` (space-separated word indices).
        max_gap: Two consecutive Evidence predictions stay separate when the
            first word index of the later one exceeds the last word index of
            the earlier one by more than this value.
        max_span: Two consecutive Evidence predictions also stay separate
            when the first word index of the later one lies more than this
            many words after the first word of the span being built.

    Returns:
        DataFrame with the (possibly merged) Evidence rows outer-merged with
        all non-Evidence rows of ``oof``.

    Notes:
        Rows are processed in the order they appear in ``oof``; every
        Evidence ``predictionstring`` is assumed to be non-empty.
    """
    # Only Evidence rows are linked; everything else passes through as is.
    # (The unused ``idu[1]`` lookup that used to sit here raised IndexError
    # whenever fewer than two ids were present.)
    eoof = oof[oof['class'] == "Evidence"]
    neoof = oof[oof['class'] != "Evidence"]

    retval = []
    for idv in oof['id'].unique():
        q = eoof[eoof['id'] == idv]
        if len(q) == 0:
            continue

        # Flatten all predictions of this essay into one list of word
        # indices, each prediction preceded by a -1 separator.
        pst = []
        for predictionstring in q['predictionstring']:
            pst.append(-1)
            pst.extend(int(x) for x in predictionstring.split())

        start = 1
        end = 1
        for i in range(2, len(pst)):
            end = i
            if pst[i] != -1:
                continue
            # At a separator: pst[i - 1] is the last word of the previous
            # prediction, pst[i + 1] the first word of the next one.
            next_first = pst[i + 1]
            if next_first > pst[i - 1] + max_gap or next_first - pst[start] > max_span:
                retval.append((idv, "Evidence", jn(pst, start, end)))
                start = i + 1
        # Flush the span that is still open at the end of the essay.
        retval.append((idv, "Evidence", jn(pst, start, end + 1)))

    roof = pd.DataFrame(retval, columns=['id', 'class', 'predictionstring'])
    return roof.merge(neoof, how='outer')

In [None]:
def get_submission(test_samples):
    """Turn token-level predictions into ``submission.csv``.

    Args:
        test_samples: List of dicts, each with the keys ``"id"``, ``"text"``,
            ``"offset_mapping"`` (one ``(start, end)`` character pair per
            token), ``"preds"`` (one label string per token) and
            ``"pred_scores"`` (one score per token).

    For every sample, consecutive tokens are grouped into phrases (a token
    followed by any number of matching ``I-<label>`` tokens, or a run of
    ``"O"`` tokens). A labelled phrase is kept when its mean token score
    reaches the per-class probability threshold and it covers at least the
    per-class minimum number of words. The kept rows are passed through
    ``link_evidence`` and written to ``submission.csv``.
    """
    # Minimum mean token score for a phrase of each class.
    proba_thresh = {
        "Lead": 0.687,
        "Position": 0.537,
        "Evidence": 0.637,
        "Claim": 0.537,
        "Concluding Statement": 0.687,
        "Counterclaim": 0.537,
        "Rebuttal": 0.537,
    }

    # Minimum number of words for a phrase of each class.
    min_thresh = {
        "Lead": 9,
        "Position": 5,
        "Evidence": 14,
        "Claim": 3,
        "Concluding Statement": 11,
        "Counterclaim": 6,
        "Rebuttal": 4,
    }

    submission = []
    for sample in test_samples:
        preds = sample["preds"]
        offset_mapping = sample["offset_mapping"]
        sample_id = sample["id"]
        sample_text = sample["text"]
        sample_pred_scores = sample["pred_scores"]

        # Tokens without a prediction (e.g. cut off by truncation) count as
        # "O" with score 0.
        n_tokens = len(offset_mapping)
        if len(preds) < n_tokens:
            preds = preds + ["O"] * (n_tokens - len(preds))
        if len(sample_pred_scores) < n_tokens:
            sample_pred_scores = sample_pred_scores + [0] * (n_tokens - len(sample_pred_scores))

        idx = 0
        phrase_preds = []
        while idx < n_tokens:
            # Take both offsets from the first token so a single-token phrase
            # gets its own end. Previously ``end`` was only assigned while
            # extending a phrase, so single-token phrases reused the end of
            # an earlier phrase (possibly from the previous sample).
            start, end = offset_mapping[idx]
            if preds[idx] != "O":
                label = preds[idx][2:]
                matching_label = f"I-{label}"
            else:
                label = "O"
                matching_label = "O"
            phrase_scores = [sample_pred_scores[idx]]
            idx += 1
            while idx < n_tokens and preds[idx] == matching_label:
                _, end = offset_mapping[idx]
                phrase_scores.append(sample_pred_scores[idx])
                idx += 1
            phrase_preds.append((start, end, label, phrase_scores))

        total_words = len(sample_text.split())
        temp_df = []
        for start, end, label, phrase_scores in phrase_preds:
            if label == "O":
                continue
            # Convert character offsets into whitespace-separated word indices.
            word_start = len(sample_text[:start].split())
            word_end = word_start + len(sample_text[start:end].split())
            word_end = min(word_end, total_words)
            word_ids = range(word_start, word_end)
            if sum(phrase_scores) / len(phrase_scores) >= proba_thresh[label]:
                if len(word_ids) >= min_thresh[label]:
                    ps = " ".join([str(x) for x in word_ids])
                    temp_df.append((sample_id, label, ps))

        temp_df = pd.DataFrame(temp_df, columns=["id", "class", "predictionstring"])
        submission.append(temp_df)

    submission = pd.concat(submission).reset_index(drop=True)
    submission = link_evidence(submission)
    submission.to_csv("submission.csv", index=False)

In [None]:
@torch.no_grad()
def inference(model, weight, test_loader):
    """Load a checkpoint into ``model`` and yield its output batch by batch.

    This is a generator: the checkpoint is loaded when the first batch is
    requested, not when ``inference`` is called.

    Args:
        model: Module called as ``model(ids, mask)``.
        weight: Path of the state-dict file to load.
        test_loader: Iterable with a length, yielding dicts with the tensor
            entries ``"ids"`` and ``"mask"``.

    Yields:
        The model output of each batch as a NumPy array.
    """
    # Deserialize onto the CPU first: load_state_dict copies the values into
    # the model's own (already placed) parameters, and a checkpoint saved
    # from a GPU stays loadable on a host without CUDA.
    state_dict = torch.load(weight, map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()

    for data in tqdm(test_loader, total=len(test_loader)):
        input_ids = data["ids"].to(device)
        input_mask = data["mask"].to(device)
        with autocast(enabled=True):
            logits = model(input_ids, input_mask)
        yield logits.cpu().detach().numpy()

In [None]:
# Ensemble definition: one entry per model family. Every entry names the
# model/tokenizer directory, its config file, the fold checkpoints to load
# and the maximum sequence length. Disabled members are kept as comments.
model_dict = {
    # "longformer_base1": {
    #     "model_name": "../input/fb-longformer/longformer-base/longformer-base",
    #     "config_name": "../input/fb-longformer/longformer-base/longformer-base/config.json",
    #     "weights": [f"../input/fb-longformer/FB_longformer-base/models/model_{fold}" for fold in [0, 1, 2]],
    #     "max_len": 1600,
    # },
    # "longformer_base2": {
    #     "model_name": "../input/fb-longformer/longformer-base/longformer-base",
    #     "config_name": "../input/fb-longformer/longformer-base/longformer-base/config.json",
    #     "weights": [f"../input/fb-longformer/FB_longformer-base-squadv2/models/model_{fold}" for fold in [3, 4]],
    #     "max_len": 1600,
    # },
    # "longformer_large1": {
    #     "model_name": "../input/fb-longformer/longformer-large/longformer-large",
    #     "config_name": "../input/fb-longformer/longformer-large/longformer-large/config.json",
    #     "weights": [f"../input/fb-longformer/FB_longformer-large-trivia/models/model_{fold}" for fold in range(3,5)],
    #     "max_len": 1600,
    # },
    # "longformer_large2": {
    #     "model_name": "../input/fb-longformer/longformer-large/longformer-large",
    #     "config_name": "../input/fb-longformer/longformer-large/longformer-large/config.json",
    #     "weights": [f"../input/fb-longformer/FB_longformer-large/models/model_{fold}" for fold in range(3)],
    #     "max_len": 1600,
    # },
    "deberta_large1": {
        "model_name": "../input/fb-deberta/deberta-large/deberta-large",
        "config_name": "../input/fb-deberta/deberta-large/deberta-large/config.json",
        "weights": [
            f"../input/fb-deberta/FB_deberta-large/models/model_{fold}"
            for fold in range(1, 5)
        ],
        "max_len": 1600,
    },
    # "deberta_largev3_1": {
    #     "model_name": "../input/fb-deberta/deberta-v3-large/deberta-v3-large",
    #     "config_name": "../input/fb-deberta/deberta-v3-large/deberta-v3-large/config.json",
    #     "weights": [f"../input/fb-deberta/FB_debertav3-large/models/model_{fold}" for fold in range(1,5)],
    #     "max_len": 1024,
    # },
    # "deberta_xlargev_1": {
    #     "model_name": "../input/fb-deberta/deberta-xlarge/deberta-xlarge",
    #     "config_name": "../input/fb-deberta/deberta-xlarge/deberta-xlarge/config.json",
    #     "weights": [f"../input/fb-deberta/FB_deberta-xlarge/models/model_{fold}" for fold in range(3,5)],
    #     "max_len": 1600,
    # },
    # "bigbird_base1": {
    #     "model_name": "../input/fb-bigbird/bigbird-base/bigbird-base",
    #     "config_name": "../input/fb-bigbird/bigbird-base/bigbird-base/config.json",
    #     "weights": [f"../input/fb-bigbird/FB_bigbird-base-trivia/models/model_{fold}" for fold in range(5)],
    #     "max_len": 4096,
    # },
}

In [None]:
# Driver: run every checkpoint of every configured model over the test set,
# average the per-token class probabilities and write the submission file.
if __name__ == "__main__":
    df = pd.read_csv(os.path.join("../input/feedback-prize-2021/", "sample_submission.csv"))
    df_ids = df["id"].unique()

    # Denominator of the ensemble average: one vote per checkpoint. Derived
    # from the configuration instead of a hard-coded 4 so the average stays
    # correct when models or folds are enabled/disabled in model_dict.
    n_weights = sum(len(item["weights"]) for item in model_dict.values())

    counter = 0
    raw_preds = []  # one accumulated probability array per batch
    for key, item in model_dict.items():
        print(f"Predicting {key}")
        tokenizer = AutoTokenizer.from_pretrained(item["model_name"])
        test_samples = prepare_test_data(df_ids, tokenizer, item)
        collate = Collate(tokenizer=tokenizer)
        test_dataset = FeedbackDatasetTest(test_samples,
                                           item["max_len"],
                                           tokenizer)
        test_loader = torch.utils.data.DataLoader(test_dataset,
                                                    batch_size = 8,
                                                    collate_fn = collate,
                                                    num_workers = 2,
                                                    shuffle = False)
        model = FeedbackModel(item["model_name"], 15)
        model.to(device)
        for weight in item["weights"]:
            test_preds = inference(model, weight, test_loader)
            for idx, pred in enumerate(test_preds):
                pred = pred.astype(np.float16) / n_weights
                # The first checkpoint creates the per-batch accumulators,
                # every later one adds its share to them.
                if counter == 0:
                    raw_preds.append(pred)
                else:
                    raw_preds[idx] += pred
            counter += 1
            # Released here so an entry without weights cannot trigger a
            # NameError in the cleanup below.
            del test_preds
        del model, tokenizer, test_dataset, test_loader
        gc.collect()

    # Winning class and its averaged probability for every token position.
    final_preds = []
    final_scores = []
    for rp in raw_preds:
        final_preds.extend(np.argmax(rp, axis=2).tolist())
        final_scores.extend(np.max(rp, axis=2).tolist())

    # NOTE(review): test_samples is the list built for the LAST entry of
    # model_dict; this presumably relies on all ensemble members sharing the
    # same tokenization - confirm before mixing model families.
    for j in range(len(test_samples)):
        # Position 0 is the CLS token added by FeedbackDatasetTest; drop it
        # so the predictions line up with the sample's offset_mapping.
        tt = [id_target_map[p] for p in final_preds[j][1:]]
        tt_score = final_scores[j][1:]
        test_samples[j]["preds"] = tt
        test_samples[j]["pred_scores"] = tt_score
    get_submission(test_samples)


In [None]:
# Sanity check: display the first rows of the submission.csv written by get_submission().
pd.read_csv("submission.csv").head()