In [None]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
# This must be done before importing transformers
import shutil
from pathlib import Path

# Where things live: the installed transformers package (patched in place) and
# the input dataset that ships the replacement tokenizer sources.
transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")
input_dir = Path("../input/deberta-v2-3-fast-tokenizer")
deberta_v2_path = transformers_path / "models" / "deberta_v2"

# Step 1: swap in the replacement conversion module. Any existing copy is
# removed first, then the new file is copied into the package directory.
convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name
if conversion_path.exists():
    conversion_path.unlink()
shutil.copy(convert_file, transformers_path)

# Step 2: swap in both DeBERTa-v2 tokenizer modules the same way.
for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']:
    filepath = deberta_v2_path / filename
    if filepath.exists():
        filepath.unlink()
    shutil.copy(input_dir / filename, filepath)

In [None]:
# general
import pandas as pd
import numpy as np
import os
import ast
import copy
import random
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import KFold
import gc
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")
# nlp
from sklearn.feature_extraction.text import CountVectorizer
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoModel, AutoTokenizer, AdamW, get_cosine_schedule_with_warmup
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

In [None]:
class Config:
    """Notebook-wide settings, read as plain class attributes (never instantiated)."""

    # --- run identity / reproducibility ---
    savename = "deberta-large"
    seed = 2022
    fold = 0
    # Flip to True for a shortened run: fewer folds and fewer epochs (see below).
    debug = False

    # --- paths ---
    output = "/content/model"
    input = "/content/data/"
    ner_csv = "/content/train_NER.csv"

    # --- model / tokenization ---
    model = "microsoft/deberta-large"
    max_len = 512
    stride = 128
    num_labels = 15

    # --- optimisation ---
    lr = 2.5e-5
    n_accum = 1
    accumulation_steps = 1
    batch_size = 4
    valid_batch_size = 4
    num_workers = 2
    apex = True

    # --- schedule (shortened when debug is on) ---
    n_folds = 2 if debug else 5
    epochs = 2 if debug else 6

    # --- hardware: use the GPU when one is visible, otherwise the CPU ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Tag vocabulary: 'O' (outside any span) followed by a B-/I- pair per discourse class.
# The position of a tag in this list is its integer class id.
output_labels = [
    'O',
    'B-Lead', 'I-Lead',
    'B-Position', 'I-Position',
    'B-Claim', 'I-Claim',
    'B-Counterclaim', 'I-Counterclaim',
    'B-Rebuttal', 'I-Rebuttal',
    'B-Evidence', 'I-Evidence',
    'B-Concluding Statement', 'I-Concluding Statement',
]

# Forward and reverse lookups; -100 is reserved for the extra "PAD" entry.
LABELS_TO_IDS = dict(zip(output_labels, range(len(output_labels))))
LABELS_TO_IDS["PAD"] = -100
IDS_TO_LABELS = dict(enumerate(output_labels))
IDS_TO_LABELS[-100] = "PAD"

In [None]:
def set_seed(seed=42):
    '''Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), puts cuDNN into deterministic mode and exports ``PYTHONHASHSEED``.
    This is for REPRODUCIBILITY.

    Args:
        seed: Integer seed applied to all generators.
    '''
    # `random` is imported by this notebook, so it has to be seeded as well;
    # otherwise anything drawing from it differs from run to run.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed. The interpreter reads this variable
    # at start-up, so it only influences Python processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Apply the notebook-wide seed from Config before any other work runs.
set_seed(Config.seed)

In [None]:
def load_df_test(test_dir='../input/feedback-prize-2021/test'):
    """Read every essay file in ``test_dir`` into a DataFrame.

    Args:
        test_dir: Directory holding one text file per essay. Defaults to the
            competition test folder.

    Returns:
        DataFrame with one row per file and three columns:
        ``id`` (file name with '.txt' removed), ``text`` (raw file content)
        and ``text_split`` (``text`` split on whitespace).
    """
    test_names, texts = [], []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and therefore every downstream index) reproducible.
    for f in sorted(os.listdir(test_dir)):
        test_names.append(f.replace('.txt', ''))
        # Context manager closes the handle instead of leaking one per essay.
        with open(os.path.join(test_dir, f), 'r', encoding='utf-8') as fh:
            texts.append(fh.read())
    df_test = pd.DataFrame({'id': test_names, 'text': texts})
    df_test['text_split'] = df_test.text.str.split()
    return df_test

In [None]:
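# Disabled alternative to load_df_test(): reads the train directory instead of the
# test directory and keeps only the first 10000 files listed.
# def load_df_test():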
#     test_names, df_test = [], []
#     for f in list(os.listdir('../input/feedback-prize-2021/train')):
#         test_names.append(f.replace('.txt', ''))
#         df_test.append(open('../input/feedback-prize-2021/train/' + f, 'r').read())
#     df_test = pd.DataFrame({'id': test_names[:10000], 'text': df_test[:10000]})
#     df_test['text_split'] = df_test.text.str.split()
#     return df_test

In [None]:
def tokenize(df, max_len, tokenizer, to_tensor=True, with_labels=False, stride=128):
    """Tokenize pre-split essays into fixed-length, overlapping chunks.

    Args:
        df: DataFrame with a ``text_split`` column (list of words per row).
            When ``with_labels`` is True it must also have an ``entities`` column.
        max_len: Passed to the tokenizer as ``max_length``; every chunk is
            padded or truncated to it.
        tokenizer: Callable tokenizer whose result supports ``word_ids(i)`` and
            exposes ``overflow_to_sample_mapping``.
        to_tensor: If True, return a plain dict of ``torch`` tensors; otherwise
            return the tokenizer's own result object with the extra keys added.
        with_labels: If True, add a ``labels`` entry per chunk.
            NOTE(review): this path calls ``get_labels``, which is not defined
            in the visible part of this notebook - confirm it exists before use.
        stride: Overlap between consecutive chunks, forwarded to the tokenizer.
            Defaults to 128, the value that used to be hard-coded here.

    Returns:
        The encoding with an added ``wids`` entry (word index per token, -1
        where the tokenizer reports no word) and, optionally, ``labels``.
    """
    # This is what's different from a longformer
    # Read the parameters with attention
    encoded = tokenizer(df['text_split'].tolist(),
                        is_split_into_words=True,
                        return_overflowing_tokens=True,
                        stride=stride,
                        max_length=max_len,
                        padding="max_length",
                        truncation=True)

    if with_labels:
        encoded['labels'] = []

    encoded['wids'] = []
    n_chunks = len(encoded['overflow_to_sample_mapping'])
    for i in range(n_chunks):
        # Map the chunk back to the DataFrame row it was cut from
        text_idx = encoded['overflow_to_sample_mapping'][i]

        # Word index of every token in this chunk (None where there is no word)
        word_ids = encoded.word_ids(i)

        if with_labels:
            # Word labels of the full un-chunked text
            word_labels = df['entities'].iloc[text_idx]

            # Labels associated with the word indexes of this chunk
            label_ids = get_labels(word_ids, word_labels)
            encoded['labels'].append(label_ids)

        # None cannot be stored in a tensor, so it is encoded as -1
        encoded['wids'].append([w if w is not None else -1 for w in word_ids])

    if to_tensor:
        encoded = {key: torch.as_tensor(val) for key, val in encoded.items()}
    return encoded

In [None]:
class FeedbackDatasetTest(Dataset):
    """Dataset view over a column-oriented mapping of tokenized chunks.

    ``tokenized_ds`` maps each field name to an indexable container; item ``i``
    is the dict made of the i-th element of every field.
    """

    def __init__(self, tokenized_ds):
        self.data = tokenized_ds

    def __len__(self):
        # The number of chunks is taken from the 'input_ids' field.
        return len(self.data['input_ids'])

    def __getitem__(self, index):
        return {field: values[index] for field, values in self.data.items()}

In [None]:
class FeedbackModel(nn.Module):
    """Transformer backbone with a multi-sample-dropout token classification head.

    The backbone is built from its config only (``AutoModel.from_config``), so
    its weights are not pretrained at construction time; callers are expected
    to load a state dict afterwards.

    Args:
        model_name: Name or path handed to ``AutoConfig.from_pretrained``.
        num_labels: Number of output classes per token.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.model_name = model_name
        self.num_labels = num_labels

        hidden_dropout_prob: float = 0.2
        layer_norm_eps: float = 17589e-7

        config = AutoConfig.from_pretrained(model_name)

        config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": hidden_dropout_prob,
                "layer_norm_eps": layer_norm_eps,
                "add_pooling_layer": False,
                "num_labels": self.num_labels,
            }
        )
        self.transformer = AutoModel.from_config(config=config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Five dropout rates feeding one shared linear head.
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.output = nn.Linear(config.hidden_size, self.num_labels)

    def forward(self, ids, mask, token_type_ids=None):
        """Run the backbone and the five dropout heads.

        Args:
            ids: Token id tensor passed to the backbone as ``input_ids``.
            mask: Attention mask tensor passed as ``attention_mask``.
            token_type_ids: Optional segment id tensor; forwarded only when given.

        Returns:
            Tuple ``(logits, logits1, ..., logits5)`` where ``logits`` is the
            mean of the five per-dropout outputs of the shared linear head.
        """
        backbone_inputs = {"input_ids": ids, "attention_mask": mask}
        # Compare against None: the truth value of a tensor with more than one
        # element is ambiguous and raises a RuntimeError.
        if token_type_ids is not None:
            backbone_inputs["token_type_ids"] = token_type_ids
        transformer_out = self.transformer(**backbone_inputs)

        sequence_output = self.dropout(transformer_out.last_hidden_state)

        # Same order as the attribute numbering so the returned tuple is unchanged.
        heads = (self.dropout1, self.dropout2, self.dropout3, self.dropout4, self.dropout5)
        per_head = [self.output(drop(sequence_output)) for drop in heads]

        logits = sum(per_head[1:], per_head[0]) / len(per_head)
        return (logits, *per_head)

In [None]:
def get_predictions(all_labels, all_scores, df):
    """Turn per-word tags into span predictions.

    Consecutive words carrying the same class form a span. A 'B-' tag opens a
    span and is treated as the matching 'I-' tag; only following 'I-' tags of
    that class extend it. A span is kept when it is longer than 7 words and the
    mean of its word scores exceeds the per-class threshold.

    Args:
        all_labels: One list of tag strings per row of ``df``.
        all_scores: One list of per-word scores per row of ``df``, aligned
            with ``all_labels``.
        df: DataFrame with an ``id`` column; row ``i`` corresponds to
            ``all_labels[i]`` / ``all_scores[i]``.

    Returns:
        DataFrame with columns ``id``, ``class`` and ``predictionstring``
        (space-separated word indices). Empty, but with those columns, when no
        span qualifies.
    """
    proba_thresh = {
        "Lead": 0.7,
        "Position": 0.55,
        "Evidence": 0.65,
        "Claim": 0.55,
        "Concluding Statement": 0.7,
        "Counterclaim": 0.5,
        "Rebuttal": 0.55,
    }
    final_preds = []
    doc_ids = df.id.values

    for i in range(len(df)):
        idx = doc_ids[i]
        pred = all_labels[i]
        score = all_scores[i]
        j = 0

        while j < len(pred):
            cls = pred[j]
            # Rewrite only the tag prefix so a 'B' inside a class name can never be touched.
            if cls.startswith('B-'):
                cls = 'I-' + cls[2:]
            end = j + 1
            while end < len(pred) and pred[end] == cls:
                end += 1
            if cls != 'O' and cls != '' and end - j > 7:
                class_name = cls.replace('I-', '')
                if np.mean(score[j:end]) > proba_thresh[class_name]:
                    final_preds.append((idx, class_name,
                                        ' '.join(map(str, range(j, end)))))
            j = end

    # Passing the column names to the constructor also works for zero rows;
    # assigning them afterwards raises on an empty frame.
    return pd.DataFrame(final_preds, columns=['id', 'class', 'predictionstring'])

def threshold(df):
    """Drop predicted spans shorter than their class-specific minimum length.

    Expects ``class`` and ``len`` columns. Rows whose class has no entry in the
    table are always kept. The input frame is left untouched; a new frame with
    the surviving rows (original index labels preserved) is returned.
    """
    min_thresh = {
        "Lead": 9,
        "Position": 5,
        "Evidence": 14,
        "Claim": 3,
        "Concluding Statement": 11,
        "Counterclaim": 6,
        "Rebuttal": 4,
    }

    # Per-row minimum length; NaN for unlisted classes, which compares False below.
    required_len = df["class"].map(min_thresh)
    too_short = df["len"] < required_len
    return df.drop(index=df.index[too_short])

In [None]:
@torch.no_grad()
def inference(model, weight, test_loader):
    """Load one checkpoint into ``model`` and yield its outputs batch by batch.

    This is a generator: nothing (including the checkpoint load) happens until
    the first item is requested.

    Args:
        model: Module whose forward returns a tuple with the averaged logits first.
        weight: Path to a state dict saved with ``torch.save``.
        test_loader: Iterable of batches with ``input_ids`` and ``attention_mask``.

    Yields:
        NumPy array with the model's first output for each batch. These are the
        raw head outputs; no softmax is applied here.
    """
    # Deserialize onto the CPU so checkpoints saved from a GPU also load on a
    # CPU-only machine; load_state_dict then copies into the model's own device.
    state_dict = torch.load(weight, map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()

    for data in tqdm(test_loader, total=len(test_loader)):
        input_ids = data["input_ids"].to(Config.device)
        input_mask = data["attention_mask"].to(Config.device)

        # Only the averaged logits are needed; the per-dropout heads are ignored.
        logits = model(input_ids, input_mask)[0]

        yield logits.detach().cpu().numpy()

In [None]:
def jn(pst, start, end):
    """Join ``pst[start:end]`` into a space-separated string of word indices.

    Entries equal to -1 are span separators inserted by ``link_evidence`` and
    are skipped, so they never end up inside a prediction string.
    """
    return " ".join(str(x) for x in pst[start:end] if x != -1)


def link_evidence(oof, gap_thresh=1, span_thresh=26):
    """Merge neighbouring "Evidence" spans of the same essay.

    Evidence spans of one essay are visited in row order. A span is fused with
    the one before it unless either
      * its first word lies more than ``gap_thresh`` after the previous span's
        last word, or
      * its first word lies more than ``span_thresh`` after the first word of
        the group being built.
    Rows of every other class are passed through unchanged.

    Args:
        oof: DataFrame with ``id``, ``class`` and ``predictionstring`` columns.
        gap_thresh: Largest allowed distance between two spans that are merged.
        span_thresh: Largest allowed distance from the group's first word to
            the first word of a span that is merged into it.

    Returns:
        Outer merge of the re-built Evidence rows with the non-Evidence rows.
        Columns present only on the input (e.g. a length column) are therefore
        missing (NaN) on the re-built Evidence rows.
    """
    evidence = oof[oof['class'] == "Evidence"]
    others = oof[oof['class'] != "Evidence"]

    merged = []
    # Iterating the unique ids directly also works for zero or one essay.
    for idv in oof['id'].unique():
        rows = evidence[evidence['id'] == idv]
        if len(rows) == 0:
            continue

        # Flatten all spans into one list; each span is preceded by a -1 separator.
        pst = []
        for pred_string in rows['predictionstring']:
            pst += [-1] + [int(x) for x in pred_string.split()]

        start = 1
        end = 1
        for i in range(2, len(pst)):
            end = i
            if pst[i] != -1:
                continue
            # At a separator: pst[i - 1] ends the previous span, pst[i + 1] begins the next.
            next_first = pst[i + 1]
            if next_first > pst[i - 1] + gap_thresh or next_first - pst[start] > span_thresh:
                merged.append((idv, "Evidence", jn(pst, start, end)))
                start = i + 1
        # Flush the group that is still open when the list ends.
        merged.append((idv, "Evidence", jn(pst, start, end + 1)))

    roof = pd.DataFrame(merged, columns=['id', 'class', 'predictionstring'])
    return roof.merge(others, how='outer')

In [None]:
def postprocess(test_preds, test_loader, test_df):
    """Convert chunk-level model outputs into ``submission.csv``.

    Args:
        test_preds: Sequence of per-batch arrays, indexed in the same order as
            the batches of ``test_loader``; the last axis holds one value per class.
        test_loader: Loader whose batches carry ``overflow_to_sample_mapping``
            (row of ``test_df`` each chunk belongs to) and ``wids`` (word index
            per token, -1 for tokens without a word).
        test_df: DataFrame with an ``id`` column, one row per essay.

    Returns:
        The final prediction DataFrame that was also written to
        ``submission.csv`` (columns ``id``, ``class``, ``predictionstring``).
    """
    predictions = defaultdict(list)
    prediction_scores = defaultdict(list)
    # Sets give constant-time "already seen" checks; a list here makes the
    # inner loop quadratic in the number of words per essay.
    seen_words_idx = defaultdict(set)

    for step, data in tqdm(enumerate(test_loader), total=len(test_loader)):
        pred = test_preds[step]
        batch_preds = np.argmax(pred, axis=-1)
        # NOTE(review): this is the maximum of whatever test_preds holds; if those
        # are raw logits rather than probabilities, the probability-style
        # thresholds applied later may not mean what they suggest - confirm.
        batch_scores = np.max(pred, axis=-1)
        sample_ids = data["overflow_to_sample_mapping"].tolist()

        for k, (chunk_preds, chunk_scores, text_id) in enumerate(zip(batch_preds, batch_scores, sample_ids)):
            # Word index of every token in this chunk
            word_ids = data['wids'][k].tolist()

            # Map from ids to labels
            chunk_labels = [IDS_TO_LABELS[i] for i in chunk_preds]

            seen = seen_words_idx[text_id]
            for idx, word_idx in enumerate(word_ids):
                if word_idx == -1 or word_idx in seen:
                    continue
                # First token seen for a word decides its label; later tokens and
                # overlapping chunks are ignored.
                predictions[text_id].append(chunk_labels[idx])
                prediction_scores[text_id].append(chunk_scores[idx])
                seen.add(word_idx)

    # Build the per-essay lists once, after all batches, and index them by row
    # position so an essay that produced no words still gets an (empty) entry
    # and later rows stay aligned with test_df.
    n_docs = len(test_df)
    final_predictions = [predictions.get(row, []) for row in range(n_docs)]
    final_scores = [prediction_scores.get(row, []) for row in range(n_docs)]

    df_pred = get_predictions(final_predictions, final_scores, test_df)
    df_pred["len"] = df_pred["predictionstring"].apply(lambda x: len(x.split()))
    df_pred = threshold(df_pred)
    df_pred = link_evidence(df_pred)
    df_pred = df_pred[["id", "class", "predictionstring"]]
    df_pred.to_csv("submission.csv", index = None)
    return df_pred

In [None]:
# Registry of models to ensemble: key -> backbone location, per-fold checkpoint
# paths, maximum sequence length and config file.
model_dict = {
    "deberta_v3_l": {
        "model_name": "../input/fb-deberta/deberta-v3-large/deberta-v3-large",
        "weights": [
            f"../input/fb-deberta/FB_debertav3-large_chunks/models/model_{fold}"
            for fold in range(5)
        ],
        "max_len": 512,
        "config_name": "../input/fb-deberta/deberta-v3-large/deberta-v3-large/config.json",
    },
    # Disabled second model:
    # "deberta_l": {
    #     "model_name": "../input/fb-deberta/deberta-large/deberta-large",
    #     "weights": [f"../input/fb-deberta/FB_deberta-large_chunks/models/model_{fold}" for fold in range(5)],
    #     "max_len": 512,
    #     "config_name": "../input/fb-deberta/deberta-large/deberta-large/config.json",
    # },
}

In [None]:
if __name__ == "__main__":
    # End-to-end run: load essays, predict with every checkpoint of every
    # registered model, average the outputs and write submission.csv.
    test_df = load_df_test()

    # Divide by the real number of checkpoints so the accumulated value is a
    # mean no matter how many models/folds are registered (5 for the current
    # registry, which matches the constant used previously).
    n_weights = sum(len(item["weights"]) for item in model_dict.values())

    counter = 0
    raw_preds = []
    for key, item in model_dict.items():
        print(f"Predicting {key}")
        tokenizer = DebertaV2TokenizerFast.from_pretrained(item["model_name"], add_prefix_space=True)
        tokenized_test = tokenize(test_df, max_len = item["max_len"], tokenizer = tokenizer)
        test_dataset = FeedbackDatasetTest(tokenized_test)
        # shuffle must stay off: batch order is what ties raw_preds to the loader.
        test_loader = torch.utils.data.DataLoader(test_dataset,
                                                    batch_size = 8,
                                                    num_workers = 2,
                                                    shuffle = False)
        model = FeedbackModel(item["model_name"], Config.num_labels)
        model.to(Config.device)
        for weight in item["weights"]:
            for idx, pred in enumerate(inference(model, weight, test_loader)):
                # float16 keeps the accumulated predictions small in memory.
                pred = pred.astype(np.float16) / n_weights
                if counter == 0:
                    raw_preds.append(pred)
                else:
                    raw_preds[idx] += pred
            counter += 1
        del model, tokenizer, test_dataset
        gc.collect()
        print(len(raw_preds))

    # NOTE(review): predictions are summed batch-by-batch across models and the
    # loader of the LAST model is reused below. That presumes every registered
    # model splits the essays into identical chunks - confirm before enabling a
    # second entry with a different tokenizer.
    postprocess(raw_preds, test_loader, test_df)

In [None]:
# Preview the first rows of the submission file written by postprocess().
pd.read_csv("submission.csv").head()