# About this notebook
- Deberta-base starter code
- pip wheels are [here](https://www.kaggle.com/yasufuminakama/nbme-pip-wheels)
- Training notebook is [here](https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train)

If this notebook is helpful, feel free to upvote :)

# CFG

In [None]:
# Patch the installed transformers package with the DeBERTa v2/v3 fast
# tokenizer sources shipped in the input dataset.
# This has to run before `transformers` is imported anywhere in the notebook.
import shutil
from pathlib import Path

# Where the replacement files come from and where they are installed.
input_dir = Path("../input/debertav23fasttokenizer")
transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")
deberta_v2_path = transformers_path.joinpath("models", "deberta_v2")

# 1) Replace the slow->fast conversion module at the package root.
convert_file = input_dir.joinpath("convert_slow_tokenizer.py")
conversion_path = transformers_path.joinpath(convert_file.name)
if conversion_path.exists():
    conversion_path.unlink()
shutil.copy(convert_file, transformers_path)

# 2) Replace both DeBERTa v2 tokenizer modules (slow and fast).
for filename in ('tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py'):
    filepath = deberta_v2_path.joinpath(filename)
    if filepath.exists():
        filepath.unlink()
    shutil.copy(input_dir.joinpath(filename), filepath)

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings read as class attributes (checkpoint dir, backbone path, sizes)."""

    # data loading
    num_workers = 1
    batch_size = 24
    max_len = 466

    # file locations
    path = "../input/xlarge-5foldpsudo-4000/"
    tokenizer_path = "../input/xlarge-5foldpsudo-4000/"
    config_path = '../input/xlarge-5foldpsudo-4000/config.pth'
    model = "../input/deberta-xlarge/"

    # model head / reproducibility
    fc_dropout = 0.2
    seed = 42

    # folds whose checkpoints are used
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
class CFG1:
    """Inference settings read as class attributes (checkpoint dir, backbone path, sizes)."""

    # data loading
    num_workers = 1
    batch_size = 30
    max_len = 466

    # file locations
    path = "../input/large-5foldpsudo-v3/"
    config_path = '../input/large-5foldpsudo-v3/config.pth'
    model = "../input/debertalarge/"

    # model head / reproducibility
    fc_dropout = 0.2
    seed = 42

    # folds whose checkpoints are used
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    
class CFG2:
    """Inference settings read as class attributes (checkpoint dir, backbone path, sizes)."""

    # data loading
    num_workers = 1
    batch_size = 64
    max_len = 364

    # file locations
    path = "../input/v3-large-5fold-psudo-4000/"
    config_path = '../input/v3-large-5fold-psudo-4000/config.pth'
    model = "../input/deberta-v3-large/deberta-v3-large"

    # model head / reproducibility
    fc_dropout = 0.2
    seed = 42

    # folds whose checkpoints are used
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    
class CFG3:
    """Inference settings read as class attributes (checkpoint dir, backbone path, sizes)."""

    # data loading
    num_workers = 1
    batch_size = 30
    max_len = 358

    # file locations
    path = "../input/v2-xlarge-psudo-v2-400/"
    config_path = '../input/v2-xlarge-psudo-v2-400/config.pth'
    model = "../input/deberta-v2-xlarge"

    # model head / reproducibility
    fc_dropout = 0.2
    seed = 42

    # folds whose checkpoints are used
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    
class CFG4:
    """Inference settings read as class attributes (checkpoint dir, backbone path, sizes)."""

    # data loading
    num_workers = 1
    batch_size = 10
    max_len = 358

    # file locations
    path = "../input/d/datasets/xiamaozi11/v2-xxlarge/"
    config_path = '../input/d/datasets/xiamaozi11/v2-xxlarge/config.pth'
    model = "../input/deberta-v2-xxlarge"

    # model head / reproducibility
    fc_dropout = 0.2
    seed = 42

    # folds whose checkpoints are used (only two of the four here)
    n_fold = 4
    trn_fold = [0, 2]

class CFG5:
    """Inference settings read as class attributes (checkpoint dir, backbone path, sizes)."""

    # data loading
    num_workers = 1
    batch_size = 64
    max_len = 364

    # file locations
    path = "../input/v3-base-pretrained/"
    config_path = '../input/v3-base-pretrained/config.pth'
    model = "../input/deberta-v3-base/deberta-v3-base"

    # model head / reproducibility
    fc_dropout = 0.2
    seed = 42

    # folds whose checkpoints are used
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]

In [None]:
# # The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
# # This must be done before importing transformers
# import shutil
# from pathlib import Path

# transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")

# input_dir = Path("../input/deberta-v2-3-fast-tokenizer")

# convert_file = input_dir / "convert_slow_tokenizer.py"
# conversion_path = transformers_path/convert_file.name

# if conversion_path.exists():
#     conversion_path.unlink()

# shutil.copy(convert_file, transformers_path)
# deberta_v2_path = transformers_path / "models" / "deberta_v2"

# for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']:
#     filepath = deberta_v2_path/filename
#     if filepath.exists():
#         filepath.unlink()

#     shutil.copy(input_dir/filename, filepath)
# from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast


# Library

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# os.system('pip uninstall -y transformers')
# os.system('python -m pip install --no-index --find-links=../input/nbme-pip-wheels transformers')
import tokenizers
import transformers
# print(f"tokenizers.__version__: {tokenizers.__version__}")
# print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# tokenizer

In [None]:
# ====================================================
# tokenizer
# ====================================================
# Load one tokenizer per config and attach it to that config class.
# Each tokenizer is also kept under its own module-level name; the plain
# `tokenizer` global is read directly by convert_offsets_to_word_indices.
tokenizer = CFG.tokenizer = AutoTokenizer.from_pretrained("../input/xlarge-pretrained-15epoch")

tokenizer1 = CFG1.tokenizer = AutoTokenizer.from_pretrained("../input/large-5foldpsudo-v3")

# The DeBERTa v2/v3 configs use the fast tokenizer class explicitly.
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast

# NOTE(review): this directory differs from CFG2.path
# ("../input/v3-large-5fold-psudo-4000/") -- confirm that is intended.
tokenizer2 = CFG2.tokenizer = DebertaV2TokenizerFast.from_pretrained('../input/v3-large-psudo-2000')

tokenizer3 = CFG3.tokenizer = DebertaV2TokenizerFast.from_pretrained('../input/v2-xlarge-psudo-v2-400')

tokenizer4 = CFG4.tokenizer = DebertaV2TokenizerFast.from_pretrained('../input/d/datasets/xiamaozi11/v2-xxlarge')

tokenizer5 = CFG5.tokenizer = DebertaV2TokenizerFast.from_pretrained('../input/v3-base-pretrained')

# Helper functions for scoring

In [None]:
# From https://www.kaggle.com/theoviel/evaluation-metric-folds-baseline

def micro_f1(preds, truths):
    """
    Compute a single F1 score over all samples pooled together.

    Args:
        preds (list of lists of ints): Per-sample binary predictions.
        truths (list of lists of ints): Per-sample binary ground truths.

    Returns:
        float: f1 score of the concatenated arrays.
    """
    # "Micro": flatten every sample into one long vector before scoring.
    flat_preds = np.concatenate(preds)
    flat_truths = np.concatenate(truths)
    score = f1_score(flat_truths, flat_preds)
    return score


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans as [start, end) pairs.
        length (int, optional): Size of the output array. When None it is
            taken as the largest value found in ``spans`` (0 if there are
            no spans).

    Returns:
        np array [length]: Binarized spans.
    """
    if length is None:
        # np.max raises on an empty sequence, so no spans means a zero-length mask.
        length = int(np.max(spans)) if len(spans) else 0
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 computed on character masks built from spans.

    Args:
        preds (list of lists of two ints): Prediction spans, one list per sample.
        truths (list of lists of two ints): Ground truth spans, one list per sample.

    Returns:
        float: f1 score.
    """
    bin_preds, bin_truths = [], []
    for pred, truth in zip(preds, truths):
        # Samples with neither predicted nor true spans contribute nothing.
        if len(pred) or len(truth):
            pred_extent = np.max(pred) if len(pred) else 0
            truth_extent = np.max(truth) if len(truth) else 0
            # Both masks share one length so they can be compared elementwise.
            length = max(pred_extent, truth_extent)
            bin_preds.append(spans_to_binary(pred, length))
            bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)

In [None]:
def create_labels_for_scoring(df):
    """
    Build ground-truth [start, end] spans from the 'location' column.

    Side effect: adds (or overwrites) the column
    'location_for_create_labels' on ``df`` in place.

    Args:
        df (pd.DataFrame): Must have a 'location' column whose entries are
            iterables of strings such as '0 1' (or falsy when there is no
            annotation). Rows are addressed with ``df.loc[i, ...]`` for
            i in range(len(df)), so this presumably expects the default
            0..n-1 index -- TODO confirm against callers.

    Returns:
        list of lists of [int, int]: One list of [start, end] pairs per row.
    """
    # example: ['0 1', '3 4'] is joined into the single string '0 1;3 4'
    # Every row starts out with an empty list.
    df['location_for_create_labels'] = [ast.literal_eval(f'[]')] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, 'location']
        if lst:
            new_lst = ';'.join(lst)
            # NOTE(review): the value is double-wrapped ([["..."]]); presumably
            # so that pandas stores a one-element list in the cell -- verify.
            df.loc[i, 'location_for_create_labels'] = ast.literal_eval(f'[["{new_lst}"]]')
    # create labels
    truths = []
    for location_list in df['location_for_create_labels'].values:
        truth = []
        if len(location_list) > 0:
            # First element is the ';'-joined string of "start end" chunks.
            location = location_list[0]
            for loc in [s.split() for s in location.split(';')]:
                # Only the first two whitespace-separated fields are used.
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
    return truths


def get_char_probs(texts, predictions, tokenizer):
    """
    Spread per-token predictions over the characters each token covers.

    Args:
        texts (iterable of str): Input texts.
        predictions (iterable): Per-text sequences of token-level values,
            aligned with the tokenizer output for that text.
        tokenizer (callable): Called as ``tokenizer(text, add_special_tokens=True,
            return_offsets_mapping=True)``; must return a mapping with an
            'offset_mapping' entry of (start, end) character offsets.

    Returns:
        list of np arrays: One array of length ``len(text)`` per text.
    """
    char_probs = [np.zeros(len(text)) for text in texts]
    for buffer, text, token_preds in zip(char_probs, texts, predictions):
        offsets = tokenizer(text,
                            add_special_tokens=True,
                            return_offsets_mapping=True)['offset_mapping']
        # zip stops at the shorter of the two, so extra predictions are ignored.
        for span, token_pred in zip(offsets, token_preds):
            buffer[span[0]:span[1]] = token_pred
    return char_probs


def get_results(char_probs, th=0.5):
    """
    Turn per-character scores into "a b;c d" span strings.

    Args:
        char_probs (iterable of np arrays): Per-text character scores.
        th (float): Characters scoring >= th are selected.

    Returns:
        list of str: For each text, the runs of consecutive selected
        positions written as "first last" and joined with ';'
        (empty string when nothing is selected).
    """
    results = []
    for char_prob in char_probs:
        # Selected positions, shifted by one (1-based).
        hits = np.where(char_prob >= th)[0] + 1
        if hits.size == 0:
            runs = []
        else:
            # A new run starts wherever two neighbouring positions are not adjacent.
            breaks = np.where(np.diff(hits) != 1)[0] + 1
            runs = np.split(hits, breaks)
        # NOTE(review): both ends carry the +1 shift, so a single selected
        # character yields "k k"; confirm the start is meant to be shifted too.
        pieces = [f"{run[0]} {run[-1]}" for run in runs]
        results.append(";".join(pieces))
    return results
def convert_offsets_to_word_indices(preds_offsets,texts, case_nums, feature_nums, th=0.5):
    """
    Convert per-text score arrays into "start end;start end" span strings.

    For every text the function tokenizes it, reads one score per token,
    binarizes the scores with a threshold that depends on case_num /
    feature_num, groups consecutive tokens with the same binary label into
    phrases, and keeps the positive phrases whose mean score is >= th.

    Note: despite the name, the returned spans are character offsets taken
    from the tokenizer's offset mapping, not word indices.

    Uses the module-level ``tokenizer`` and ``CFG.max_len`` (neither is a
    parameter).

    Args:
        preds_offsets (iterable): One score sequence per text. It is indexed
            with each token's start character offset, so it is presumably
            character-level -- TODO confirm against the caller.
        texts (iterable of str): Texts to tokenize.
        case_nums (iterable): Case number per text (compared as strings).
        feature_nums (iterable): Feature number per text (compared as strings).
        th (float): Minimum mean phrase score for a positive phrase to be kept.

    Returns:
        list of str: One ';'-joined "start end" string per text ('' if none).
    """
    predicts = []
    for text,preds,case_num, feature_num in zip(texts,preds_offsets, case_nums, feature_nums):
#     text = oof['pn_history'][num]
#         print(text)
#         try:
            encoded_text = tokenizer(text,
                                         add_special_tokens=True,
                                         max_length=CFG.max_len,
                                         padding="max_length",
                                         return_offsets_mapping=True)
            offset_mapping = encoded_text['offset_mapping']
            # Index of the first separator token; only tokens before it are scored.
            sep_index = encoded_text["input_ids"].index(tokenizer.sep_token_id)
#             sample_pred_scores = preds
            result=np.zeros(len(preds))
#             results = np.zeros(sep_index)
            
            # One score per token: the value of preds at the token's start offset.
            results = np.zeros(sep_index)
            for idx, (offset, pred) in enumerate(zip(offset_mapping[:sep_index], preds)):
                    start = offset[0]
            #         end = offset_mapping[1]
                    results[idx] = preds[start]
            sample_pred_scores = results
            # Binarize with a threshold chosen per (case_num, feature_num);
            # the first matching branch wins, 0.47 is the fallback.
            if str(feature_num)[-1] == '3' and (str(case_num)=='0' or str(case_num)=='3' ):
                result = [1 if s >= 0.54  else 0 for s in results]
            elif    str(feature_num)[-1] == '3' and (str(case_num)=='1' ):
                result = [1 if s >= 0.45 else 0 for s in results]
            elif    str(feature_num)[-1] == '3' and   str(case_num)=='6':
                result = [1 if s >= 0.52 else 0 for s in results]
            elif str(case_num)=='5' and ( str(feature_num) == '503' ):
                result = [1 if s >= 0.49  else 0 for s in results]
            elif str(case_num)=='5' and ( str(feature_num) == '504' ):
                result = [1 if s >= 0.4  else 0 for s in results]
            elif str(case_num)=='5' and ( str(feature_num) == '508' ):
                result = [1 if s >= 0.49 else 0 for s in results]
            elif str(case_num)=='5' and ( str(feature_num) == '509' ):
                result = [1 if s >= 0.4 else 0 for s in results]
            elif str(case_num)=='5' and ( str(feature_num) == '510' ):
                result = [1 if s >=0.55 else 0 for s in results]
            elif str(case_num)=='5' and ( str(feature_num) == '511' ):
                result = [1 if s >=0.55 else 0 for s in results]
            elif str(case_num)=='5' and ( str(feature_num) == '512' ):
                result = [1 if s >=0.45 else 0 for s in results]
            elif str(case_num)=='5' and ( str(feature_num) == '513' ):
                result = [1 if s >=0.59 else 0 for s in results]
            elif str(case_num)=='5' and ( str(feature_num) == '514' ):
                result = [1 if s >=0.49 else 0 for s in results]
            elif str(case_num)=='5' and ( str(feature_num) == '516' ):
                result = [1 if s >=0.41 else 0 for s in results]
            elif str(case_num)=='7' and ( str(feature_num) == '702' ):
                result = [1 if s >=0.56 else 0 for s in results]
#             elif str(case_num)=='9':
#                 result = [1 if s >=0.54 else 0 for s in results]
#             elif str(case_num)=='0':
#                 result = [1 if s >=0.4 else 0 for s in results]
#             elif str(case_num)=='3':
#                 result = [1 if s >=0.52 else 0 for s in results]
#             elif str(case_num)=='4':
#                 result = [1 if s >=0.52 else 0 for s in results]
#             elif str(case_num)=='7':
#                 result = [1 if s >=0.55 else 0 for s in results]
            else:

                result = [1 if s >= 0.47  else 0 for s in results]

#             result = [1 if s >= 0.47  else 0 for s in results]
            # From here on `preds` holds the 0/1 token labels, not the raw scores.
            preds = result
            
            sample_text = text
            sample_input_ids = encoded_text["input_ids"]

            sample_preds = []

            # Pad labels and scores with zeros up to the full token count.
            if len(preds) < len(offset_mapping):
                preds = preds + [0] * (len(offset_mapping) - len(preds))
                sample_pred_scores = list(sample_pred_scores) + [0] * (len(offset_mapping) - len(sample_pred_scores))

            # Walk the tokens and cut them into runs that share the same label.
            idx = 0
            phrase_preds = []
            while idx < sep_index:
                start, _ = offset_mapping[idx]
            #     print(start,idx)
                if preds[idx] != 0:
                    label = preds[idx]
                else:
                    label = 0
                phrase_scores = []
                # NOTE(review): the first token's score is appended here and
                # again inside the inner loop, so it counts twice in the mean.
                phrase_scores.append(sample_pred_scores[idx])
            #     idx += 1
                while idx < sep_index:
                    if label == 0:
                        matching_label = 0
                    else:
                        matching_label = 1
                    if preds[idx] == matching_label:
                        _, end = offset_mapping[idx]
                        phrase_scores.append(sample_pred_scores[idx])
                        idx += 1
                    else:
                        break
                # `end` only exists once the inner loop has matched a token;
                # once set it stays defined for later iterations and texts.
                if "end" in locals():
                    phrase = sample_text[start:end]
                    phrase_preds.append((phrase, start, end, label, phrase_scores))
            temp = []
            for phrase_idx, (phrase, start, end, label, phrase_scores) in enumerate(phrase_preds):
                newlabel = ''
                # Word positions are computed, but only `word_end != 0` is used below.
                word_start = len(sample_text[:start].split())
                word_end = word_start + len(sample_text[start:end].split())
                word_end = min(word_end, len(sample_text.split()))
                ps = " ".join([str(x) for x in range(word_start, word_end)])

                if label != 0:
                    if sum(phrase_scores) / len(phrase_scores) >= th and word_end !=0:
                        # Drop a leading space from the span.
                        if start!=0 and sample_text[start]==' ':
                            start+=1

                        temp.append(np.array([start,end]))
            # Both branches produce a list of "start end" strings; with an
            # empty `temp` the first branch simply yields an empty list.
            if len(temp)==0:
                temp = [list(g) for _, g in itertools.groupby(temp, key=lambda n, c=itertools.count(): n - next(c))]
                temp = [f"{min(r)} {max(r)}" for r in temp]
            #                 print(temp)
            else:
            #                 print(temp)
            #                 print(type(temp[0][0]))
                temp = [f"{min(r)} {max(r)}" for r in temp]
            #                 print(temp)
            predict = ";".join(temp)
            predicts.append(predict)
    return predicts

def get_predictions(results):
    """
    Parse "a b;c d" span strings into lists of [start, end] integer pairs.

    Args:
        results (iterable of str): One ';'-joined span string per sample;
            an empty string means no spans.

    Returns:
        list of lists of [int, int]: Parsed spans per sample.
    """
    predictions = []
    for result in results:
        if result == "":
            predictions.append([])
            continue
        spans = []
        for chunk in result.split(';'):
            fields = chunk.split()
            # Only the first two fields of each chunk are read.
            spans.append([int(fields[0]), int(fields[1])])
        predictions.append(spans)
    return predictions

# Utils

In [None]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    """Return span_micro_f1 of the two span collections, passed through in the given order."""
    return span_micro_f1(y_true, y_pred)


def get_logger(filename='inference'):
    """
    Return this module's logger, writing bare messages to the console and to a file.

    Safe to call repeatedly (e.g. when a notebook cell is re-run): a console
    handler and a file handler for ``<filename>.log`` are only attached if
    the logger does not already have them, so messages are not duplicated.

    Args:
        filename (str): Log file path without the ".log" suffix.

    Returns:
        logging.Logger: Logger named after this module, at INFO level.
    """
    import os
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # FileHandler stores the absolute path, so compare against that form.
    log_path = os.path.abspath(f"{filename}.log")
    # FileHandler subclasses StreamHandler; exclude it when looking for the console handler.
    has_console = any(
        isinstance(h, StreamHandler) and not isinstance(h, FileHandler)
        for h in logger.handlers
    )
    has_file = any(
        isinstance(h, FileHandler) and h.baseFilename == log_path
        for h in logger.handlers
    )

    if not has_console:
        handler1 = StreamHandler()
        handler1.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler1)
    if not has_file:
        handler2 = FileHandler(filename=f"{filename}.log")
        handler2.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    """
    Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed (int): Seed applied to every generator; also exported as
            the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Same seed for every library-level generator.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# OOF

In [None]:
# ====================================================
# oof
# ====================================================
# oof = pd.read_pickle(CFG.tokenizer_path+'/oof_df.pkl')

# truths = create_labels_for_scoring(oof)
# char_probs = get_char_probs(oof['pn_history'].values,
#                             oof[[i for i in range(CFG.max_len)]].values, 
#                             CFG.tokenizer)
best_th = 0.5
best_score = 0.
# for th in np.arange(0.45, 0.55, 0.01):
#     th = np.round(th, 2)
#     results = get_results(char_probs, th=th)
#     preds = get_predictions(results)
#     score = get_score(preds, truths)
#     if best_score < score:
#         best_th = th
#         best_score = score
#     LOGGER.info(f"th: {th}  score: {score:.5f}")
# LOGGER.info(f"best_th: {best_th}  score: {best_score:.5f}")

In [None]:
# oof = pd.read_pickle('../input/nnme-deberta-large/deberta-largeoof_df.pkl')

# Data Loading

In [None]:
# ====================================================
# Data Loading
# ====================================================
test = pd.read_csv('../input/nbme-score-clinical-patient-notes/test.csv')
submission = pd.read_csv('../input/nbme-score-clinical-patient-notes/sample_submission.csv')
features = pd.read_csv('../input/nbme-score-clinical-patient-notes/features.csv')
def preprocess_features(features):
    """
    Overwrite one feature_text entry in place and return the same frame.

    Args:
        features (pd.DataFrame): Frame with a 'feature_text' column; the row
            labelled 27 is replaced.

    Returns:
        pd.DataFrame: The same (mutated) ``features`` object.
    """
    row_label, column = 27, 'feature_text'
    features.loc[row_label, column] = "Last-Pap-smear-1-year-ago"
    return features
features = preprocess_features(features)
patient_notes = pd.read_csv('../input/nbme-score-clinical-patient-notes/patient_notes.csv')

print(f"test.shape: {test.shape}")
display(test.head())
print(f"features.shape: {features.shape}")
display(features.head())
print(f"patient_notes.shape: {patient_notes.shape}")
display(patient_notes.head())

In [None]:
test = test.merge(features, on=['feature_num', 'case_num'], how='left')
test = test.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
display(test.head())

# Dataset

In [None]:
# ====================================================
# Dataset
# ====================================================
# def prepare_input(cfg, text, feature_text):
#     inputs = cfg.tokenizer(text, feature_text, 
#                            add_special_tokens=True,
#                            max_length=cfg.max_len,
#                            padding="max_length",
#                            return_offsets_mapping=False)
#     for k, v in inputs.items():
#         inputs[k] = torch.tensor(v, dtype=torch.long)
#     return inputs


# class TestDataset(Dataset):
#     def __init__(self, cfg, df):
#         self.cfg = cfg
#         self.feature_texts = df['feature_text'].values
#         self.pn_historys = df['pn_history'].values

#     def __len__(self):
#         return len(self.feature_texts)

#     def __getitem__(self, item):
#         inputs = prepare_input(self.cfg, 
#                                self.pn_historys[item], 
#                                self.feature_texts[item])
#         return inputs

In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text, feature_text, batch_max_len):
    """
    Tokenize a (text, feature_text) pair and convert every field to a long tensor.

    Args:
        cfg: Object exposing a callable ``tokenizer`` attribute.
        text (str): First sequence.
        feature_text (str): Second sequence.
        batch_max_len (int): Length every encoding is padded to.

    Returns:
        The tokenizer's output mapping, with each value replaced in place
        by a ``torch.long`` tensor.
    """
    encoding = cfg.tokenizer(text, feature_text,
                             add_special_tokens=True,
                             max_length=batch_max_len,
                             padding="max_length",
                             return_offsets_mapping=False)
    # Snapshot the keys first; the values are swapped for tensors in place.
    for field in list(encoding.keys()):
        encoding[field] = torch.tensor(encoding[field], dtype=torch.long)
    return encoding


class TestDataset(Dataset):
    """Dataset yielding tokenized (pn_history, feature_text) pairs padded per row."""

    def __init__(self, cfg, df):
        """
        Args:
            cfg: Settings object handed to ``prepare_input``.
            df (pd.DataFrame): Needs 'feature_text', 'pn_history' and
                'batch_max_length' columns.
        """
        self.cfg = cfg
        # Pull the three columns out as arrays, in this order.
        self.feature_texts, self.pn_historys, self.batch_max_len = (
            df[column].values
            for column in ('feature_text', 'pn_history', 'batch_max_length')
        )

    def __len__(self):
        """Number of rows."""
        return len(self.feature_texts)

    def __getitem__(self, item):
        """Tokenize row ``item``, padded to that row's batch_max_length."""
        note = self.pn_historys[item]
        feature = self.feature_texts[item]
        pad_to = self.batch_max_len[item]
        return prepare_input(self.cfg, note, feature, pad_to)

# Model

In [None]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """Transformer backbone followed by dropout and a single-logit linear head."""

    def __init__(self, cfg, config_path=None, pretrained=False):
        """
        Args:
            cfg: Settings object; ``cfg.model`` and ``cfg.fc_dropout`` are read.
            config_path (str, optional): If given, the config is loaded with
                ``torch.load`` instead of ``AutoConfig.from_pretrained``.
            pretrained (bool): Load backbone weights from ``cfg.model`` when
                true; otherwise build the backbone from the config only.
        """
        super().__init__()
        self.cfg = cfg
        self.config = (
            torch.load(config_path)
            if config_path is not None
            else AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        )
        self.model = (
            AutoModel.from_pretrained(cfg.model, config=self.config)
            if pretrained
            else AutoModel.from_config(self.config)
        )
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm module in place; others are left alone."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and return the first element of its output."""
        return self.model(**inputs)[0]

    def forward(self, inputs):
        """Return one logit per position: fc(dropout(feature(inputs)))."""
        hidden = self.feature(inputs)
        return self.fc(self.fc_dropout(hidden))

In [None]:
class CustomModel1(nn.Module):
    """Transformer backbone with a shared linear head averaged over five dropout rates."""

    def __init__(self, cfg, config_path=None, pretrained=False):
        """
        Args:
            cfg: Settings object; ``cfg.model`` is read.
            config_path (str, optional): If given, the config is loaded with
                ``torch.load`` instead of ``AutoConfig.from_pretrained``.
            pretrained (bool): Accepted but not read here; the backbone is
                always built with ``AutoModel.from_pretrained``.
        """
        super().__init__()
        self.cfg = cfg
        self.config = (
            torch.load(config_path)
            if config_path is not None
            else AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        )
        hidden_dropout_prob: float = 0.1
        layer_norm_eps: float = 1e-7
        overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": hidden_dropout_prob,
            "layer_norm_eps": layer_norm_eps,
            "add_pooling_layer": False,
        }
        self.config.update(overrides)
        self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.output = nn.Linear(self.config.hidden_size, 1)

        # Registered but not used by forward(); it still contributes
        # parameters (and state_dict keys) to the module.
        LSTM_SIZE = 1024
        self.Lstm = nn.LSTM(
            self.config.hidden_size * 3,
            LSTM_SIZE // 2,
            bidirectional=True,
            batch_first=True,
        )

        self._init_weights(self.output)

    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm module in place; others are left alone."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and return the first element of its output."""
        return self.model(**inputs)[0]

    def forward(self, inputs):
        """Average the head's logits over the five dropout branches."""
        encoder_out = self.model(**inputs)
        # Last entry of the hidden-state sequence, then the base dropout.
        hidden = self.dropout(encoder_out.hidden_states[-1])

        branches = (self.dropout1, self.dropout2, self.dropout3,
                    self.dropout4, self.dropout5)
        # Accumulate left to right, starting from the first branch.
        total = self.output(branches[0](hidden))
        for branch in branches[1:]:
            total = total + self.output(branch(hidden))
        return total / 5

# inference

In [None]:
# ====================================================
# inference
# ====================================================
# def inference_fn(test_loader, model, device):
#     preds = []
#     model.eval()
#     model.to(device)
#     tk0 = tqdm(test_loader, total=len(test_loader))
#     for inputs in tk0:
#         for k, v in inputs.items():
#             inputs[k] = v.to(device)
#         with torch.no_grad():
#             y_preds = model(inputs)
#         preds.append(y_preds.sigmoid().to('cpu').numpy())
#     predictions = np.concatenate(preds)
#     return predictions

In [None]:
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device,cfg):
    """Run ``model`` over ``test_loader`` and return sigmoid outputs padded to ``cfg.max_len``.

    The model is switched to eval mode and moved to ``device``. Every tensor in a
    batch dict is moved to ``device`` in place, the model is called without
    gradient tracking, and the sigmoid of its output is written into the leading
    positions of a zero array of shape ``(batch, cfg.max_len, 1)``. The per-batch
    arrays are concatenated along the first axis and returned.
    """
    model.eval()
    model.to(device)

    padded_batches = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        n_rows = len(inputs['input_ids'])
        for key in list(inputs):
            inputs[key] = inputs[key].to(device)

        with torch.no_grad():
            logits = model(inputs)
        probs = logits.sigmoid().to('cpu').numpy()

        # Batches may be shorter than cfg.max_len; the tail stays zero.
        padded = np.zeros((n_rows, cfg.max_len, 1))
        padded[:, :probs.shape[1]] = probs
        padded_batches.append(padded)

    return np.concatenate(padded_batches)

In [None]:
###### Reduce Padding Inference ######

# Token count of every (pn_history, feature_text) pair under the CFG4 tokenizer.
input_lengths = []
tk0 = tqdm(zip(test['pn_history'].fillna("").values, test['feature_text'].fillna("").values), total=len(test))
for text, feature_text in tk0:
    length = len(CFG4.tokenizer(text, feature_text, add_special_tokens=True)['input_ids'])
    input_lengths.append(length)
test['input_lengths'] = input_lengths
# Negated lengths make argsort return the longest rows first.
length_sorted_idx = np.argsort([-len_ for len_ in input_lengths])

# sort dataframe; .copy() gives an independent frame so the column added below
# is not a chained assignment on a selection of `test`
sort_df = test.iloc[length_sorted_idx].copy()

# calc max_len per batch
sorted_input_length = sort_df['input_lengths'].values
batch_max_length = np.zeros_like(sorted_input_length)
bs = CFG4.batch_size
# Iterate over real batch starts only. range(len // bs + 1) produced an empty
# trailing slice whenever the row count was a multiple of bs (or zero), and
# np.max raises ValueError on an empty array.
for batch_start in range(0, len(sorted_input_length), bs):
    batch_end = batch_start + bs
    batch_max_length[batch_start:batch_end] = np.max(sorted_input_length[batch_start:batch_end])
sort_df['batch_max_length'] = batch_max_length

In [None]:
# Loader over the length-sorted frame, in fixed order (shuffle=False), so the
# predictions can be mapped back to the original rows afterwards.
test_dataset = TestDataset(CFG4, sort_df)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG4.batch_size,
                         shuffle=False,
                         num_workers=CFG4.num_workers, pin_memory=True, drop_last=False)
predictions_v2_xxlarge = []
for fold in CFG4.trn_fold:
    model = CustomModel1(CFG4, config_path=CFG4.config_path, pretrained=False)

    # Deserialize onto the CPU. Without map_location, torch.load restores tensors
    # onto the device they were saved from, which fails on a host without that
    # device and can keep a second copy of the weights on the GPU while
    # inference_fn runs. inference_fn moves the model to `device` itself.
    state = torch.load(CFG4.path+f"user_data-deberta-v2-xxlarge-pretrained-20epoch_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))

    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device, CFG4)
    prediction = prediction.reshape((len(test), CFG4.max_len))
    # Rows were predicted in length-sorted order; argsort of that permutation
    # restores the original row order of `test`.
    prediction = prediction[np.argsort(length_sorted_idx)]
    char_probs = get_char_probs(test['pn_history'].values, prediction, CFG4.tokenizer)
    predictions_v2_xxlarge.append(char_probs)
    # Release this fold's objects before the next checkpoint is loaded.
    del model, state, prediction, char_probs
    gc.collect()
    torch.cuda.empty_cache()

# Average the per-fold character probabilities.
predictions_v2_xxlarge = np.mean(predictions_v2_xxlarge, axis=0)

In [None]:
###### Reduce Padding Inference ######

# Token count of every (pn_history, feature_text) pair under the CFG tokenizer.
input_lengths = []
tk0 = tqdm(zip(test['pn_history'].fillna("").values, test['feature_text'].fillna("").values), total=len(test))
for text, feature_text in tk0:
    length = len(CFG.tokenizer(text, feature_text, add_special_tokens=True)['input_ids'])
    input_lengths.append(length)
test['input_lengths'] = input_lengths
# Negated lengths make argsort return the longest rows first.
length_sorted_idx = np.argsort([-len_ for len_ in input_lengths])

# sort dataframe; .copy() gives an independent frame so the column added below
# is not a chained assignment on a selection of `test`
sort_df = test.iloc[length_sorted_idx].copy()

# calc max_len per batch
sorted_input_length = sort_df['input_lengths'].values
batch_max_length = np.zeros_like(sorted_input_length)
bs = CFG.batch_size
# Iterate over real batch starts only. range(len // bs + 1) produced an empty
# trailing slice whenever the row count was a multiple of bs (or zero), and
# np.max raises ValueError on an empty array.
for batch_start in range(0, len(sorted_input_length), bs):
    batch_end = batch_start + bs
    batch_max_length[batch_start:batch_end] = np.max(sorted_input_length[batch_start:batch_end])
sort_df['batch_max_length'] = batch_max_length
# test = sort_df

In [None]:
# Loader over the length-sorted frame, in fixed order (shuffle=False), so the
# predictions can be mapped back to the original rows afterwards.
test_dataset = TestDataset(CFG, sort_df)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions_xlarge = []
for fold in CFG.trn_fold:
    model = CustomModel1(CFG, config_path=CFG.config_path, pretrained=False)

    # Deserialize onto the CPU. Without map_location, torch.load restores tensors
    # onto the device they were saved from, which fails on a host without that
    # device and can keep a second copy of the weights on the GPU while
    # inference_fn runs. inference_fn moves the model to `device` itself.
    state = torch.load(CFG.path+f"user_data-deberta-xlarge-pretrained-15epoch_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))

    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device,CFG)
    prediction = prediction.reshape((len(test), CFG.max_len))
    # Rows were predicted in length-sorted order; argsort of that permutation
    # restores the original row order of `test`.
    prediction = prediction[np.argsort(length_sorted_idx)]
    char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
    predictions_xlarge.append(char_probs)
    # Release this fold's objects before the next checkpoint is loaded.
    del model, state, prediction, char_probs
    gc.collect()
#     torch.cuda.empty_cache()

# Average the per-fold character probabilities.
predictions_xlarge = np.mean(predictions_xlarge, axis=0)
# predictions1_xlarge = np.mean(predictions1_xlarge, axis=0)

In [None]:

# Token count of every (pn_history, feature_text) pair under the CFG1 tokenizer.
input_lengths = []
tk0 = tqdm(zip(test['pn_history'].fillna("").values, test['feature_text'].fillna("").values), total=len(test))
for text, feature_text in tk0:
    length = len(CFG1.tokenizer(text, feature_text, add_special_tokens=True)['input_ids'])
    input_lengths.append(length)
test['input_lengths'] = input_lengths
# Negated lengths make argsort return the longest rows first.
length_sorted_idx = np.argsort([-len_ for len_ in input_lengths])

# sort dataframe; .copy() gives an independent frame so the column added below
# is not a chained assignment on a selection of `test`
sort_df = test.iloc[length_sorted_idx].copy()

# calc max_len per batch
sorted_input_length = sort_df['input_lengths'].values
batch_max_length = np.zeros_like(sorted_input_length)
bs = CFG1.batch_size
# Iterate over real batch starts only. range(len // bs + 1) produced an empty
# trailing slice whenever the row count was a multiple of bs (or zero), and
# np.max raises ValueError on an empty array.
for batch_start in range(0, len(sorted_input_length), bs):
    batch_end = batch_start + bs
    batch_max_length[batch_start:batch_end] = np.max(sorted_input_length[batch_start:batch_end])
sort_df['batch_max_length'] = batch_max_length

In [None]:
# Loader over the length-sorted frame, in fixed order (shuffle=False), so the
# predictions can be mapped back to the original rows afterwards.
test_dataset = TestDataset(CFG1, sort_df)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG1.batch_size,
                         shuffle=False,
                         num_workers=CFG1.num_workers, pin_memory=True, drop_last=False)
predictions_large = []
predictions1_large = []
for fold in CFG1.trn_fold:
    model = CustomModel1(CFG1, config_path=CFG1.config_path, pretrained=False)

    # Deserialize onto the CPU. Without map_location, torch.load restores tensors
    # onto the device they were saved from, which fails on a host without that
    # device and can keep a second copy of the weights on the GPU while
    # inference_fn runs. inference_fn moves the model to `device` itself.
    state = torch.load(CFG1.path+f"user_data-deberta-large-pretrained_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))

    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device,CFG1)
    prediction = prediction.reshape((len(test), CFG1.max_len))
    # Rows were predicted in length-sorted order; argsort of that permutation
    # restores the original row order of `test`.
    prediction = prediction[np.argsort(length_sorted_idx)]
    char_probs = get_char_probs(test['pn_history'].values, prediction, CFG1.tokenizer)
    predictions_large.append(char_probs)
    # Release this fold's objects before the next checkpoint is loaded.
    del model, state, prediction, char_probs
    gc.collect()
    torch.cuda.empty_cache()

# Average the per-fold character probabilities.
predictions_large = np.mean(predictions_large, axis=0)
# predictions1_large = np.mean(predictions1_large, axis=0)

In [None]:
# Token count of every (pn_history, feature_text) pair under the CFG2 tokenizer.
input_lengths = []
tk0 = tqdm(zip(test['pn_history'].fillna("").values, test['feature_text'].fillna("").values), total=len(test))
for text, feature_text in tk0:
    length = len(CFG2.tokenizer(text, feature_text, add_special_tokens=True)['input_ids'])
    input_lengths.append(length)
test['input_lengths'] = input_lengths
# Negated lengths make argsort return the longest rows first.
length_sorted_idx = np.argsort([-len_ for len_ in input_lengths])

# sort dataframe; .copy() gives an independent frame so the column added below
# is not a chained assignment on a selection of `test`
sort_df = test.iloc[length_sorted_idx].copy()

# calc max_len per batch
sorted_input_length = sort_df['input_lengths'].values
batch_max_length = np.zeros_like(sorted_input_length)
bs = CFG2.batch_size
# Iterate over real batch starts only. range(len // bs + 1) produced an empty
# trailing slice whenever the row count was a multiple of bs (or zero), and
# np.max raises ValueError on an empty array.
for batch_start in range(0, len(sorted_input_length), bs):
    batch_end = batch_start + bs
    batch_max_length[batch_start:batch_end] = np.max(sorted_input_length[batch_start:batch_end])
sort_df['batch_max_length'] = batch_max_length

In [None]:
# Loader over the length-sorted frame, in fixed order (shuffle=False), so the
# predictions can be mapped back to the original rows afterwards.
test_dataset = TestDataset(CFG2, sort_df)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG2.batch_size,
                         shuffle=False,
                         num_workers=CFG2.num_workers, pin_memory=True, drop_last=False)
predictions_v3 = []
predictions1_v3 = []
for fold in CFG2.trn_fold:
    model = CustomModel1(CFG2, config_path=CFG2.config_path, pretrained=False)

    # Deserialize onto the CPU. Without map_location, torch.load restores tensors
    # onto the device they were saved from, which fails on a host without that
    # device and can keep a second copy of the weights on the GPU while
    # inference_fn runs. inference_fn moves the model to `device` itself.
    state = torch.load(CFG2.path+f"deberta-v3-large-pretrained_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))

    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device, CFG2)
    prediction = prediction.reshape((len(test), CFG2.max_len))
    # Rows were predicted in length-sorted order; argsort of that permutation
    # restores the original row order of `test`.
    prediction = prediction[np.argsort(length_sorted_idx)]
    char_probs = get_char_probs(test['pn_history'].values, prediction, CFG2.tokenizer)
    predictions_v3.append(char_probs)
    # Release this fold's objects before the next checkpoint is loaded.
    del model, state, prediction, char_probs
    gc.collect()
    torch.cuda.empty_cache()

# Average the per-fold character probabilities.
predictions_v3 = np.mean(predictions_v3, axis=0)
# predictions1_v3 = np.mean(predictions1_v3, axis=0)

In [None]:
# sort by token num
# input_lengths = []
# tk0 = tqdm(zip(test['pn_history'].fillna("").values, test['feature_text'].fillna("").values), total=len(test))
# for text, feature_text in tk0:
#     length = len(CFG5.tokenizer(text, feature_text, add_special_tokens=True)['input_ids'])
#     input_lengths.append(length)
# test['input_lengths'] = input_lengths
# length_sorted_idx = np.argsort([-len_ for len_ in input_lengths])

# # sort dataframe
# sort_df = test.iloc[length_sorted_idx]

# # calc max_len per batch
# sorted_input_length = sort_df['input_lengths'].values
# batch_max_length = np.zeros_like(sorted_input_length)
# bs = CFG5.batch_size
# for i in range((len(sorted_input_length)//bs)+1):
#     batch_max_length[i*bs:(i+1)*bs] = np.max(sorted_input_length[i*bs:(i+1)*bs])    
# sort_df['batch_max_length'] = batch_max_length
# test_dataset = TestDataset(CFG5, sort_df)
# test_loader = DataLoader(test_dataset,
#                          batch_size=CFG5.batch_size,
#                          shuffle=False,
#                          num_workers=CFG5.num_workers, pin_memory=True, drop_last=False)
# predictions_v3_base = []
# # predictions1_v3 = []
# for fold in CFG5.trn_fold:
#     model = CustomModel1(CFG5, config_path=CFG5.config_path, pretrained=False)
    
#     state = torch.load(CFG5.path+f"user_data-deberta-v3-base-pretrained_fold{fold}_best.pth")
       
#     model.load_state_dict(state['model'])
#     prediction = inference_fn(test_loader, model, device, CFG5)
#     prediction = prediction.reshape((len(test), CFG5.max_len))
#     prediction = prediction[np.argsort(length_sorted_idx)]
#     char_probs = get_char_probs(test['pn_history'].values, prediction, CFG5.tokenizer)
#     predictions_v3_base.append(char_probs)
# #     predictions1_v3.append(prediction)
#     del model, state, prediction, char_probs
#     gc.collect()
#     torch.cuda.empty_cache()
    
# predictions_v3_base = np.mean(predictions_v3_base, axis=0)

In [None]:
# Token count of every (pn_history, feature_text) pair under the CFG3 tokenizer.
input_lengths = []
tk0 = tqdm(zip(test['pn_history'].fillna("").values, test['feature_text'].fillna("").values), total=len(test))
for text, feature_text in tk0:
    length = len(CFG3.tokenizer(text, feature_text, add_special_tokens=True)['input_ids'])
    input_lengths.append(length)
test['input_lengths'] = input_lengths
# Negated lengths make argsort return the longest rows first.
length_sorted_idx = np.argsort([-len_ for len_ in input_lengths])

# sort dataframe; .copy() gives an independent frame so the column added below
# is not a chained assignment on a selection of `test`
sort_df = test.iloc[length_sorted_idx].copy()

# calc max_len per batch
sorted_input_length = sort_df['input_lengths'].values
batch_max_length = np.zeros_like(sorted_input_length)
bs = CFG3.batch_size
# Iterate over real batch starts only. range(len // bs + 1) produced an empty
# trailing slice whenever the row count was a multiple of bs (or zero), and
# np.max raises ValueError on an empty array.
for batch_start in range(0, len(sorted_input_length), bs):
    batch_end = batch_start + bs
    batch_max_length[batch_start:batch_end] = np.max(sorted_input_length[batch_start:batch_end])
sort_df['batch_max_length'] = batch_max_length

In [None]:
# Loader over the length-sorted frame, in fixed order (shuffle=False), so the
# predictions can be mapped back to the original rows afterwards.
test_dataset = TestDataset(CFG3, sort_df)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG3.batch_size,
                         shuffle=False,
                         num_workers=CFG3.num_workers, pin_memory=True, drop_last=False)
predictions_v2 = []
for fold in CFG3.trn_fold:
    model = CustomModel1(CFG3, config_path=CFG3.config_path, pretrained=False)

    # Deserialize onto the CPU. Without map_location, torch.load restores tensors
    # onto the device they were saved from, which fails on a host without that
    # device and can keep a second copy of the weights on the GPU while
    # inference_fn runs. inference_fn moves the model to `device` itself.
    state = torch.load(CFG3.path+f"user_data-deberta-v2-xlarge-pretrained-20epoch_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))

    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device, CFG3)
    prediction = prediction.reshape((len(test), CFG3.max_len))
    # Rows were predicted in length-sorted order; argsort of that permutation
    # restores the original row order of `test`.
    prediction = prediction[np.argsort(length_sorted_idx)]
    char_probs = get_char_probs(test['pn_history'].values, prediction, CFG3.tokenizer)
    predictions_v2.append(char_probs)
    # Release this fold's objects before the next checkpoint is loaded.
    del model, state, prediction, char_probs
    gc.collect()
    torch.cuda.empty_cache()

# Average the per-fold character probabilities.
predictions_v2 = np.mean(predictions_v2, axis=0)
# predictions1_v3 = np.mean(predictions1_v3, axis=0)

In [None]:
# Equal-weight average of the five models' per-row character probabilities.
# Start from a copy: binding `predictions` directly to `predictions_large` made
# the loop overwrite that model's outputs, so re-running the cell averaged
# already-averaged values.
predictions = predictions_large.copy()
for i in range(len(predictions)):
    predictions[i] = (predictions_large[i]+ predictions_xlarge[i] + predictions_v3[i]  + predictions_v2[i] + predictions_v2_xxlarge[i])/5

In [None]:
def count_n_space(text):
    """Return 1 if ``text`` starts with a space character, otherwise 0.

    Despite the name, only the first character is inspected, so the result is
    never greater than 1. An empty string returns 0 instead of raising
    ``IndexError``.
    """
    # Slicing (rather than indexing) keeps the empty-string case safe.
    return 1 if text[:1] == ' ' else 0



def get_results(char_probs, texts,th=0.5):
    """Turn per-character probabilities into ``"start end;start end"`` span strings.

    For each entry of ``char_probs``, the indices whose probability is at least
    ``th`` are grouped into runs of consecutive positions. Each run becomes
    ``"{start} {end + 1}"``, where ``start`` is advanced by one when the covered
    slice of the matching text begins with a space (see ``count_n_space``).
    The spans of one entry are joined with ``";"``.

    Args:
        char_probs: iterable of per-character probability arrays.
        texts: indexable collection of texts, aligned with ``char_probs``.
        th: inclusive probability threshold.

    Returns:
        A list with one span string per entry of ``char_probs``; an entry with
        no position at or above ``th`` gives an empty string.
    """
    results = []
    for idx, char_prob in enumerate(char_probs):
        text = texts[idx]
        hits = np.where(char_prob >= th)[0]
        spans = []
        # Within a run of consecutive indices, (value - position) is constant.
        for _, run in itertools.groupby(enumerate(hits), key=lambda pair: pair[1] - pair[0]):
            members = [pos for _, pos in run]
            # np.where yields ascending indices, so the run's ends are its min/max.
            start, end = members[0], members[-1]
            start = start + count_n_space(text[start:end+1])
            spans.append(f"{start} {end+1}")
        results.append(";".join(spans))
    return results

# Submission

In [None]:
# Decode the ensembled character probabilities into location strings (threshold 0.48).
results = convert_offsets_to_word_indices(predictions, test['pn_history'].values, test['case_num'].values,test['feature_num'].values,th=0.48)
# Alternative decoder: results = get_results(predictions,test['pn_history'].values,th=0.48)

# A prediction whose text begins with "1 " has that leading "1" replaced by "0".
temp = [('0' + pred[1:]) if pred[:2] == '1 ' else pred for pred in results]
results = temp

submission['location'] = results
display(submission.head())
submission[['id', 'location']].to_csv('submission.csv', index=False)