# points for contrivance

To get the xlarge model to converge, I used [this](https://github.com/antmachineintelligence/Feedback_1st/blob/a8d201a7d91e967f3df508434f7969ca9f59d0d6/utils/models.py#L23) as a reference.<br>
I used [this](https://www.kaggle.com/code/sergeichudov/8th-place-inference-notebook/notebook?scriptVersionId=90185474) as a reference and tried a more fashionable approach when training v3 large. (This isn't effective for v1 large...)


In [None]:
# Blend weights for the model groups of the ensemble; the four values add up to 1.0.
w1 = 0.50  # xlarge-mnli
w2 = 0.45  # v3 large
w3 = 0.02  # v1 large (0.1 in runs so far)
w4 = 0.03  # fourth model group -- presumably deberta-large-mnli; TODO confirm where the blend is computed

In [None]:
# Swap the patched tokenizer sources into the installed `transformers` package so the
# fast tokenizer can be used for DeBERTa v2 / v3.
# This has to run before `transformers` is imported.
import shutil
from pathlib import Path

input_dir = Path("../input/deberta-v2-3-fast-tokenizer")
transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")

# 1) The slow -> fast conversion module lives at the package root.
convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name
if conversion_path.exists():
    conversion_path.unlink()
shutil.copy(convert_file, transformers_path)

# 2) Both DeBERTa v2 tokenizer modules live under models/deberta_v2.
deberta_v2_path = transformers_path / "models" / "deberta_v2"
for filename in ('tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py'):
    filepath = deberta_v2_path / filename
    if filepath.exists():
        filepath.unlink()
    shutil.copy(input_dir / filename, filepath)

In [None]:
import os
import gc
import ast
import sys
import copy
import json
import math
import string
import pickle
import random
import itertools
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
%env TOKENIZERS_PARALLELISM=true

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, sets ``PYTHONHASHSEED``, and seeds NumPy and
    torch. When CUDA is available the CUDA generators are seeded as well and
    cuDNN is switched to deterministic mode with benchmarking turned off.

    Args:
        seed (int): Value used for all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # The cuDNN backend needs these two switches for repeatable results.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

In [None]:
def micro_f1(preds, truths):
    """
    Micro F1 over binary arrays, pooled across all instances.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # "Micro" here means every instance is flattened into one long vector before scoring.
    flat_preds = np.concatenate(preds)
    flat_truths = np.concatenate(truths)
    score = f1_score(flat_truths, flat_preds)

    return score


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): ``[start, end]`` pairs; positions
            ``start`` .. ``end - 1`` are marked.
        length (int, optional): Size of the returned array. Defaults to the
            largest value found in ``spans`` (0 when ``spans`` is empty).

    Returns:
        np array [length]: Binarized spans (1.0 inside a span, 0.0 elsewhere).
    """
    if length is None:
        # np.max raises on an empty sequence, so an empty span list maps to an empty mask.
        length = int(np.max(spans)) if len(spans) else 0

    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1

    return binary


def span_micro_f1(preds, truths):
    """
    Micro F1 computed on character spans.

    Each (prediction, truth) pair is binarized to a common length (the largest
    span boundary found in either) and all pairs are then scored together.
    Pairs where both sides are empty are left out.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds, bin_truths = [], []
    for pred, truth in zip(preds, truths):
        has_pred = len(pred) > 0
        has_truth = len(truth) > 0
        if not (has_pred or has_truth):
            continue

        pred_end = np.max(pred) if has_pred else 0
        truth_end = np.max(truth) if has_truth else 0
        length = max(pred_end, truth_end)

        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))

    return micro_f1(bin_preds, bin_truths)

def create_labels_for_scoring(df):
    """Build ground-truth character spans from the ``location`` column of ``df``.

    Side effect: adds (or overwrites) the column ``location_for_create_labels``
    on ``df``. Rows are addressed with ``df.loc[i, ...]`` for ``i`` in
    ``range(len(df))``, so the frame presumably carries a default 0..n-1 index
    -- verify against callers.

    Args:
        df (pd.DataFrame): Frame with a ``location`` column. Each non-empty cell
            is joined with ``';'``, so it is presumably a list of
            ``"start end"`` strings -- TODO confirm.

    Returns:
        list of lists of [int, int]: One list of ``[start, end]`` pairs per row
        (an empty list for rows without locations).
    """
    # example: ['0 1', '3 4'] -> ['0 1;3 4']
    # Every row starts out with an empty list.
    df['location_for_create_labels'] = [ast.literal_eval(f'[]')] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, 'location']
        if lst:
            # Collapse the row's location strings into one ';'-separated string.
            new_lst = ';'.join(lst)
            # NOTE(review): assigning [["..."]] through .loc relies on pandas storing the
            # inner one-element list in the cell -- confirm on the pandas version in use.
            df.loc[i, 'location_for_create_labels'] = ast.literal_eval(f'[["{new_lst}"]]')
    # create labels
    truths = []
    for location_list in df['location_for_create_labels'].values:
        truth = []
        if len(location_list) > 0:
            # The single element is the ';'-joined string built above.
            location = location_list[0]
            for loc in [s.split() for s in location.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
        
    return truths


def get_char_probs(texts, predictions, tokenizer):
    """Spread per-token predictions over the characters each token covers.

    Args:
        texts (sequence of str): Texts to tokenize.
        predictions (sequence): For each text, one value per token position.
        tokenizer (callable): Called as ``tokenizer(text, add_special_tokens=True,
            return_offsets_mapping=True)``; its ``'offset_mapping'`` entry supplies
            the ``(start, end)`` character range of every token.

    Returns:
        list of np arrays: One array per text, of length ``len(text)``. Characters
        not covered by any token keep the value 0.
    """
    results = [np.zeros(len(t)) for t in texts]
    for char_prob, text, token_preds in zip(results, texts, predictions):
        offsets = tokenizer(text,
                            add_special_tokens=True,
                            return_offsets_mapping=True)['offset_mapping']
        # zip stops at the shorter of the two, so extra padded positions are ignored.
        for span, pred in zip(offsets, token_preds):
            start, end = span[0], span[1]
            char_prob[start:end] = pred

    return results


def get_results(char_probs, th=0.5):
    """Turn per-character probabilities into ``"start end;start end"`` strings.

    For every array, the indices whose value is ``>= th`` are shifted by one,
    grouped into runs of consecutive numbers, and each run is written as
    ``"<first> <last>"``; runs are joined with ``';'``.

    Args:
        char_probs (iterable of np arrays): Per-character probabilities.
        th (float): Threshold a character must reach to be selected.

    Returns:
        list of str: One string per input array (empty when nothing is selected).
    """
    results = []
    for char_prob in char_probs:
        positions = np.where(char_prob >= th)[0] + 1

        # Collect runs of consecutive positions; `positions` is already ascending.
        runs = []
        for pos in positions:
            if runs and pos == runs[-1][-1] + 1:
                runs[-1].append(pos)
            else:
                runs.append([pos])

        results.append(";".join(f"{run[0]} {run[-1]}" for run in runs))

    return results


def get_predictions(results):
    """Parse ``"start end;start end"`` strings into lists of ``[start, end]`` pairs.

    Args:
        results (iterable of str): One string per sample; ``""`` means no span.

    Returns:
        list of lists of [int, int]: Parsed spans, one list per input string.
    """
    predictions = []
    for result in results:
        if result == "":
            predictions.append([])
            continue

        spans = []
        for chunk in result.split(';'):
            fields = chunk.split()
            spans.append([int(fields[0]), int(fields[1])])
        predictions.append(spans)

    return predictions


def get_score(y_true, y_pred):
    """Score spans with ``span_micro_f1``.

    Thin wrapper: ``y_true`` is forwarded as the first argument of
    ``span_micro_f1`` and ``y_pred`` as the second.
    """
    score = span_micro_f1(y_true, y_pred)
    return score

In [None]:
# Directory holding the competition CSV files.
main_dir = "../input/nbme-score-clinical-patient-notes/"


def preprocess_features(features):
    """Overwrite one feature text in place and hand the frame back.

    The ``feature_text`` cell at row label 27 is set to
    "Last-Pap-smear-1-year-ago".

    Args:
        features (pd.DataFrame): Features table; modified in place.

    Returns:
        pd.DataFrame: The same frame object that was passed in.
    """
    row_label, column = 27, 'feature_text'
    features.loc[row_label, column] = "Last-Pap-smear-1-year-ago"
    return features


# Read the competition tables from the input directory.
test = pd.read_csv(main_dir + 'test.csv')
submission = pd.read_csv(main_dir + 'sample_submission.csv')
patient_notes = pd.read_csv(main_dir + 'patient_notes.csv')
# The features table gets its feature-text patch applied right after loading.
features = preprocess_features(pd.read_csv(main_dir + 'features.csv'))

# Shape and first rows of each table, for a quick visual check.
print(f"test.shape: {test.shape}")
display(test.head())

print(f"features.shape: {features.shape}")
display(features.head())

print(f"patient_notes.shape: {patient_notes.shape}")
display(patient_notes.head())

In [None]:
# Attach the feature columns and then the patient-note columns to every test row.
# Left joins keep all rows of `test`.
test = (
    test
    .merge(features, on=['feature_num', 'case_num'], how='left')
    .merge(patient_notes, on=['pn_num', 'case_num'], how='left')
)
display(test.head())

# xlarge mnli

In [None]:
class CFG:
    # Settings for the deberta-xlarge-mnli section.
    # -- model / checkpoint location --
    model = "microsoft/deberta-xlarge-mnli"
    path = "/kaggle/input/deberta-xlarge-mnli/"
    config_path = path + 'config.pth'
    # -- data loading --
    num_workers = 4
    batch_size = 24
    max_len = 466
    # -- head / misc --
    fc_dropout = 0.2
    seed = 42
    # -- folds --
    n_fold = 3
    trn_fold = [0, 1, 2, 3, 4]  # an earlier selection was [0, 3, 4]

In [None]:
from transformers.models.deberta.tokenization_deberta_fast import DebertaTokenizerFast

# Load the tokenizer saved with the xlarge-mnli checkpoints and expose it both on CFG
# and as the module-level name `tokenizer`.
CFG.tokenizer = tokenizer = DebertaTokenizerFast.from_pretrained('/kaggle/input/deberta-xlarge-mnli/tokenizer')

In [None]:
# this is conventional
def prepare_input(cfg, text, feature_text):
    """Tokenize a (text, feature_text) pair padded to ``cfg.max_len``.

    The padding length is read from the ``cfg`` argument rather than from the
    module-level ``CFG``, so a dataset keeps using the configuration it was
    built with even after ``CFG`` is redefined further down the notebook.

    Args:
        cfg: Configuration object providing ``tokenizer`` and ``max_len``.
        text (str): First sequence passed to the tokenizer.
        feature_text (str): Second sequence passed to the tokenizer.

    Returns:
        The tokenizer output with every value converted to a ``torch.long`` tensor.
    """
    inputs = cfg.tokenizer(text, feature_text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    # Convert in place so the container type returned by the tokenizer is preserved.
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)

    return inputs


class TestDataset(Dataset):
    """Test-time dataset that tokenizes one (patient note, feature text) pair per item.

    Items are produced by ``prepare_input`` using the stored ``cfg``.
    """

    def __init__(self, cfg, df):
        # Keep plain arrays so indexing in __getitem__ is by position.
        self.cfg = cfg
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values

    def __len__(self):
        return len(self.feature_texts)

    def __getitem__(self, item):
        note = self.pn_historys[item]
        feature = self.feature_texts[item]
        return prepare_input(self.cfg, note, feature)

In [None]:
# ====================================================
# Dataset Faster
# ====================================================
def prepare_input_fast(cfg, text, feature_text, batch_max_len):
    """Tokenize a (text, feature_text) pair padded to ``batch_max_len``.

    Args:
        cfg: Configuration object providing ``tokenizer``.
        text (str): First sequence passed to the tokenizer.
        feature_text (str): Second sequence passed to the tokenizer.
        batch_max_len (int): Padding length, forwarded as ``max_length``.

    Returns:
        The tokenizer output with every value converted to a ``torch.long`` tensor.
    """
    encoded = cfg.tokenizer(
        text,
        feature_text,
        add_special_tokens=True,
        max_length=batch_max_len,
        padding="max_length",
        return_offsets_mapping=False,
    )
    # Replace each entry in place with its tensor form.
    for key in list(encoded.keys()):
        encoded[key] = torch.tensor(encoded[key], dtype=torch.long)
    return encoded


class TestDatasetFast(Dataset):
    """Test-time dataset whose items are padded to a per-row length.

    The padding length of each row comes from the ``batch_max_length`` column
    of the frame; items are produced by ``prepare_input_fast``.
    """

    def __init__(self, cfg, df):
        # Keep plain arrays so indexing in __getitem__ is by position.
        self.cfg = cfg
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values
        self.batch_max_len = df['batch_max_length'].values

    def __len__(self):
        return len(self.feature_texts)

    def __getitem__(self, item):
        note = self.pn_historys[item]
        feature = self.feature_texts[item]
        pad_to = self.batch_max_len[item]
        return prepare_input_fast(self.cfg, note, feature, pad_to)

In [None]:
class ScoringModel(nn.Module):
    """Transformer backbone followed by dropout and a one-unit linear head.

    Gradients are disabled for the backbone's embeddings and for its first
    12 encoder layers, whichever way the backbone is constructed.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        # Config: load a saved one when a path is given, otherwise fetch it by model name.
        if config_path is not None:
            self.config = torch.load(config_path)
        else:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)

        # Backbone: pretrained weights, or an uninitialised model to be filled from a checkpoint.
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        self.model.embeddings.requires_grad_(False)
        self.model.encoder.layer[:12].requires_grad_(False)

        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm module; other types are left untouched."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and return its first output (the last hidden states)."""
        return self.model(**inputs)[0]

    def forward(self, inputs):
        """Apply dropout and the linear head to the backbone features."""
        hidden = self.feature(inputs)
        return self.fc(self.fc_dropout(hidden))

In [None]:
# this is conventional

def inference_fn(test_loader, model, device):
    """Run the model over every batch of the loader and return sigmoid outputs.

    Args:
        test_loader: Iterable of batches; each batch maps input names to tensors.
        model: Module called as ``model(inputs)``; put in eval mode and moved to ``device``.
        device: Device the model and the batch tensors are moved to.

    Returns:
        np.ndarray: Sigmoid of the model outputs, concatenated over batches.
    """
    model.eval()
    model.to(device)

    batch_probs = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        for name, tensor in inputs.items():
            inputs[name] = tensor.to(device)
        with torch.no_grad():
            logits = model(inputs)
        batch_probs.append(logits.sigmoid().to('cpu').numpy())

    return np.concatenate(batch_probs)

In [None]:
# # this is conventional

# test_dataset = TestDataset(CFG, test)
# test_loader = DataLoader(test_dataset,
#                          batch_size=CFG.batch_size,
#                          shuffle=False,
#                          num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
# predictions = []
# for fold in CFG.trn_fold:
#     model = ScoringModel(CFG, config_path=CFG.config_path, pretrained=False)
    
#     state = torch.load(CFG.path+f"microsoft-{CFG.model.split('/')[1]}_fold{fold}_best.pth",
#                            map_location=torch.device('cpu'))
       
#     model.load_state_dict(state['model'])
#     prediction = inference_fn(test_loader, model, device)
#     prediction = prediction.reshape((len(test), CFG.max_len))
#     char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
#     predictions.append(char_probs)
#     del model, state, prediction, char_probs
#     gc.collect()
#     torch.cuda.empty_cache()
    
# predictions_xlarge_mnli = np.mean(predictions, axis=0)

In [None]:
def inference_fn_fast(test_loader, model, device):
    """Run the model over variable-length batches and zero-pad outputs to a fixed width.

    Each batch's sigmoid output is written into the leading positions of a
    zero array of shape ``(batch, CFG.max_len, 1)``. ``CFG`` is the
    module-level configuration, looked up when the function is called.

    Args:
        test_loader: Iterable of batches; each batch maps input names to tensors
            and contains ``'input_ids'``.
        model: Module called as ``model(inputs)``; put in eval mode and moved to ``device``.
        device: Device the model and the batch tensors are moved to.

    Returns:
        np.ndarray: Padded sigmoid outputs, concatenated over batches.
    """
    model.eval()
    model.to(device)

    padded_batches = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        n_rows = len(inputs['input_ids'])
        for name, tensor in inputs.items():
            inputs[name] = tensor.to(device)
        with torch.no_grad():
            logits = model(inputs)
        probs = logits.sigmoid().to('cpu').numpy()

        # Positions beyond this batch's own length stay at zero.
        padded = np.zeros((n_rows, CFG.max_len, 1))
        padded[:, :probs.shape[1]] = probs
        padded_batches.append(padded)

    return np.concatenate(padded_batches)

In [None]:
###### Reduce Padding Inference ######
# Sort the test rows by token count (longest first) and record, for every row, the
# longest token count inside its batch, so each batch is padded only as far as needed.

# sort by token num
input_lengths = []
tk0 = tqdm(zip(test['pn_history'].fillna("").values, test['feature_text'].fillna("").values), total=len(test))
for text, feature_text in tk0:
    length = len(CFG.tokenizer(text, feature_text, add_special_tokens=True)['input_ids'])
    input_lengths.append(length)
test['input_lengths'] = input_lengths
length_sorted_idx = np.argsort([-len_ for len_ in input_lengths])

# sort dataframe (explicit copy, so the column added below is not written into a slice of `test`)
sort_df = test.iloc[length_sorted_idx].copy()

# calc max_len per batch
sorted_input_length = sort_df['input_lengths'].values
batch_max_length = np.zeros_like(sorted_input_length)
bs = CFG.batch_size
# Walk over the batch start offsets: this never produces an empty trailing slice, which
# np.max rejects, when the number of rows is an exact multiple of the batch size (or zero).
for start in range(0, len(sorted_input_length), bs):
    batch_max_length[start:start + bs] = np.max(sorted_input_length[start:start + bs])
sort_df['batch_max_length'] = batch_max_length

In [None]:
# Reduced-padding inference for the xlarge-mnli folds.

test_dataset = TestDatasetFast(CFG, sort_df)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    shuffle=False,
    num_workers=CFG.num_workers,
    pin_memory=True,
    drop_last=False,
)

predictions = []
for fold in CFG.trn_fold:
    model = ScoringModel(CFG, config_path=CFG.config_path, pretrained=False)

    # Checkpoints are loaded onto the CPU first; inference_fn_fast moves the model to `device`.
    state = torch.load(CFG.path+f"microsoft-{CFG.model.split('/')[1]}_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])

    # One row of CFG.max_len token probabilities per (length-sorted) sample.
    prediction = inference_fn_fast(test_loader, model, device).reshape((len(test), CFG.max_len))
    # Undo the length sort so rows line up with `test` again.
    prediction = prediction[np.argsort(length_sorted_idx)]

    char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
    predictions.append(char_probs)

    # Free the fold's model before the next one is loaded.
    del model, state, prediction, char_probs
    gc.collect()
    torch.cuda.empty_cache()

# Average the per-character probabilities over the folds.
predictions_xlarge_mnli = np.mean(predictions, axis=0)

# deberta v3 large

In [None]:
!ls /kaggle/input/v3large-lastlastlast/

In [None]:
class CFG:
    # Settings for the deberta-v3-large section.
    # -- model --
    model = "microsoft/deberta-v3-large"
    # -- checkpoint directories, each with its own saved config --
    path = "/kaggle/input/rouhi-original/"
    config_path = path + 'config.pth'
    path2 = "/kaggle/input/v3large-conv1d-fold0-2/"
    config_path2 = path2 + 'config.pth'
    # (path3 previously pointed at '/kaggle/input/v3large-lastlastlast/')
    path3 = '/kaggle/input/v3-large-fold7-lastlastlast/'
    config_path3 = path3 + 'config.pth'
    path4 = '/kaggle/input/v3large-lastlastlast/'
    config_path4 = path4 + 'config.pth'
    # -- data loading --
    num_workers = 4
    batch_size = 24
    max_len = 354
    # -- head / misc --
    fc_dropout = 0.2
    seed = 42
    # -- folds --
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    trn_fold2 = [0, 1, 2, 3, 4, 5, 6]

In [None]:
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast

# One tokenizer per checkpoint directory: CFG.tokenizer <- path, CFG.tokenizer2 <- path3,
# CFG.tokenizer3 <- path4.
for _attr, _base_dir in (("tokenizer", CFG.path), ("tokenizer2", CFG.path3), ("tokenizer3", CFG.path4)):
    setattr(CFG, _attr, DebertaV2TokenizerFast.from_pretrained(_base_dir + 'tokenizer/'))


In [None]:
class CustomModel(nn.Module):
    """Transformer backbone followed by dropout and a one-unit linear head."""

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        # Config: load a saved one when a path is given, otherwise fetch it by model name.
        if config_path is not None:
            self.config = torch.load(config_path)
        else:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)

        # Backbone: pretrained weights, or an uninitialised model to be filled from a checkpoint.
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)

        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm module; other types are left untouched."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and return its first output (the last hidden states)."""
        return self.model(**inputs)[0]

    def forward(self, inputs):
        """Apply dropout and the linear head to the backbone features."""
        hidden = self.feature(inputs)
        return self.fc(self.fc_dropout(hidden))

In [None]:
class CustomModel_V3LargeConv1d(nn.Module):
    """Transformer backbone with three parallel Conv1d branches and a LayerNorm + Linear head.

    The backbone's hidden states go through Conv1d layers with kernel sizes
    1, 3 and 5 (each followed by ReLU); the three results are concatenated
    along the channel axis, passed through dropout, and mapped to one value
    per position by ``LayerNorm -> Linear``.

    The Conv1d layers are built with 1024 channels, so the backbone's
    ``hidden_size`` has to be 1024 for the head dimensions to line up.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        self.conv1d_layer1 = nn.Conv1d(1024, 1024, kernel_size = 1)
        self.conv1d_layer3 = nn.Conv1d(1024, 1024, kernel_size = 3, padding = 1)
        self.conv1d_layer5 = nn.Conv1d(1024, 1024, kernel_size = 5, padding = 2)
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        # The pooler is dropped on both construction paths.
        self.model.pooler = None

        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Sequential(
            nn.LayerNorm(self.config.hidden_size*3),
            nn.Linear(self.config.hidden_size*3, 1) #hidden_size = 1024
        )
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise a module, descending into containers.

        Linear, Embedding and LayerNorm modules are initialised directly. Any
        other module has its children initialised recursively, so passing a
        container such as ``nn.Sequential`` reaches the layers inside it
        instead of silently doing nothing.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        else:
            for child in module.children():
                self._init_weights(child)

    def feature(self, inputs):
        """Run the backbone and return its first output (the last hidden states)."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs):
        """Return one value per position from the convolutional head."""
        feature = self.feature(inputs)  # e.g. torch.Size([4, 354, 1024])
        # Conv1d expects channels on axis 1.
        conv_input = feature.transpose(1,2)  # e.g. torch.Size([4, 1024, 354])
        conv_output1 = F.relu(self.conv1d_layer1(conv_input))
        conv_output3 = F.relu(self.conv1d_layer3(conv_input))
        conv_output5 = F.relu(self.conv1d_layer5(conv_input))

        # Stack the three branches on the channel axis, then move channels back to the last axis.
        concat_output = torch.cat((conv_output1, conv_output3, conv_output5), dim = 1).transpose(1,2)  # e.g. torch.Size([4, 354, 3072])

        concat_DR = self.fc_dropout(concat_output)
        output = self.fc(concat_DR)

        return output

In [None]:
###### Reduce Padding Inference ######
# Sort the test rows by token count (longest first) and record, for every row, the
# longest token count inside its batch, so each batch is padded only as far as needed.

# sort by token num
input_lengths = []
tk0 = tqdm(zip(test['pn_history'].fillna("").values, test['feature_text'].fillna("").values), total=len(test))
for text, feature_text in tk0:
    length = len(CFG.tokenizer(text, feature_text, add_special_tokens=True)['input_ids'])
    input_lengths.append(length)
test['input_lengths'] = input_lengths
length_sorted_idx = np.argsort([-len_ for len_ in input_lengths])

# sort dataframe (explicit copy, so the column added below is not written into a slice of `test`)
sort_df = test.iloc[length_sorted_idx].copy()

# calc max_len per batch
sorted_input_length = sort_df['input_lengths'].values
batch_max_length = np.zeros_like(sorted_input_length)
bs = CFG.batch_size
# Walk over the batch start offsets: this never produces an empty trailing slice, which
# np.max rejects, when the number of rows is an exact multiple of the batch size (or zero).
for start in range(0, len(sorted_input_length), bs):
    batch_max_length[start:start + bs] = np.max(sorted_input_length[start:start + bs])
sort_df['batch_max_length'] = batch_max_length

In [None]:
# # this is conventional

# test_dataset = TestDataset(CFG, test)
# test_loader = DataLoader(test_dataset,
#                          batch_size=CFG.batch_size,
#                          shuffle=False,
#                          num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
# predictions = []
# for fold in CFG.trn_fold:
#     model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    
#     state = torch.load(CFG.path+f"microsoft-{CFG.model.split('/')[1]}_fold{fold}_best.pth",
#                            map_location=torch.device('cpu'))
       
#     model.load_state_dict(state['model'])
#     prediction = inference_fn(test_loader, model, device)
#     prediction = prediction.reshape((len(test), CFG.max_len))
#     char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
#     predictions.append(char_probs)
#     del model, state, prediction, char_probs
#     gc.collect()
#     torch.cuda.empty_cache()

# for fold in CFG.trn_fold2:
#     model = CustomModel_V3LargeConv1d(CFG, config_path=CFG.config_path2, pretrained=False)
    
#     state = torch.load(CFG.path2+f"microsoft-{CFG.model.split('/')[1]}_fold{fold}_best.pth",
#                            map_location=torch.device('cpu'))
       
#     model.load_state_dict(state['model'])
#     prediction = inference_fn(test_loader, model, device)
#     prediction = prediction.reshape((len(test), CFG.max_len))
#     char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
#     predictions.append(char_probs)
#     del model, state, prediction, char_probs
#     gc.collect()
#     torch.cuda.empty_cache()
    
# predictions_v3_large_and_conv1d = np.mean(predictions, axis=0)

In [None]:
# this is faster

# test_dataset = TestDatasetFast(CFG, sort_df)
# test_loader = DataLoader(test_dataset,
#                          batch_size=CFG.batch_size,
#                          shuffle=False,
#                          num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
# predictions = []
# for fold in CFG.trn_fold:
#     if fold == 1 or fold == 3 or fold == 4:
#         model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
#         state = torch.load(CFG.path+f"microsoft-{CFG.model.split('/')[1]}_fold{fold}_best.pth",
#                                map_location=torch.device('cpu'))
#         model.load_state_dict(state['model'])
#     elif fold == 0 or fold == 2:
#         model = CustomModel_V3LargeConv1d(CFG, config_path=CFG.config_path2, pretrained=False)
#         state = torch.load(CFG.path2+f"microsoft-{CFG.model.split('/')[1]}_fold{fold}_best.pth",
#                            map_location=torch.device('cpu'))
#         model.load_state_dict(state['model'])
        
#     prediction = inference_fn_fast(test_loader, model, device)
#     prediction = prediction.reshape((len(test), CFG.max_len))
#     ## data re-sort ##
#     prediction = prediction[np.argsort(length_sorted_idx)]
#     ## data re-sort ##
#     char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
#     predictions.append(char_probs)
#     del model, state, prediction, char_probs
#     gc.collect()
#     torch.cuda.empty_cache()
    
    
# Build this section's own loader and result list. Without them the loops below would pick up
# `test_loader` and `predictions` left over from an earlier section: batches tokenized and
# padded for a different model, and a list that already holds that model's outputs.
test_dataset = TestDatasetFast(CFG, sort_df)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []

# Checkpoints stored under CFG.path3 (folds listed in CFG.trn_fold2), decoded with CFG.tokenizer2.
for fold in CFG.trn_fold2:
    model = CustomModel(CFG, config_path=CFG.config_path3, pretrained=False)
    state = torch.load(CFG.path3+f"microsoft-{CFG.model.split('/')[1]}_fold{fold}_best.pth",
                           map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])

    prediction = inference_fn_fast(test_loader, model, device)
    prediction = prediction.reshape((len(test), CFG.max_len))
    # Undo the length sort so rows line up with `test` again.
    prediction = prediction[np.argsort(length_sorted_idx)]
    char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer2)
    predictions.append(char_probs)
    # Free the fold's model before the next one is loaded.
    del model, state, prediction, char_probs
    gc.collect()
    torch.cuda.empty_cache()


# Checkpoints stored under CFG.path4 (folds listed in CFG.trn_fold), decoded with CFG.tokenizer3.
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path4, pretrained=False)
    state = torch.load(CFG.path4+f"microsoft-{CFG.model.split('/')[1]}_fold{fold}_best.pth",
                           map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])

    prediction = inference_fn_fast(test_loader, model, device)
    prediction = prediction.reshape((len(test), CFG.max_len))
    # Undo the length sort so rows line up with `test` again.
    prediction = prediction[np.argsort(length_sorted_idx)]
    char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer3)
    predictions.append(char_probs)
    # Free the fold's model before the next one is loaded.
    del model, state, prediction, char_probs
    gc.collect()
    torch.cuda.empty_cache()


# Average the per-character probabilities over all v3-large checkpoints of this section.
predictions_v3_large_and_conv1d = np.mean(predictions, axis=0)

# deberta-large-fold7 (GELU or not)

In [None]:
class CustomModelNormal(nn.Module):
    """Transformer backbone followed by dropout and a selectable head.

    With a truthy ``GELU`` the head is ``Linear(hidden_size, 256) -> GELU ->
    Linear(256, 1)``; otherwise it is a single ``Linear(hidden_size, 1)``.

    Args:
        cfg: Configuration object providing ``model`` and ``fc_dropout``.
        config_path (str, optional): Saved config to load with ``torch.load``;
            when omitted the config is fetched by model name.
        pretrained (bool): Load pretrained backbone weights instead of building
            the backbone from the config alone.
        GELU: Selects the two-layer GELU head when truthy.
    """

    def __init__(self, cfg, config_path=None, pretrained=False, GELU = None):
        super().__init__()
        self.cfg = cfg
        self.GELU = GELU
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)

        self.fc_dropout = nn.Dropout(cfg.fc_dropout)

        if self.GELU:
            self.fc = nn.Sequential(
                nn.Linear(self.config.hidden_size, 256), #hidden_size = 1024
                nn.GELU(),
                nn.Linear(256, 1)
            )
        else:
            self.fc = nn.Linear(self.config.hidden_size, 1)

        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise a module, descending into containers.

        Linear, Embedding and LayerNorm modules are initialised directly. Any
        other module has its children initialised recursively, so the
        ``nn.Sequential`` GELU head gets its Linear layers initialised instead
        of being skipped.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        else:
            for child in module.children():
                self._init_weights(child)

    def feature(self, inputs):
        """Run the backbone and return its first output (the last hidden states)."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs):
        """Apply dropout and the head to the backbone features."""
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))

        return output

In [None]:
class CFG:
    # Settings for the deberta-large fold-7 section (plain head and GELU head checkpoints).
    # -- checkpoints with the plain head --
    model0 = "microsoft/deberta-large"
    path0 = "/kaggle/input/deberta-large-series/deberta_large_series/deberta-large-fold7/"
    config_path0 = path0 + 'config.pth'
    # -- checkpoints with the GELU head --
    model1 = "microsoft/deberta-large"
    path1 = "/kaggle/input/deberta-large-series/deberta_large_series/deberta-large-fold7_GELU/"
    config_path1 = path1 + 'config.pth'
    # -- data loading --
    num_workers = 4
    batch_size = 24
    max_len = 466
    # -- head / misc --
    fc_dropout = 0.2
    seed = 42
    # -- folds --
    n_fold = 3
    trn_fold = [0, 3, 4]

In [None]:
# Tokenizer saved next to the checkpoints under CFG.path0.
CFG.tokenizer = DebertaTokenizerFast.from_pretrained(
    CFG.path0 + 'tokenizer/'
)

In [None]:
###### Reduce Padding Inference ######
# Sort the test rows by token count (longest first) and record, for every row, the
# longest token count inside its batch, so each batch is padded only as far as needed.

# sort by token num
input_lengths = []
tk0 = tqdm(zip(test['pn_history'].fillna("").values, test['feature_text'].fillna("").values), total=len(test))
for text, feature_text in tk0:
    length = len(CFG.tokenizer(text, feature_text, add_special_tokens=True)['input_ids'])
    input_lengths.append(length)
test['input_lengths'] = input_lengths
length_sorted_idx = np.argsort([-len_ for len_ in input_lengths])

# sort dataframe (explicit copy, so the column added below is not written into a slice of `test`)
sort_df = test.iloc[length_sorted_idx].copy()

# calc max_len per batch
sorted_input_length = sort_df['input_lengths'].values
batch_max_length = np.zeros_like(sorted_input_length)
bs = CFG.batch_size
# Walk over the batch start offsets: this never produces an empty trailing slice, which
# np.max rejects, when the number of rows is an exact multiple of the batch size (or zero).
for start in range(0, len(sorted_input_length), bs):
    batch_max_length[start:start + bs] = np.max(sorted_input_length[start:start + bs])
sort_df['batch_max_length'] = batch_max_length

In [None]:
# Reduced-padding inference for the deberta-large fold-7 checkpoints.
test_dataset = TestDatasetFast(CFG, sort_df)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    shuffle=False,
    num_workers=CFG.num_workers,
    pin_memory=True,
    drop_last=False,
)

predictions = []
for fold in CFG.trn_fold:
    if fold == 0:
        # Fold 0 comes from the plain-head checkpoints.
        model = CustomModelNormal(CFG, config_path=CFG.config_path0, pretrained=False, GELU=False)
        state = torch.load(CFG.path0+f"{CFG.model0.replace('/', '-')}_fold{fold}_best.pth",
                           map_location=torch.device('cpu'))
        model.load_state_dict(state['model'])
    elif fold in (3, 4):
        # Folds 3 and 4 come from the GELU-head checkpoints.
        model = CustomModelNormal(CFG, config_path=CFG.config_path1, pretrained=False, GELU=True)
        state = torch.load(CFG.path1+f"{CFG.model1.replace('/', '-')}_fold{fold}_best.pth",
                           map_location=torch.device('cpu'))
        model.load_state_dict(state['model'])

    # One row of CFG.max_len token probabilities per (length-sorted) sample.
    prediction = inference_fn_fast(test_loader, model, device).reshape((len(test), CFG.max_len))
    # Undo the length sort so rows line up with `test` again.
    prediction = prediction[np.argsort(length_sorted_idx)]

    char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
    predictions.append(char_probs)

    # Free the fold's model before the next one is loaded.
    del model, state, prediction, char_probs
    gc.collect()
    torch.cuda.empty_cache()

# Average the per-character probabilities over the folds.
predictions_deberta_large_fold7_gelu_or_not = np.mean(predictions, axis=0)

# deberta large mnli GELU fold7

In [None]:
!ls /kaggle/input/deberta-large-mnli-normal-fold1-2/

In [None]:
class CFG:
    # Settings for the microsoft/deberta-large-mnli inference section.

    # Model name; '/' is replaced with '-' when checkpoint file names are built.
    model = "microsoft/deberta-large-mnli"

    # Checkpoint directory used for fold 5 (loaded with GELU=True); the
    # tokenizer is also read from its 'tokenizer/' sub-folder.
    path = "/kaggle/input/deberta-large-series/deberta_large_series/deberta-large_mnli_fold7_GELU/"
    config_path = f"{path}config.pth"

    # Checkpoint directory used for folds 1 and 2 (loaded with GELU=False).
    path2 = '/kaggle/input/deberta-large-mnli-normal-fold1-2/'
    config_path2 = f"{path2}config.pth"

    # DataLoader settings.
    num_workers = 4
    batch_size = 24

    # Width each prediction row is reshaped to.
    max_len = 466

    fc_dropout = 0.2
    seed = 42

    # The author left 3 as a commented-out alternative value.
    n_fold = 1

    # Folds whose checkpoints are run; the author left [0, 3, 4] as a
    # commented-out alternative.
    trn_fold = [1, 2, 5]


In [None]:
# Load the fast DeBERTa tokenizer stored in the 'tokenizer/' folder under
# CFG.path and attach it to CFG so later cells can reach it as CFG.tokenizer.
CFG.tokenizer = DebertaTokenizerFast.from_pretrained(f"{CFG.path}tokenizer/")

In [None]:
###### Reduce Padding Inference ######
# Order the test rows from longest to shortest tokenized input and record, for
# every row, the longest input in its batch.

# sort by token num
tk0 = tqdm(zip(test['pn_history'].fillna("").values, test['feature_text'].fillna("").values), total=len(test))
input_lengths = [
    len(CFG.tokenizer(text, feature_text, add_special_tokens=True)['input_ids'])
    for text, feature_text in tk0
]
test['input_lengths'] = input_lengths
# Negate the lengths so that argsort yields a descending order.
length_sorted_idx = np.argsort([-len_ for len_ in input_lengths])

# sort dataframe
# The explicit copy makes sort_df an independent frame, so adding a column
# below does not write into a selection of `test`.
sort_df = test.iloc[length_sorted_idx].copy()

# calc max_len per batch
sorted_input_length = sort_df['input_lengths'].values
batch_max_length = np.zeros_like(sorted_input_length)
bs = CFG.batch_size
# Step over batch start offsets. Iterating len // bs + 1 times would produce
# an empty trailing slice whenever the row count is an exact multiple of bs
# (or zero), and np.max raises ValueError on an empty array.
for start in range(0, len(sorted_input_length), bs):
    stop = start + bs
    batch_max_length[start:stop] = np.max(sorted_input_length[start:stop])
sort_df['batch_max_length'] = batch_max_length

In [None]:
# Run inference with every deberta-large-mnli checkpoint listed in
# CFG.trn_fold on the length-sorted test frame and average the per-fold
# character probabilities.
test_dataset = TestDatasetFast(CFG, sort_df)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

# Fail fast on folds that have no checkpoint wired up below. Previously such a
# fold matched neither branch and fell through to use a `model` that had
# already been deleted.
supported_folds = (1, 2, 5)
unsupported_folds = [fold for fold in CFG.trn_fold if fold not in supported_folds]
if unsupported_folds:
    raise ValueError(
        f"No checkpoint is configured for fold(s) {unsupported_folds}; "
        f"supported folds are {supported_folds}"
    )

# sort_df is test.iloc[length_sorted_idx], so indexing with the argsort of that
# permutation puts predictions back into the original row order of `test`.
restore_order = np.argsort(length_sorted_idx)

predictions = []
for fold in CFG.trn_fold:
    # Fold 5 is loaded from CFG.path with GELU=True; folds 1 and 2 are loaded
    # from CFG.path2 with GELU=False.
    if fold == 5:
        ckpt_dir, config_path, use_gelu = CFG.path, CFG.config_path, True
    else:
        ckpt_dir, config_path, use_gelu = CFG.path2, CFG.config_path2, False

    model = CustomModelNormal(CFG, config_path=config_path, pretrained=False, GELU=use_gelu)
    state = torch.load(ckpt_dir + f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])

    prediction = inference_fn_fast(test_loader, model, device)
    # One row of CFG.max_len values per test example.
    prediction = prediction.reshape((len(test), CFG.max_len))
    ## data re-sort ##
    prediction = prediction[restore_order]
    char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
    predictions.append(char_probs)

    # Drop the per-fold objects before the next checkpoint is loaded.
    del model, state, prediction, char_probs
    gc.collect()
    torch.cuda.empty_cache()
predictions_deberta_large_mnli_fold7_gelu_or_not = np.mean(predictions, axis=0)

# ensemble

In [None]:
# Blend the four model groups example by example: each element is the weighted
# sum of the matching entries, using the weights w1..w4 set at the top of the
# notebook.
predictions = [
    w1*p1 + w2*p2 + w3*p3 + w4*p4
    for p1, p2, p3, p4 in zip(
        predictions_xlarge_mnli,
        predictions_v3_large_and_conv1d,
        predictions_deberta_large_fold7_gelu_or_not,
        predictions_deberta_large_mnli_fold7_gelu_or_not,
    )
]

In [None]:
# Turn the blended predictions into the values of the 'location' column via
# get_results (defined elsewhere in the notebook).
results = get_results(predictions)
submission['location'] = results

# Preview the first rows in the notebook output.
display(submission.head())

# Write only the id and location columns, without the index.
submission.to_csv('submission.csv', columns=['id', 'location'], index=False)