In [1]:
import re
import nltk
import wandb, ast
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim.swa_utils as swa
import tokenizers, transformers
import os, sys, gc, time, random, warnings, math, re, ast, pickle, glob

from transformers import AdamW
from torch.optim.swa_utils import AveragedModel, SWALR, update_bn
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
from functools import reduce
from log import _Logger
from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter

from torch import Tensor, inference_mode
from transformers import AutoTokenizer, AutoModel, AutoConfig
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold
from tqdm.auto import tqdm
from kaggle_secrets import UserSecretsClient
warnings.filterwarnings("ignore")
%env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [2]:
""" Test Configuration """

# CV: 
class CFG1:
    """ Token Classification Model Single """
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    model_list = glob.glob('/kaggle/input/uspppm-token-768-32/*.pth')
    tokenizer = AutoTokenizer.from_pretrained(model)
    loss_fn = 'BinaryCrossEntroyLoss'
    seed = 42
    n_gpu = 1
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    gpu_id = 0
    num_workers = 0
    max_len = 768
    batch_size = 32
    n_folds = 5
    epoch = 24
    gradient_checkpoint = True  # save parameter
    layerwise_lr = 2e-5
    freeze = True
    num_freeze = 4
    reinit = False
    num_reinit = 2
    num_grad_norm = 1000
    weight = 0.8

# CV: 
class CFG2:
    """ Token Classifiication Pipeline 2 """
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    model_list = glob.glob('/kaggle/input/uspppm-token-pipeline-bs1/*.pth')
    tokenizer = AutoTokenizer.from_pretrained(model)
    loss_fn = 'BinaryCrpssEntropy'
    seed = 42
    n_gpu = 1
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    gpu_id = 0
    num_workers = 0
    max_len = 400
    batch_size = 1
    n_folds = 4
    epoch = 24
    gradient_checkpoint = True  # save parameter
    layerwise_lr = 2e-5
    freeze = True
    num_freeze = 2
    reinit = True
    num_reinit = 5
    amp_scaler = False
    awp = False
    swa = True
    num_grad_norm = 1000
    weight = 1.0    
    
class CFG3:
    """ Token Classifiication Pipeline 3 """
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    model_list = glob.glob('/kaggle/input/uspppm-token-classification-sequence768-bs1/*.pth')
    tokenizer = AutoTokenizer.from_pretrained(model)
    loss_fn = 'BinaryCrpssEntropy'
    seed = 42
    n_gpu = 1
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    gpu_id = 0
    num_workers = 0
    max_len = 768
    batch_size = 1
    n_folds = 4
    epoch = 24
    gradient_checkpoint = True  # save parameter
    layerwise_lr = 2e-5
    freeze = True
    num_freeze = 2
    reinit = True
    num_reinit = 5
    num_grad_norm = 1000
    weight = 0.8
    
class CFG4:
    """ Token Classifiication Pipeline 4 """
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    model_list = glob.glob('/kaggle/input/uspppm-token-classification-sequence640-bs32/*.pth')
    tokenizer = AutoTokenizer.from_pretrained(model)
    loss_fn = 'BinaryCrpssEntropy'
    seed = 42
    n_gpu = 1
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    gpu_id = 0
    num_workers = 0
    max_len = 640
    batch_size = 32
    n_folds = 4
    epoch = 24
    gradient_checkpoint = True  # save parameter
    layerwise_lr = 2e-5
    freeze = True
    num_freeze = 4
    reinit = True
    num_reinit = 2
    num_grad_norm = 1000
    weight = 0.8
    
class CFG5:
    """ Token Classifiication Pipeline 5 """
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    model_list = glob.glob('/kaggle/input/uspppm-token-classification-sequence1024-bs16/*.pth')
    tokenizer = AutoTokenizer.from_pretrained(model)
    loss_fn = 'BinaryCrpssEntropy'
    seed = 42
    n_gpu = 1
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    gpu_id = 0
    num_workers = 0
    max_len = 1024
    batch_size = 16
    n_folds = 4
    epoch = 24
    gradient_checkpoint = True  # save parameter
    layerwise_lr = 2e-5
    freeze = True
    num_freeze = 2
    reinit = True
    num_reinit = 5
    num_grad_norm = 1000
    weight = 0.8
    
class CFG6:
    """ Sentence Classifiication Pipeline 6 """
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    model_list = glob.glob('/kaggle/input/pppm-mse-7-out-of-10-folds/microsoft-deberta*.pth')
    tokenizer = AutoTokenizer.from_pretrained(model)
    seed = 42
    n_gpu = 1
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    gpu_id = 0
    num_workers = 0
    max_len = 128
    batch_size = 32
    n_folds = 7
    epoch = 24
    gradient_checkpoint = True  # save parameter
    layerwise_lr = 2e-5
    freeze = True
    num_freeze = 2
    reinit = True
    num_reinit = 5
    num_grad_norm = 1000
    weight = 1.2
    
# class CFG7: # 빼야 할듯
#     """ Token Classifiication Pipeline 7 """
#     model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
#     model_list = glob.glob('/kaggle/input/pppm-10-out-of-20-folds/microsoft-deberta*.pth')
#     tokenizer = AutoTokenizer.from_pretrained(model)
#     seed = 42
#     n_gpu = 1
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     gpu_id = 0
#     num_workers = 0
#     max_len = 128
#     batch_size = 32
#     n_folds = 10
#     epoch = 24
#     gradient_checkpoint = True  # save parameter
#     layerwise_lr = 2e-5
#     freeze = True
#     num_freeze = 2
#     reinit = True
#     num_reinit = 5
#     num_grad_norm = 1000
#     weight = 1.0
    
class CFG7:
    """ Token Classifiication Pipeline 8 """
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    model_list = glob.glob('/kaggle/input/pppm-mse-lr-15e6/microsoft-deberta*.pth')
    tokenizer = AutoTokenizer.from_pretrained(model)
    seed = 42
    n_gpu = 1
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    gpu_id = 0
    num_workers = 0
    max_len = 128
    batch_size = 32
    n_folds = 5
    epoch = 24
    gradient_checkpoint = True  # save parameter
    layerwise_lr = 2e-5
    freeze = True
    num_freeze = 2
    reinit = True
    num_reinit = 5
    num_grad_norm = 1000
    weight = 1.2
    
class CFG8:
    """ Token Classifiication Pipeline 9 """
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    model_list = glob.glob('/kaggle/input/pppm-deberta-v3-mse-exp-3/microsoft-deberta*.pth')
    tokenizer = AutoTokenizer.from_pretrained(model)
    seed = 42
    n_gpu = 1
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    gpu_id = 0
    num_workers = 0
    max_len = 128
    batch_size = 32
    n_folds = 4
    epoch = 24
    gradient_checkpoint = True  # save parameter
    layerwise_lr = 2e-5
    freeze = True
    num_freeze = 2
    reinit = True
    num_reinit = 5
    num_grad_norm = 1000
    weight = 1.2
    
class CFG9:
    """ Token Classifiication Pipeline 10 """
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    model_list = glob.glob('/kaggle/input/pppm-deberta-v3-mse-exp-4/microsoft-deberta*.pth')
    tokenizer = AutoTokenizer.from_pretrained(model)
    seed = 42
    n_gpu = 1
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    gpu_id = 0
    num_workers = 0
    max_len = 128
    batch_size = 32
    n_folds = 4
    epoch = 24
    gradient_checkpoint = True  # save parameter
    layerwise_lr = 2e-5
    freeze = True
    num_freeze = 2
    reinit = True
    num_reinit = 5
    num_grad_norm = 1000
    weight = 1.2
    
# class CFG10:
#     """ Token Classifiication Pipeline 11 """
#     model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
#     model_list = glob.glob('/kaggle/input/pppm-deberta-v3-mse-exp-2/microsoft-deberta*.pth')
#     tokenizer = AutoTokenizer.from_pretrained(model)
#     seed = 42
#     n_gpu = 1
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     gpu_id = 0
#     num_workers = 0
#     max_len = 128
#     batch_size = 32
#     n_folds = 4
#     epoch = 24
#     gradient_checkpoint = True  # save parameter
#     layerwise_lr = 2e-5
#     freeze = True
#     num_freeze = 2
#     reinit = True
#     num_reinit = 5
#     num_grad_norm = 1000
#     weight = 1.0
    
# class CFG12:
#     """ Token Classifiication Pipeline 12 """
#     model = '/kaggle/input/pppm-bert-for-patents-12-out-of-20-folds-mse'
#     model_list = glob.glob('/kaggle/input/pppm-bert-for-patents-12-out-of-20-folds-mse/anferico-bert-for-patents*.pth')
#     tokenizer = AutoTokenizer.from_pretrained(model + '/tokenizer')
#     seed = 42
#     n_gpu = 1
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     gpu_id = 0
#     num_workers = 0
#     max_len = 128
#     batch_size = 32
#     n_folds = 12
#     epoch = 24
#     gradient_checkpoint = True  # save parameter
#     layerwise_lr = 2e-5
#     freeze = True
#     num_freeze = 2
#     reinit = True
#     num_reinit = 5
#     num_grad_norm = 1000
#     weight = 1.0

In [3]:
""" Helper Function """

def check_device() -> bool:
    return torch.mps.is_available()

def check_library(checker: bool) -> tuple:
    """
    1) checker == True
        - current device is mps
    2) checker == False
        - current device is cuda with cudnn
    """
    if not checker:
        _is_built = torch.backends.cudnn.is_available()
        _is_enable = torch.backends.cudnn.enabledtorch.backends.cudnn.enabled
        version = torch.backends.cudnn.version()
        device = (_is_built, _is_enable, version)
        return device

def class2dict(cfg) -> dict:
    return dict((name, getattr(cfg, name)) for name in dir(cfg) if not name.startswith('__'))


def all_type_seed(cfg, checker: bool) -> None:
    # python & torch seed
    os.environ['PYTHONHASHSEED'] = str(cfg.seed)  # python Seed
    random.seed(cfg.seed)  # random module Seed
    np.random.seed(cfg.seed)  # numpy module Seed
    torch.manual_seed(cfg.seed)  # Pytorch CPU Random Seed Maker

    # device == cuda
    if not checker:
        torch.cuda.manual_seed(cfg.seed)  # Pytorch GPU Random Seed Maker
        torch.cuda.manual_seed_all(cfg.seed)  # Pytorch Multi Core GPU Random Seed Maker
        # torch.cudnn seed
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.enabled = False

def seed_worker(worker_id) -> None:
    worker_seed = torch.initial_seed() % 2 ** 32
    np.random.seed(worker_seed)
    random.seed(worker_seed)
    

check_library(True)
all_type_seed(CFG1, True)

g = torch.Generator()
g.manual_seed(CFG1.seed)

<torch._C.Generator at 0x731960598b30>

In [4]:
""" Data & Tokenizer Preprocess Function for Token Classification Pipeline """

def text_preprocess(test_path: str, cpc_path: str):
    """ Preprocess for Token Classification """
    test = pd.read_csv(test_path)
    cpc_texts = torch.load(cpc_path)
    test['context_text'] = test['context'].map(cpc_texts)
    anchor_context_grouped_target = test.groupby(['anchor', 'context'])['target'].apply(list)
    anchor_context_grouped_id = test.groupby(['anchor', 'context'])['id'].apply(list)
    i = pd.DataFrame(anchor_context_grouped_id).reset_index()
    t = pd.DataFrame(anchor_context_grouped_target).reset_index()
    test = t.merge(i, on=['anchor', 'context'])
    test['context_text'] = test['context'].map(cpc_texts)
    test = test.rename(columns={'target': 'targets', 'id': 'ids'})
    test['n_ids'] = test['ids'].map(len)
    return test

def sentence_preprocess(test_path: str, sentence_cpc_path: str):
    """ Preprocess for Sentence Classification """
    sentence_test = pd.read_csv(test_path)
    cpc_texts = torch.load(sentence_cpc_path)
    sentence_test['context_text'] = sentence_test['context'].map(cpc_texts)
    return sentence_test

def add_special_token(cfg) -> None:
    """ Add [TAR] Token to pretrained tokenizer """
    tar_token = '[TAR]'
    special_tokens_dict = {'additional_special_tokens': [f'{tar_token}']}
    cfg.tokenizer.add_special_tokens(special_tokens_dict)
    tar_token_id = cfg.tokenizer(f'{tar_token}', add_special_tokens=False)['input_ids'][0]
    setattr(cfg.tokenizer, 'tar_token', f'{tar_token}')
    setattr(cfg.tokenizer, 'tar_token_id', tar_token_id)
#    cfg.tokenizer.save_pretrained(f'{cfg.checkpoint_dir}/tokenizer/')

In [5]:
""" Model Utils Function """

def freeze(module) -> None:
    """
    Freezes module's parameters.

    [Example]
    freezing embeddings and first 2 layers of encoder
    1) freeze(model.embeddings)
    2) freeze(model.encoder.layer[:2])
    """
    for parameter in module.parameters():
        parameter.requires_grad = False

def get_freeze_parameters(module) -> list:
    """
    Returns names of freezed parameters of the given module.

    [Example]
    freezed_parameters = get_freezed_parameters(model)
    """
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)

    return freezed_parameters

def reinit_topk(model, num_layers) -> None:
    """
    Re-initialize the last-k transformer Encoder layers.
    Encoder Layer: Embedding, Attention Head, LayerNorm, Feed Forward
    Args:
        model: The target transformer model.
        num_layers: The number of layers to be re-initialized.
    """
    model.encoder.layer[-num_layers:].apply(model._init_weights)
    
def postprocess(pseudo_label):
    """ for post processing to teacher model's prediction(pseudo label) """
    label_dict = torch.arange(1, 5.5, 0.5)
    pseudo_label.squeeze()
    for instance in pseudo_label:
        for idx in range(len(instance)):
            instance[idx] = label_dict[(torch.abs(label_dict - instance[idx]) == min(torch.abs(label_dict - instance[idx]))).nonzero(as_tuple=False)]
    return pseudo_label

# classification task
def num_classes(loss_name: str) -> int:
    """
    Return Num Classes for each Training Objectives (loss function)
    1) CE
    2) BCE, Pearson
    """
    if loss_name == "CrossEntropyLoss":
        num_class = 5
    else:
        num_class = 1
    return num_class

In [6]:
""" Data Class for Inference Stage """

class TokenDataset(Dataset):
    """ For Token Classification Task class """
    def __init__(self, cfg, df, is_valid=False):
        super().__init__()
        self.anchor_list = df['anchor'].values
        self.target_list = df['targets'].values
        self.context_list = df['context_text'].values
        self.id_list = df['ids'].values
        self.cfg = cfg
        self.tokenizer = cfg.tokenizer
        self.is_valid = is_valid

    def tokenizing(self, text: str) -> dict:
        inputs = self.cfg.tokenizer(
            text,
            max_length=self.cfg.max_len,
            padding='max_length',
            truncation=True,
            return_tensors=None,
            add_special_tokens=False,
        )

        return inputs

    def __len__(self) -> int:
        return len(self.id_list)

    def __getitem__(self, idx: int):
        targets = np.array(self.target_list[idx])
        text = self.cfg.tokenizer.cls_token + self.anchor_list[idx] + self.cfg.tokenizer.sep_token
        for target in targets:
            text += target + self.cfg.tokenizer.tar_token
        text += self.context_list[idx] + self.cfg.tokenizer.sep_token
        
        inputs = self.tokenizing(text)
        if self.cfg.batch_size == 1:
            target_mask = np.zeros(len([token for token in inputs['input_ids'] if token != 0]))
        else:
            target_mask = np.zeros(self.cfg.max_len)
        cnt_tar = 0
        cnt_sep = 0
        nth_target = -1
        prev_i = -1

        for i, input_id in enumerate(inputs['input_ids']):
            if input_id == self.tokenizer.tar_token_id:
                cnt_tar += 1
                if cnt_tar == len(targets):
                    break
            if input_id == self.tokenizer.sep_token_id:
                cnt_sep += 1

            if cnt_sep == 1 and input_id not in [self.tokenizer.pad_token_id, self.tokenizer.sep_token_id, self.tokenizer.tar_token_id]:
                if (i-prev_i) > 1:
                    nth_target += 1
                target_mask[i] = 1
                prev_i = i
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v)
            
        return inputs, target_mask
    
class SentenceDataset(Dataset):
    """ For Sentence Task class """
    def __init__(self, cfg, df, is_valid=False):
        super().__init__()
        self.cfg = cfg
        self.df = df
        self.tokenizer = cfg.tokenizer
        self.is_valid = is_valid

    def tokenizing(self, text: str) -> dict:
        inputs = self.cfg.tokenizer(
            text,
            max_length=self.cfg.max_len,
            padding='max_length',
            truncation=True,
            return_tensors=None,
            add_special_tokens=False,
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v)
        return inputs

    def __len__(self) -> int:
        return len(self.df.id)

    def __getitem__(self, idx: int):
        anchor = self.df.iloc[idx, 1]
        target = self.df.iloc[idx, 2]
        context_text = self.df.iloc[idx, 4]
        
        text = self.cfg.tokenizer.cls_token + anchor + self.cfg.tokenizer.sep_token
        text = text + target + context_text + self.cfg.tokenizer.sep_token
        inputs = self.tokenizing(text)
        
        return inputs 

In [7]:
""" Model Class for Inference Stage """

class TokenModel(nn.Module):
    """ Inference Class For Token Classification Pipeline """
    def __init__(self, cfg, n_vocabs: int):
        super().__init__()
        self.cfg = cfg
        self.auto_cfg = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        self.model = AutoModel.from_pretrained(cfg.model, config=self.auto_cfg)
        self.model.resize_token_embeddings(n_vocabs)
        self.fc = nn.Linear(self.auto_cfg.hidden_size, num_classes(self.cfg.loss_fn))
 
        if cfg.reinit:
            self._init_weights(self.fc)
            reinit_topk(self.model, cfg.num_reinit)

        if cfg.freeze:
            freeze(self.model.embeddings)
            freeze(self.model.encoder.layer[:cfg.num_freeze])

        if cfg.gradient_checkpoint:
            self.model.gradient_checkpointing_enable()

    def _init_weights(self, module) -> None:
        """ over-ride initializes weights of the given module function (+initializes LayerNorm) """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.auto_cfg.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.auto_cfg.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            """ reference from torch.nn.Layernorm with elementwise_affine=True """
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()

    def feature(self, inputs: dict):
        outputs = self.model(**inputs)
        return outputs

    def forward(self, inputs: dict) -> Tensor:
        outputs = self.feature(inputs)
        logit = self.fc(outputs.last_hidden_state).squeeze(-1)
        return logit
    
    
class SentenceModel(nn.Module):
    """ Inference Class for Sentence Classification Pipeline """
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        self._init_weights(self.attention)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

In [8]:
""" Inference Function for Token Classification """

@inference_mode()
def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    
    for inputs, target_masks in test_loader:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        y_preds = model(inputs)
        y_preds = y_preds.sigmoid().to('cpu').numpy()

        anchorwise_preds = []
        for pred, target_mask, in zip(y_preds, target_masks):
            prev_i = -1
            targetwise_pred_scores = []
            for i, (p, tm) in enumerate(zip(pred, target_mask)):
                if tm != 0:
                    if i-1 == prev_i:
                        targetwise_pred_scores[-1].append(p)
                    else:
                        targetwise_pred_scores.append([p])
                    prev_i = i
            for targetwise_pred_score in targetwise_pred_scores:
                anchorwise_preds.append(np.mean(targetwise_pred_score))
        preds.append(anchorwise_preds)
        
    return preds

@inference_mode()
def sentence_inference_fn(test_loader, model, device, use_sigmoid=False):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        y_preds = model(inputs)
        if use_sigmoid:
            preds.append(y_preds.sigmoid().to('cpu').numpy())
        else:
            preds.append(y_preds.to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [9]:
""" Inference for Token Classification """

test = text_preprocess(
    test_path = '/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv',
    cpc_path = '/kaggle/input/uspppm-cpc-text/4_cpc_texts.pth'
)
sentence_test = sentence_preprocess(
    '/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv',
    '/kaggle/input/uspppm-cpc-text/sentence_cpc_texts.pth'
)
sample_submission = pd.read_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv')

# cfg_list = [CFG1, CFG2, CFG3, CFG4, CFG5, CFG6, CFG7, CFG8, CFG9, CFG10]
cfg_list = [CFG1, CFG2, CFG3, CFG4, CFG5, CFG6, CFG7, CFG8, CFG9]
for _idx, CFG in enumerate(tqdm(cfg_list)):
    submission = sample_submission.copy()
    if _idx < 5:
        """ Token Classification """ 
        add_special_token(CFG)
        tmp_test = test.copy()
        test_dataset = TokenDataset(CFG, tmp_test)

    else:
        """ Sentence Classification """
        tmp_test = sentence_test.copy()
        test_dataset = SentenceDataset(CFG, tmp_test)
        
    test_loader = DataLoader(
        test_dataset,
        batch_size=CFG.batch_size,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False
    )            
    predictions = []
    
    for fold in tqdm(CFG.model_list):
        fold_result = []
        if _idx < 5:
            """ Token-Level Task """ 
            model = TokenModel(CFG, len(CFG.tokenizer))
            state = torch.load(fold, map_location='cpu')
            model.load_state_dict(state)
            prediction = inference_fn(test_loader, model, CFG.device)
        else:
            """ Sentence-Level Task """
            model = SentenceModel(CFG)
            state = torch.load(fold, map_location='cpu')
            model.load_state_dict(state['model'])
            prediction = sentence_inference_fn(test_loader, model, CFG.device)
        
        for result in prediction:
            fold_result.extend(result)
            
        predictions.append(fold_result)
        del model, state, prediction; gc.collect()
        torch.cuda.empty_cache()
        
    predictions = np.mean(predictions, axis=0)
    submission['score'] = predictions
    display(submission.head())
    submission.to_csv(f'submission_{_idx + 1}.csv', index=False)
    del tmp_test, submission, predictions, test_dataset, test_loader; gc.collect()
    torch.cuda.empty_cache()

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,id,score
0,4112d61851461f60,0.620893
1,09e418c93a776564,0.016604
2,36baf228038e314b,0.310364
3,1f37ead645e7f0c8,0.24846
4,71a5b6ad068d531f,0.347724


  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,id,score
0,4112d61851461f60,0.633135
1,09e418c93a776564,0.000805
2,36baf228038e314b,0.297816
3,1f37ead645e7f0c8,0.42871
4,71a5b6ad068d531f,0.362711


  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,id,score
0,4112d61851461f60,0.607807
1,09e418c93a776564,0.001256
2,36baf228038e314b,0.269876
3,1f37ead645e7f0c8,0.354386
4,71a5b6ad068d531f,0.328078


  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,id,score
0,4112d61851461f60,0.578566
1,09e418c93a776564,0.029203
2,36baf228038e314b,0.287961
3,1f37ead645e7f0c8,0.218345
4,71a5b6ad068d531f,0.319255


  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,id,score
0,4112d61851461f60,0.618186
1,09e418c93a776564,0.025365
2,36baf228038e314b,0.28852
3,1f37ead645e7f0c8,0.411407
4,71a5b6ad068d531f,0.305606


  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,id,score
0,4112d61851461f60,0.537211
1,09e418c93a776564,0.76141
2,36baf228038e314b,0.469517
3,1f37ead645e7f0c8,0.287857
4,71a5b6ad068d531f,-0.005556


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,id,score
0,4112d61851461f60,0.571677
1,09e418c93a776564,0.74343
2,36baf228038e314b,0.540604
3,1f37ead645e7f0c8,0.319199
4,71a5b6ad068d531f,0.05509


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,id,score
0,4112d61851461f60,0.567474
1,09e418c93a776564,0.764823
2,36baf228038e314b,0.534415
3,1f37ead645e7f0c8,0.301714
4,71a5b6ad068d531f,0.066509


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,id,score
0,4112d61851461f60,0.567474
1,09e418c93a776564,0.764823
2,36baf228038e314b,0.534415
3,1f37ead645e7f0c8,0.301714
4,71a5b6ad068d531f,0.066509


In [10]:
""" Ensemble for Sentence Classification """
sub1 = pd.read_csv(f'submission_1.csv')['score'] * CFG1.weight
sub2 = pd.read_csv(f'submission_2.csv')['score'] * CFG2.weight
sub3 = pd.read_csv(f'submission_3.csv')['score'] * CFG3.weight
sub4 = pd.read_csv(f'submission_4.csv')['score'] * CFG4.weight
sub5 = pd.read_csv(f'submission_5.csv')['score'] * CFG5.weight
sub6 = pd.read_csv(f'submission_6.csv')['score'] * CFG6.weight
sub7 = pd.read_csv(f'submission_7.csv')['score'] * CFG7.weight
sub8 = pd.read_csv(f'submission_8.csv')['score'] * CFG8.weight
sub9 = pd.read_csv(f'submission_9.csv')['score'] * CFG9.weight
# sub10 = pd.read_csv(f'submission_10.csv')['score'] * CFG10.weight
# sub11 = pd.read_csv(f'submission_11.csv')['score'] * CFG11.weight
# sub12 = pd.read_csv(f'submission_12.csv')['score'] * CFG12.weight

#ens = (sub1 + sub2 + sub3 + sub4 + sub5 + sub6+ sub7 + sub8 + sub9 + sub10)/(CFG1.weight + CFG2.weight + CFG3.weight + CFG4.weight + CFG5.weight + CFG6.weight + CFG7.weight + CFG8.weight + CFG9.weight + CFG10.weight)
ens = (sub1 + sub2 + sub3 + sub4 + sub5 + sub6+ sub7 + sub8 + sub9)/(CFG1.weight + CFG2.weight + CFG3.weight + CFG4.weight + CFG5.weight + CFG6.weight + CFG7.weight + CFG8.weight + CFG9.weight)


sample_submission['score'] = ens
display(sample_submission.head())
sample_submission.to_csv('submission.csv', index=False)

Unnamed: 0,id,score
0,4112d61851461f60,0.585122
1,09e418c93a776564,0.411125
2,36baf228038e314b,0.413104
3,1f37ead645e7f0c8,0.318597
4,71a5b6ad068d531f,0.180256
