In [None]:
# Per-model blend coefficients c1..c15. They add up to 16, the same value as
# `ensemble_divide_val`.
# NOTE(review): the blending code is not in this part of the notebook; presumably
# the weighted sum of predictions is divided by `ensemble_divide_val` - confirm.
ensemble_divide_val = 16

c1, c2, c3, c4, c5 = 1.25, 1.5, 1, 0.5, 1
c6, c7, c8, c9, c10 = 0.5, 1.25, 1, 1, 1
c11, c12, c13, c14, c15 = 1, 1.25, 1.25, 1.25, 1.25

In [None]:
# ====================================================
# Directory settings
# ====================================================
import os

# Competition data is read from INPUT_DIR; logs are written under OUTPUT_DIR.
INPUT_DIR = '../input/us-patent-phrase-to-phrase-matching/'
OUTPUT_DIR = './'
# exist_ok=True makes the cell idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Library

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import shutil
import string
import pickle
import random
import joblib
import itertools
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")


import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
print(f"torch.__version__: {torch.__version__}")
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# Swap the pre-installed transformers/tokenizers for the wheels shipped in the
# attached dataset. `--no-index --find-links=...` makes pip resolve only from that
# local directory, so no network access is needed.
# NOTE(review): the exit codes returned by os.system are ignored, so a failed
# install only surfaces later (at import time or as a version mismatch).
os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels-dataset transformers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels-dataset tokenizers')
import tokenizers
import datasets, transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

#os.environ["WANDB_DISABLED"] = "true"

print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Utils

In [None]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    """Return the Pearson correlation coefficient between two sequences.

    Args:
        y_true: ground-truth values (array-like).
        y_pred: predicted values (array-like, same length as ``y_true``).

    Returns:
        The correlation coefficient, i.e. the first element of the result of
        ``scipy.stats.pearsonr``.
    """
    # `import scipy as sp` does not by itself guarantee that the `stats`
    # subpackage is loaded on every SciPy version, so import it explicitly
    # instead of relying on some other module having imported it already.
    from scipy import stats

    return stats.pearsonr(y_true, y_pred)[0]


def get_logger(filename=OUTPUT_DIR+'train'):
    """Return the module logger, writing plain messages to the console and a file.

    ``getLogger(__name__)`` always returns the same logger object, so calling
    this function again (for example by re-running the notebook cell) used to
    attach a second pair of handlers and print every message twice. Handlers
    are now added only when an equivalent one is not already attached.

    Args:
        filename: log file path without the ``.log`` suffix.

    Returns:
        The configured ``logging.Logger`` at INFO level.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # FileHandler stores the absolute path of its target in `baseFilename`.
    log_path = os.path.abspath(f"{filename}.log")

    # FileHandler is a subclass of StreamHandler, so the console handler is
    # recognised by exact type rather than isinstance().
    has_console = any(type(h) is StreamHandler for h in logger.handlers)
    has_file = any(
        isinstance(h, FileHandler) and h.baseFilename == log_path
        for h in logger.handlers
    )

    if not has_console:
        handler1 = StreamHandler()
        handler1.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler1)
    if not has_file:
        handler2 = FileHandler(filename=f"{filename}.log")
        handler2.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs with ``seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The individual generators are independent, so one pass over the seeders
    # is equivalent to calling each of them by hand.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

# Data Loading

In [None]:
# ====================================================
# Data Loading
# ====================================================
# Read the test rows and the submission template from the competition folder,
# then report their shapes and show the first rows of each.
test = pd.read_csv(f"{INPUT_DIR}test.csv")
submission = pd.read_csv(f"{INPUT_DIR}sample_submission.csv")
print(f"test.shape: {test.shape}")
print(f"submission.shape: {submission.shape}")
display(test.head(), submission.head())

# Dataset

In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenize ``text`` into fixed-length ``torch.long`` tensors.

    Args:
        cfg: configuration object exposing ``tokenizer`` (a callable tokenizer)
            and ``max_len`` (target sequence length).
        text: the string to encode.

    Returns:
        The tokenizer output with every value converted to a ``torch.long``
        tensor.
    """
    # `padding="max_length"` only pads; without `truncation=True` a text longer
    # than `max_len` keeps its full length, and samples of different lengths
    # cannot be stacked into a batch by the DataLoader.
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           truncation=True,
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    """Map-style dataset that tokenizes the ``text`` column of ``df`` on access."""

    def __init__(self, cfg, df):
        # The column is captured as an array here, so later edits to `df`
        # do not change what this dataset returns.
        self.cfg, self.texts = cfg, df['text'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        return prepare_input(self.cfg, self.texts[item])

# Model

In [None]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """Transformer backbone with attention pooling and a linear regression head.

    Args:
        cfg: configuration object; ``model``, ``fc_dropout`` and ``target_size``
            are read here.
        config_path: optional path to a serialized backbone config. When None,
            the config is fetched with ``AutoConfig.from_pretrained(cfg.model)``.
        pretrained: when True the backbone weights are loaded with
            ``AutoModel.from_pretrained``; otherwise the backbone is only
            instantiated from the config (weights are expected to come from a
            checkpoint afterwards).
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            # NOTE(review): torch.load unpickles the file; only point this at
            # config files from a trusted source.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        # `self.attention` is an nn.Sequential, which matches none of the
        # isinstance checks in `_init_weights`; calling it on the container was
        # a no-op. `apply` visits every submodule so the Linear layers inside
        # actually get initialised. Weights loaded from a checkpoint afterwards
        # overwrite these values, so inference results are unaffected.
        self.attention.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise a single Linear / Embedding / LayerNorm module in place."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Pool the backbone's first output with softmax attention over dim 1."""
        outputs = self.model(**inputs)
        # assumes outputs[0] is (batch, seq_len, hidden) - TODO confirm for each backbone
        last_hidden_states = outputs[0]
        # The softmax runs over dim 1 and is not masked, so every position
        # (padding included) receives a weight.
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        return feature

    def forward(self, inputs):
        """Return the head output for a dict of backbone inputs."""
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

# Inference

In [None]:
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device, use_sigmoid=False):
    """Run ``model`` over ``test_loader`` and return the stacked outputs.

    Args:
        test_loader: iterable of batches, each a mapping of name -> tensor.
        model: module called with the whole batch mapping.
        device: device the model and every batch tensor are moved to.
        use_sigmoid: when True, a sigmoid is applied to the model output.

    Returns:
        A NumPy array with the per-batch outputs concatenated along axis 0.
    """
    model.eval()
    model.to(device)
    collected = []
    for batch in tqdm(test_loader, total=len(test_loader)):
        # Move tensors in place so the batch keeps its original mapping type.
        for name in list(batch.keys()):
            batch[name] = batch[name].to(device)
        with torch.no_grad():
            raw = model(batch)
        scored = raw.sigmoid() if use_sigmoid else raw
        collected.append(scored.to('cpu').numpy())
    return np.concatenate(collected)

# Model 1

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    # Model 1 settings.
    # Checkpoint folder and the serialized backbone config stored inside it.
    # NOTE(review): the folder name says "7-out-of-10" but eight folds are listed below.
    path = "../input/pppm-mse-7-out-of-10-folds/"
    config_path = f"{path}config.pth"

    # Backbone identifier and head hyper-parameters.
    model = "microsoft/deberta-v3-large"
    target_size = 1
    fc_dropout = 0.1

    # Tokenization length and DataLoader settings.
    max_len = 133
    batch_size = 32
    num_workers = 4

    # Folds whose checkpoints are averaged; seed and n_fold are kept for reference.
    seed = 42
    n_fold = 10
    trn_fold = list(range(8))

In [None]:
# ====================================================
# tokenizer
# ====================================================
# Tokenizer saved next to the Model 1 checkpoints.
CFG.tokenizer = AutoTokenizer.from_pretrained(f"{CFG.path}tokenizer/")

# ====================================================
# CPC Data
# ====================================================
# Replace each context code with its textual description from the lookup table.
cpc_texts = torch.load(f"{CFG.path}cpc_texts.pth")
test['context_text'] = test['context'].map(cpc_texts)
display(test.head())

# Input format: anchor[SEP]target[SEP]context_text
test['text'] = (
    test['anchor'] + '[SEP]'
    + test['target'] + '[SEP]'
    + test['context_text']
)
display(test.head())

In [None]:
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

predictions1 = []
for fold in CFG.trn_fold:
    # Checkpoints are read onto the CPU; inference_fn moves the model to `device`.
    checkpoint_name = f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"
    state = torch.load(CFG.path + checkpoint_name, map_location=torch.device('cpu'))
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions1.append(prediction)
    # Free this fold's objects before the next model is built.
    del model, state, prediction, checkpoint_name
    gc.collect()
    torch.cuda.empty_cache()
# Element-wise mean over the folds.
predictions1 = np.mean(predictions1, axis=0)

# Model 2

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    # Model 2 settings.
    # Checkpoint folder and the serialized backbone config stored inside it.
    path = "../input/pppm-10-out-of-20-folds/"
    config_path = f"{path}config.pth"

    # Backbone identifier and head hyper-parameters.
    model = "microsoft/deberta-v3-large"
    target_size = 1
    fc_dropout = 0.1

    # Tokenization length and DataLoader settings.
    max_len = 133
    batch_size = 32
    num_workers = 4

    # Folds whose checkpoints are averaged; seed and n_fold are kept for reference.
    seed = 42
    n_fold = 20
    trn_fold = [0, 2, 3, 8, 10, 11, 12, 13, 16, 18]

In [None]:
# ====================================================
# tokenizer
# ====================================================
# Tokenizer saved next to the Model 2 checkpoints.
CFG.tokenizer = AutoTokenizer.from_pretrained(f"{CFG.path}tokenizer/")

# ====================================================
# CPC Data
# ====================================================
# Replace each context code with its textual description from the lookup table.
cpc_texts = torch.load(f"{CFG.path}cpc_texts.pth")
test['context_text'] = test['context'].map(cpc_texts)
display(test.head())

# Input format: anchor[SEP]target[SEP]context_text
test['text'] = (
    test['anchor'] + '[SEP]'
    + test['target'] + '[SEP]'
    + test['context_text']
)
display(test.head())

# ====================================================
# Dataset
# ====================================================
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

In [None]:
predictions2 = []
for fold in CFG.trn_fold:
    # Checkpoints are read onto the CPU; inference_fn moves the model to `device`.
    checkpoint_name = f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"
    state = torch.load(CFG.path + checkpoint_name, map_location=torch.device('cpu'))
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    model.load_state_dict(state['model'])
    # This model's outputs are passed through a sigmoid before averaging.
    prediction = inference_fn(test_loader, model, device, use_sigmoid=True)
    predictions2.append(prediction)
    # Free this fold's objects before the next model is built.
    del model, state, prediction, checkpoint_name
    gc.collect()
    torch.cuda.empty_cache()
# Element-wise mean over the folds.
predictions2 = np.mean(predictions2, axis=0)

# Model 3

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    # Model 3 settings.
    # Checkpoint folder and the serialized backbone config stored inside it.
    path = "../input/pppm-bert-for-patents-mse-4-out-of-5-folds/"
    config_path = f"{path}config.pth"

    # Backbone identifier and head hyper-parameters.
    model = "anferico/bert-for-patents"
    target_size = 1
    fc_dropout = 0.1

    # Tokenization length and DataLoader settings.
    max_len = 133
    batch_size = 32
    num_workers = 4

    # Folds whose checkpoints are averaged; seed and n_fold are kept for reference.
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 3, 4]


In [None]:
# ====================================================
# tokenizer
# ====================================================
# Tokenizer saved next to the Model 3 checkpoints.
CFG.tokenizer = AutoTokenizer.from_pretrained(f"{CFG.path}tokenizer/")

# ====================================================
# CPC Data
# ====================================================
# Replace each context code with its textual description from the lookup table.
cpc_texts = torch.load(f"{CFG.path}cpc_texts.pth")
test['context_text'] = test['context'].map(cpc_texts)
display(test.head())

# Input format: [CLS]anchor[cpc]context_text[SEP]target[SEP]
test['text'] = (
    '[CLS]' + test['anchor']
    + '[cpc]' + test['context_text']
    + '[SEP]' + test['target']
    + '[SEP]'
)
display(test.head())

# ====================================================
# Dataset
# ====================================================
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

In [None]:
predictions3 = []
for fold in CFG.trn_fold:
    # Checkpoints are read onto the CPU; inference_fn moves the model to `device`.
    checkpoint_name = f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"
    state = torch.load(CFG.path + checkpoint_name, map_location=torch.device('cpu'))
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device, use_sigmoid=False)
    predictions3.append(prediction)
    # Free this fold's objects before the next model is built.
    del model, state, prediction, checkpoint_name
    gc.collect()
    torch.cuda.empty_cache()
# Element-wise mean over the folds.
predictions3 = np.mean(predictions3, axis=0)

# Model 4

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    # Model 4 settings.
    # Checkpoint folder and the serialized config path inside it.
    path = "../input/pppm-coco-lm-large-mse-exp-1/"
    config_path = f"{path}config.pth"

    # Backbone identifier and head hyper-parameters.
    model = "microsoft/cocolm-large"
    target_size = 1
    fc_dropout = 0.1

    # Tokenization length and DataLoader settings.
    max_len = 190
    batch_size = 32
    num_workers = 4

    # Folds whose checkpoints are averaged; seed is kept for reference.
    seed = 42
    trn_fold = list(range(5))

In [None]:
sys.path.insert(1, '../input/cocolm/huggingface/')
 
from cocolm.modeling_cocolm import COCOLMModel, COCOLMPreTrainedModel
from cocolm.configuration_cocolm import COCOLMConfig
from cocolm.tokenization_cocolm import COCOLMTokenizer

# ====================================================
# Dataset
# ====================================================
def prepare_input_CoCoLM(cfg, anchor, target, context_text):
    """Build fixed-length COCO-LM input ids for one (anchor, target, context) row.

    The token layout is ``[CLS] anchor [SEP] target [SEP] context_text [SEP]``,
    right-padded with the tokenizer's pad id up to ``cfg.max_len``.

    Sequences longer than ``cfg.max_len`` used to be returned at their full
    length, which makes batching fail because samples no longer share a shape.
    They are now cut to ``cfg.max_len`` while keeping a closing ``[SEP]``.

    Args:
        cfg: configuration exposing ``tokenizer`` and ``max_len``.
        anchor: anchor phrase.
        target: target phrase.
        context_text: textual description of the context code.

    Returns:
        ``{'input_ids': LongTensor of shape (cfg.max_len,)}``. No attention
        mask is produced.
    """
    tokenizer = cfg.tokenizer
    sep_id = tokenizer.sep_token_id

    token_ids = [tokenizer.cls_token_id]
    for segment in (anchor, target, context_text):
        token_ids += tokenizer.encode(segment, add_special_tokens=False)
        token_ids.append(sep_id)

    overflow = len(token_ids) - cfg.max_len
    if overflow > 0:
        # NOTE(review): truncation drops the tail of the context text; confirm
        # this matches how the checkpoints were trained if it ever triggers.
        token_ids = token_ids[:cfg.max_len - 1] + [sep_id]
    elif overflow < 0:
        token_ids = token_ids + [tokenizer.pad_token_id] * (-overflow)

    return {'input_ids': torch.tensor(token_ids, dtype=torch.long)}


# '[CLS]' + train['anchor'] + '[SEP]' + train['target'] + '[SEP]'  + train['context_text'] + '[SEP]'
class CoCoLMTestDataset(Dataset):
    """Map-style dataset yielding COCO-LM inputs built from three text columns."""

    def __init__(self, cfg, df):
        self.cfg = cfg
        # Columns are captured as arrays at construction time.
        self.anchor, self.target, self.context_text = (
            df[column].values for column in ('anchor', 'target', 'context_text')
        )

    def __len__(self):
        return len(self.anchor)

    def __getitem__(self, item):
        row = (self.anchor[item], self.target[item], self.context_text[item])
        return prepare_input_CoCoLM(self.cfg, *row)


# ====================================================
# Model
# ====================================================

# set model path
# Register a local weight file under the 'microsoft/cocolm-large' key in the archive maps of both
# classes, so that the `from_pretrained("microsoft/cocolm-large", ...)` call in CustomCoCoLMModel
# can be served from disk.
# NOTE(review): presumably this is what lets the name resolve offline - confirm against the cocolm package.
COCOLMModel.supported_convert_pretrained_model_archive_map['cocolm']['microsoft/cocolm-large'] = '../input/cocolmlargeweight/pytorch_model.bin'
COCOLMPreTrainedModel.supported_convert_pretrained_model_archive_map['cocolm']['microsoft/cocolm-large'] = '../input/cocolmlargeweight/pytorch_model.bin'


class CustomCoCoLMModel(nn.Module):
    """COCO-LM backbone with attention pooling and a linear regression head.

    Args:
        cfg: configuration object; ``fc_dropout`` and ``target_size`` are read.
        config_path: accepted for signature parity with ``CustomModel``; not
            used, the config is always read from ``../input/cocolmlargeweight/``.
        pretrained: accepted for signature parity; not used, the backbone is
            always built with ``COCOLMModel.from_pretrained``.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        self.config = COCOLMConfig.from_pretrained("../input/cocolmlargeweight/")
        self.model = COCOLMModel.from_pretrained("microsoft/cocolm-large", config=self.config, local_files_only=True)

        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        # `self.attention` is an nn.Sequential, which matches none of the
        # isinstance checks in `_init_weights`; calling it on the container was
        # a no-op. `apply` visits every submodule so the Linear layers inside
        # actually get initialised. Weights loaded from a checkpoint afterwards
        # overwrite these values, so inference results are unaffected.
        self.attention.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise a single Linear / Embedding / LayerNorm module in place."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Pool the backbone's first output with softmax attention over dim 1."""
        outputs = self.model(**inputs)
        # assumes outputs[0] is (batch, seq_len, hidden) - TODO confirm against COCOLMModel
        last_hidden_states = outputs[0]
        # The softmax runs over dim 1 and is not masked, so every position
        # (padding included) receives a weight.
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        return feature

    def forward(self, inputs):
        """Return the head output for a dict of backbone inputs."""
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [None]:
# ====================================================
# tokenizer
# ====================================================
# COCO-LM tokenizer; it lives in its own dataset rather than under CFG.path.
CFG.tokenizer = COCOLMTokenizer.from_pretrained('../input/coco-lm-large/tokenizer/')

# ====================================================
# CPC Data
# ====================================================
# Replace each context code with its textual description from the lookup table.
cpc_texts = torch.load(f"{CFG.path}cpc_texts.pth")
test['context_text'] = test['context'].map(cpc_texts)
display(test.head())

# ====================================================
# Dataset
# ====================================================
# The COCO-LM dataset reads anchor/target/context_text directly, so no 'text'
# column is rebuilt here.
test_dataset = CoCoLMTestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

In [None]:
predictions4 = []
for fold in CFG.trn_fold:
    # Checkpoints are read onto the CPU; inference_fn moves the model to `device`.
    checkpoint_name = f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"
    state = torch.load(CFG.path + checkpoint_name, map_location=torch.device('cpu'))
    model = CustomCoCoLMModel(CFG, config_path=CFG.config_path, pretrained=False)
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device, use_sigmoid=False)
    predictions4.append(prediction)
    # Free this fold's objects before the next model is built.
    del model, state, prediction, checkpoint_name
    gc.collect()
    torch.cuda.empty_cache()
# Element-wise mean over the folds.
predictions4 = np.mean(predictions4, axis=0)

# Model 5

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    # Model 5 settings.
    # Checkpoint folder and the serialized backbone config stored inside it.
    path = "../input/pppm-bert-for-patents-12-out-of-20-folds-mse/"
    config_path = f"{path}config.pth"

    # Backbone identifier and head hyper-parameters.
    model = "anferico/bert-for-patents"
    target_size = 1
    fc_dropout = 0.1

    # Tokenization length and DataLoader settings.
    max_len = 133
    batch_size = 32
    num_workers = 4

    # Folds whose checkpoints are averaged; seed is kept for reference.
    seed = 42
    trn_fold = [0, 1, 2, 4, 5, 8, 10, 11, 12, 13, 14, 16]

In [None]:
# ====================================================
# tokenizer
# ====================================================
# Tokenizer saved next to the Model 5 checkpoints.
CFG.tokenizer = AutoTokenizer.from_pretrained(f"{CFG.path}tokenizer/")

# ====================================================
# CPC Data
# ====================================================
# Replace each context code with its textual description from the lookup table.
cpc_texts = torch.load(f"{CFG.path}cpc_texts.pth")
test['context_text'] = test['context'].map(cpc_texts)
display(test.head())

# Input format: [CLS]anchor[cpc]context_text[SEP]target[SEP]
test['text'] = (
    '[CLS]' + test['anchor']
    + '[cpc]' + test['context_text']
    + '[SEP]' + test['target']
    + '[SEP]'
)
display(test.head())

# ====================================================
# Dataset
# ====================================================
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

In [None]:
predictions5 = []
for fold in CFG.trn_fold:
    # Checkpoints are read onto the CPU; inference_fn moves the model to `device`.
    checkpoint_name = f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"
    state = torch.load(CFG.path + checkpoint_name, map_location=torch.device('cpu'))
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device, use_sigmoid=False)
    predictions5.append(prediction)
    # Free this fold's objects before the next model is built.
    del model, state, prediction, checkpoint_name
    gc.collect()
    torch.cuda.empty_cache()
# Element-wise mean over the folds.
predictions5 = np.mean(predictions5, axis=0)

# Model 6

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    # Model 6 settings.
    # Checkpoint folder and the serialized backbone config stored inside it.
    path = "../input/pppm-bert-for-patents-10-out-of-25-folds/"
    config_path = f"{path}config.pth"

    # Backbone identifier and head hyper-parameters.
    model = "anferico/bert-for-patents"
    target_size = 1
    fc_dropout = 0.15

    # Tokenization length and DataLoader settings.
    max_len = 133
    batch_size = 32
    num_workers = 4

    # Folds whose checkpoints are averaged; seed is kept for reference.
    seed = 42
    trn_fold = [0, 3, 6, 7, 8, 10, 14, 16, 22, 23]

In [None]:
# ====================================================
# tokenizer
# ====================================================
# Tokenizer saved next to the Model 6 checkpoints.
CFG.tokenizer = AutoTokenizer.from_pretrained(f"{CFG.path}tokenizer/")

# ====================================================
# CPC Data
# ====================================================
# Replace each context code with its textual description from the lookup table.
cpc_texts = torch.load(f"{CFG.path}cpc_texts.pth")
test['context_text'] = test['context'].map(cpc_texts)
display(test.head())

# Input format: [CLS]anchor[cpc]context_text[SEP]target[SEP]
test['text'] = (
    '[CLS]' + test['anchor']
    + '[cpc]' + test['context_text']
    + '[SEP]' + test['target']
    + '[SEP]'
)
display(test.head())

# ====================================================
# Dataset
# ====================================================
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

In [None]:
predictions6 = []
for fold in CFG.trn_fold:
    # Checkpoints are read onto the CPU; inference_fn moves the model to `device`.
    checkpoint_name = f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"
    state = torch.load(CFG.path + checkpoint_name, map_location=torch.device('cpu'))
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device, use_sigmoid=False)
    predictions6.append(prediction)
    # Free this fold's objects before the next model is built.
    del model, state, prediction, checkpoint_name
    gc.collect()
    torch.cuda.empty_cache()
# Element-wise mean over the folds.
predictions6 = np.mean(predictions6, axis=0)

# Model 7

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    # Model 7 settings.
    # Checkpoint folder and the serialized config path inside it.
    path = "../input/pppm-cocolm-large-exp-2/"
    config_path = f"{path}config.pth"

    # Backbone identifier and head hyper-parameters.
    model = "microsoft/cocolm-large"
    target_size = 1
    fc_dropout = 0.15

    # Tokenization length and DataLoader settings.
    max_len = 190
    batch_size = 32
    num_workers = 4

    # Folds whose checkpoints are averaged; seed is kept for reference.
    seed = 42
    trn_fold = [0, 1, 2, 4, 6, 10, 12, 13, 18, 19]

In [None]:
# ====================================================
# tokenizer
# ====================================================
# COCO-LM tokenizer; it lives in its own dataset rather than under CFG.path.
CFG.tokenizer = COCOLMTokenizer.from_pretrained('../input/coco-lm-large/tokenizer/')

# ====================================================
# CPC Data
# ====================================================
# Replace each context code with its textual description from the lookup table.
cpc_texts = torch.load(f"{CFG.path}cpc_texts.pth")
test['context_text'] = test['context'].map(cpc_texts)
display(test.head())

# ====================================================
# Dataset
# ====================================================
# The COCO-LM dataset reads anchor/target/context_text directly, so no 'text'
# column is rebuilt here.
test_dataset = CoCoLMTestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

In [None]:
predictions7 = []
for fold in CFG.trn_fold:
    # Checkpoints are read onto the CPU; inference_fn moves the model to `device`.
    checkpoint_name = f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"
    state = torch.load(CFG.path + checkpoint_name, map_location=torch.device('cpu'))
    model = CustomCoCoLMModel(CFG, config_path=CFG.config_path, pretrained=False)
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device, use_sigmoid=False)
    predictions7.append(prediction)
    # Free this fold's objects before the next model is built.
    del model, state, prediction, checkpoint_name
    gc.collect()
    torch.cuda.empty_cache()
# Element-wise mean over the folds.
predictions7 = np.mean(predictions7, axis=0)

# Model 8

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    # Model 8 settings.
    # Checkpoint folder and the serialized backbone config stored inside it.
    path = "../input/pppm-roberta-large-exp-2/"
    config_path = f"{path}config.pth"

    # Backbone identifier and head hyper-parameters.
    model = "roberta-large"
    target_size = 1
    fc_dropout = 0.15

    # Tokenization length and DataLoader settings.
    max_len = 188
    batch_size = 32
    num_workers = 4

    # Folds whose checkpoints are averaged; seed is kept for reference.
    seed = 42
    trn_fold = list(range(7))

In [None]:
# ====================================================
# tokenizer
# ====================================================
# Tokenizer saved next to the Model 8 checkpoints.
CFG.tokenizer = AutoTokenizer.from_pretrained(f"{CFG.path}tokenizer/")

# ====================================================
# CPC Data
# ====================================================
# Replace each context code with its textual description from the lookup table.
cpc_texts = torch.load(f"{CFG.path}cpc_texts.pth")
test['context_text'] = test['context'].map(cpc_texts)
display(test.head())

# Input format: anchor[SEP]target[SEP]context_text
# NOTE(review): '[SEP]' is inserted as literal text; how the saved RoBERTa
# tokenizer treats it is not visible here - it must match the training format.
test['text'] = (
    test['anchor'] + '[SEP]'
    + test['target'] + '[SEP]'
    + test['context_text']
)
display(test.head())

# ====================================================
# Dataset
# ====================================================
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

In [None]:
predictions8 = []
for fold in CFG.trn_fold:
    # Checkpoints are read onto the CPU; inference_fn moves the model to `device`.
    checkpoint_name = f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"
    state = torch.load(CFG.path + checkpoint_name, map_location=torch.device('cpu'))
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device, use_sigmoid=False)
    predictions8.append(prediction)
    # Free this fold's objects before the next model is built.
    del model, state, prediction, checkpoint_name
    gc.collect()
    torch.cuda.empty_cache()
# Element-wise mean over the folds.
predictions8 = np.mean(predictions8, axis=0)

# Model 9

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    # Model 9 settings.
    # Checkpoint folder and the serialized backbone config stored inside it.
    path = "../input/pppm-roberta-large-exp-1/"
    config_path = f"{path}config.pth"

    # Backbone identifier and head hyper-parameters.
    model = "roberta-large"
    target_size = 1
    fc_dropout = 0.15

    # Tokenization length and DataLoader settings.
    max_len = 188
    batch_size = 32
    num_workers = 4

    # Folds whose checkpoints are averaged; seed is kept for reference.
    seed = 42
    trn_fold = list(range(5))

# ====================================================
# tokenizer
# ====================================================
# NOTE(review): no TestDataset/DataLoader is rebuilt for this model in the visible cells, so the
# following inference cell still iterates the `test_loader` created for Model 8. That dataset holds
# a reference to Model 8's CFG object, so the tokenizer assigned here is not the one used for
# encoding. Confirm the reuse is intentional (same tokenizer, text format and max_len).
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path+'tokenizer/')

In [None]:
predictions9 = []
for fold in CFG.trn_fold:
    # Checkpoints are read onto the CPU; inference_fn moves the model to `device`.
    checkpoint_name = f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"
    state = torch.load(CFG.path + checkpoint_name, map_location=torch.device('cpu'))
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    model.load_state_dict(state['model'])
    # `test_loader` is whatever loader the previous cells left in scope.
    prediction = inference_fn(test_loader, model, device, use_sigmoid=False)
    predictions9.append(prediction)
    # Free this fold's objects before the next model is built.
    del model, state, prediction, checkpoint_name
    gc.collect()
    torch.cuda.empty_cache()
# Element-wise mean over the folds.
predictions9 = np.mean(predictions9, axis=0)

# Model 10

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    # Model 10 settings.
    # Checkpoint folder and the serialized backbone config stored inside it.
    path = "../input/pppm-roberta-large-8-out-of-15-folds/"
    config_path = f"{path}config.pth"

    # Backbone identifier and head hyper-parameters.
    model = "roberta-large"
    target_size = 1
    fc_dropout = 0.15

    # Tokenization length and DataLoader settings.
    max_len = 188
    batch_size = 32
    num_workers = 4

    # Folds whose checkpoints are averaged; seed is kept for reference.
    seed = 42
    trn_fold = [1, 2, 3, 5, 8, 10, 12, 13]

# ====================================================
# tokenizer
# ====================================================
# NOTE(review): no TestDataset/DataLoader is rebuilt for this model in the visible cells, so the
# following inference cell still iterates the `test_loader` created for Model 8. That dataset holds
# a reference to Model 8's CFG object, so the tokenizer assigned here is not the one used for
# encoding. Confirm the reuse is intentional (same tokenizer, text format and max_len).
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path+'tokenizer/')

In [None]:
predictions10 = []
for fold in CFG.trn_fold:
    # Checkpoints are read onto the CPU; inference_fn moves the model to `device`.
    checkpoint_name = f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"
    state = torch.load(CFG.path + checkpoint_name, map_location=torch.device('cpu'))
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    model.load_state_dict(state['model'])
    # `test_loader` is whatever loader the previous cells left in scope.
    prediction = inference_fn(test_loader, model, device, use_sigmoid=False)
    predictions10.append(prediction)
    # Free this fold's objects before the next model is built.
    del model, state, prediction, checkpoint_name
    gc.collect()
    torch.cuda.empty_cache()
# Element-wise mean over the folds.
predictions10 = np.mean(predictions10, axis=0)

# Model 11

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    # Model 11 settings.
    # Checkpoint folder and the serialized backbone config stored inside it.
    path = "../input/pppm-bert-for-patents-5-fold-test/"
    config_path = f"{path}config.pth"

    # Backbone identifier and head hyper-parameters.
    model = "anferico/bert-for-patents"
    target_size = 1
    fc_dropout = 0.15

    # Tokenization length and DataLoader settings.
    max_len = 130
    batch_size = 32
    num_workers = 4

    # Folds whose checkpoints are averaged; seed is kept for reference.
    seed = 42
    trn_fold = list(range(5))

In [None]:
# ====================================================
# tokenizer
# ====================================================
# Tokenizer saved next to the Model 11 checkpoints.
CFG.tokenizer = AutoTokenizer.from_pretrained(f"{CFG.path}tokenizer/")

# ====================================================
# CPC Data
# ====================================================
# Replace each context code with its textual description from the lookup table.
cpc_texts = torch.load(f"{CFG.path}cpc_texts.pth")
test['context_text'] = test['context'].map(cpc_texts)
display(test.head())

# Input format: [CLS]anchor[cpc]context_text[SEP]target[SEP]
test['text'] = (
    '[CLS]' + test['anchor']
    + '[cpc]' + test['context_text']
    + '[SEP]' + test['target']
    + '[SEP]'
)
display(test.head())

# ====================================================
# Dataset
# ====================================================
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

In [None]:
predictions11 = []
for fold in CFG.trn_fold:
    # Checkpoints are read onto the CPU; inference_fn moves the model to `device`.
    checkpoint_name = f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"
    state = torch.load(CFG.path + checkpoint_name, map_location=torch.device('cpu'))
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device, use_sigmoid=False)
    predictions11.append(prediction)
    # Free this fold's objects before the next model is built.
    del model, state, prediction, checkpoint_name
    gc.collect()
    torch.cuda.empty_cache()
# Element-wise mean over the folds.
predictions11 = np.mean(predictions11, axis=0)

# Model 12

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for Model 12 (fold outputs become `predictions12`).

    The class is handed to `TestDataset` and `CustomModel` and its fields
    drive the DataLoader and the checkpoint file names built further down.
    """

    # checkpoint identity: `model` is used to build the per-fold file names
    model = "albert-xxlarge-v2"
    path = "../input/pppm-albert-xxl-v2/"
    config_path = f"{path}config.pth"

    # folds whose `*_fold{n}_best.pth` checkpoints are loaded and averaged
    trn_fold = [0, 1, 2, 3, 4]

    # DataLoader settings
    batch_size = 32
    num_workers = 4

    # presumably read by TestDataset / CustomModel (defined outside this cell)
    max_len = 133
    fc_dropout = 0.15
    target_size = 1
    seed = 42

In [None]:
# ====================================================
# tokenizer
# ====================================================
# The tokenizer saved next to the checkpoints is attached to CFG, which is
# then passed to TestDataset below.
tokenizer_dir = CFG.path+'tokenizer/'
CFG.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

# ====================================================
# CPC Data
# ====================================================
# `cpc_texts` is used as the lookup for Series.map, turning each `context`
# value into its stored text (presumably a dict of code -> description).
cpc_texts_file = CFG.path + "cpc_texts.pth"
cpc_texts = torch.load(cpc_texts_file)
test['context_text'] = test['context'].map(cpc_texts)
display(test.head())

# Input layout for this model: anchor [SEP] target [SEP] context
test['text'] = (
    test['anchor']
    + '[SEP]' + test['target']
    + '[SEP]' + test['context_text']
)
display(test.head())

# ====================================================
# Dataset
# ====================================================
# shuffle=False / drop_last=False: every test row is yielded, in order.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

In [None]:
# Run each fold checkpoint over the test loader, then average across folds.
predictions12 = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    checkpoint_file = CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"
    # weights are deserialized onto the CPU
    state = torch.load(checkpoint_file, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    predictions12.append(
        inference_fn(test_loader, model, device, use_sigmoid=True)
    )
    # drop this fold's objects before the next checkpoint is loaded
    del model, state
    gc.collect()
    torch.cuda.empty_cache()
predictions12 = np.mean(predictions12, axis=0)

# Model 13

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for Model 13 (fold outputs become `predictions13`).

    The class is handed to `TestDataset` and `CustomModel` and its fields
    drive the DataLoader and the checkpoint file names built further down.
    """

    # checkpoint identity: `model` is used to build the per-fold file names
    model = "microsoft/deberta-v3-large"
    path = "../input/pppm-base/"
    config_path = f"{path}config.pth"

    # folds whose `*_fold{n}_best.pth` checkpoints are loaded and averaged
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]

    # DataLoader settings
    batch_size = 32
    num_workers = 4

    # presumably read by TestDataset / CustomModel (defined outside this cell)
    max_len = 133
    fc_dropout = 0.2
    target_size = 1
    seed = 42

In [None]:
# ====================================================
# tokenizer
# ====================================================
# The tokenizer saved next to the checkpoints is attached to CFG, which is
# later passed to TestDataset.
tokenizer_dir = CFG.path+'tokenizer/'
CFG.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

In [None]:
# ====================================================
# CPC Data
# ====================================================
# `cpc_texts` is used as the lookup for Series.map, turning each `context`
# value into its stored text (presumably a dict of code -> description).
cpc_texts_file = CFG.path + "cpc_texts.pth"
cpc_texts = torch.load(cpc_texts_file)
test['context_text'] = test['context'].map(cpc_texts)
display(test.head())

# Input layout for this model: anchor [SEP] target [SEP] context
test['text'] = (
    test['anchor']
    + '[SEP]' + test['target']
    + '[SEP]' + test['context_text']
)
display(test.head())

# ====================================================
# Dataset
# ====================================================
# shuffle=False / drop_last=False: every test row is yielded, in order.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

In [None]:
# Run each fold checkpoint over the test loader, then average across folds.
predictions13 = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    checkpoint_file = CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"
    # weights are deserialized onto the CPU
    state = torch.load(checkpoint_file, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    predictions13.append(
        inference_fn(test_loader, model, device, use_sigmoid=True)
    )
    # drop this fold's objects before the next checkpoint is loaded
    del model, state
    gc.collect()
    torch.cuda.empty_cache()
predictions13 = np.mean(predictions13, axis=0)

# Model 14

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for Model 14 (fold outputs become `predictions14`).

    The class is handed to `TestDataset` and `CustomModel` and its fields
    drive the DataLoader and the checkpoint file names built further down.
    """

    # checkpoint identity: `model` is used to build the per-fold file names
    model = "microsoft/deberta-v3-large"
    path = "../input/pppm-deberta-v3-mse-exp-3/"
    config_path = f"{path}config.pth"

    # folds whose `*_fold{n}_best.pth` checkpoints are loaded and averaged
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]

    # DataLoader settings
    batch_size = 32
    num_workers = 4

    # presumably read by TestDataset / CustomModel (defined outside this cell)
    max_len = 133
    fc_dropout = 0.15
    target_size = 1
    seed = 42

In [None]:
# ====================================================
# tokenizer
# ====================================================
# The tokenizer saved next to the checkpoints is attached to CFG, which is
# then passed to TestDataset below.
tokenizer_dir = CFG.path+'tokenizer/'
CFG.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

# ====================================================
# CPC Data
# ====================================================
# `cpc_texts` is used as the lookup for Series.map, turning each `context`
# value into its stored text (presumably a dict of code -> description).
cpc_texts_file = CFG.path + "cpc_texts.pth"
cpc_texts = torch.load(cpc_texts_file)
test['context_text'] = test['context'].map(cpc_texts)
display(test.head())

# Input layout for this model: [CLS] anchor [SEP] target [SEP] context
test['text'] = (
    '[CLS]' + test['anchor']
    + '[SEP]' + test['target']
    + '[SEP]' + test['context_text']
)
display(test.head())

# ====================================================
# Dataset
# ====================================================
# shuffle=False / drop_last=False: every test row is yielded, in order.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

In [None]:
# Run each fold checkpoint over the test loader, then average across folds.
predictions14 = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    checkpoint_file = CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"
    # weights are deserialized onto the CPU
    state = torch.load(checkpoint_file, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    predictions14.append(
        inference_fn(test_loader, model, device, use_sigmoid=True)
    )
    # drop this fold's objects before the next checkpoint is loaded
    del model, state
    gc.collect()
    torch.cuda.empty_cache()
predictions14 = np.mean(predictions14, axis=0)

# Model 15

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for Model 15 (fold outputs become `predictions15`).

    The class is handed to `TestDataset` and `CustomModel` and its fields
    drive the DataLoader and the checkpoint file names built further down.
    """

    # checkpoint identity: `model` is used to build the per-fold file names
    model = "microsoft/deberta-v2-xlarge"
    path = "../input/pppmdebertav2xl/"
    config_path = f"{path}config.pth"

    # folds whose `*_fold{n}_best.pth` checkpoints are loaded and averaged
    trn_fold = [0, 1, 2, 3, 4]

    # DataLoader settings
    batch_size = 32
    num_workers = 4

    # presumably read by TestDataset / CustomModel (defined outside this cell)
    max_len = 133
    fc_dropout = 0.2
    target_size = 1
    seed = 42

In [None]:
# ====================================================
# tokenizer
# ====================================================
# Unlike the other models in this section, the tokenizer is read from the
# dataset root itself (no 'tokenizer/' sub-folder is appended).
tokenizer_dir = CFG.path
CFG.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

# ====================================================
# CPC Data
# ====================================================
# `cpc_texts` is used as the lookup for Series.map, turning each `context`
# value into its stored text (presumably a dict of code -> description).
cpc_texts_file = CFG.path + "cpc_texts.pth"
cpc_texts = torch.load(cpc_texts_file)
test['context_text'] = test['context'].map(cpc_texts)
display(test.head())

# Input layout for this model: anchor [SEP] target [SEP] context
test['text'] = (
    test['anchor']
    + '[SEP]' + test['target']
    + '[SEP]' + test['context_text']
)
display(test.head())

# ====================================================
# Dataset
# ====================================================
# shuffle=False / drop_last=False: every test row is yielded, in order.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

In [None]:
# Run each fold checkpoint over the test loader, then average across folds.
predictions15 = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    checkpoint_file = CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"
    # weights are deserialized onto the CPU
    state = torch.load(checkpoint_file, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    predictions15.append(
        inference_fn(test_loader, model, device, use_sigmoid=True)
    )
    # drop this fold's objects before the next checkpoint is loaded
    del model, state
    gc.collect()
    torch.cuda.empty_cache()
predictions15 = np.mean(predictions15, axis=0)

# Submission

In [None]:
from sklearn.preprocessing import MinMaxScaler

MMscaler = MinMaxScaler()


def _scale_and_weight(fold_mean_preds, weight):
    """Min-max rescale one model's averaged predictions, then apply its blend weight.

    The shared `MMscaler` is re-fitted on every call (`fit_transform`), so each
    model is rescaled using only its own minimum and maximum.
    """
    as_column = fold_mean_preds.reshape(-1,1)  # scaler expects a 2-D array
    return MMscaler.fit_transform(as_column).reshape(-1) * weight


# One rescaled, weighted vector per ensemble member; the weights c1..c15 are
# the constants defined in the notebook's first cell.
pred1_mm = _scale_and_weight(predictions1, c1)
pred2_mm = _scale_and_weight(predictions2, c2)
pred3_mm = _scale_and_weight(predictions3, c3)
pred4_mm = _scale_and_weight(predictions4, c4)
pred5_mm = _scale_and_weight(predictions5, c5)
pred6_mm = _scale_and_weight(predictions6, c6)
pred7_mm = _scale_and_weight(predictions7, c7)
pred8_mm = _scale_and_weight(predictions8, c8)
pred9_mm = _scale_and_weight(predictions9, c9)
pred10_mm = _scale_and_weight(predictions10, c10)
pred11_mm = _scale_and_weight(predictions11, c11)
pred12_mm = _scale_and_weight(predictions12, c12)
pred13_mm = _scale_and_weight(predictions13, c13)
pred14_mm = _scale_and_weight(predictions14, c14)
pred15_mm = _scale_and_weight(predictions15, c15)

In [None]:
# Blend: add the already-weighted model outputs in order (1 -> 15), then
# normalise by `ensemble_divide_val` from the notebook's first cell.
weighted_preds = [
    pred1_mm, pred2_mm, pred3_mm, pred4_mm, pred5_mm,
    pred6_mm, pred7_mm, pred8_mm, pred9_mm, pred10_mm,
    pred11_mm, pred12_mm, pred13_mm, pred14_mm, pred15_mm,
]
predictions = weighted_preds[0]
for member_preds in weighted_preds[1:]:
    # out-of-place add keeps the per-model arrays untouched
    predictions = predictions + member_preds
predictions = predictions / ensemble_divide_val

In [None]:
# predictions = []
# for p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12 in zip(
#     pred1_mm, pred2_mm, pred3_mm, pred4_mm, pred5_mm, 
#     pred6_mm, pred7_mm, pred8_mm, pred9_mm, pred10_mm,
#     pred11_mm, pred12_mm):
    
#     # calculate results
#     result1 = p1 * c1 + p2 * c2 + p3 * c3 + p4 * c4 + p5 * c5
#     result2 = p6 * c6 + p7 * c7 + p8 * c8 + p9 * c9 + p10 * c10
#     result3 = p11 * c11 + p12 * c12
#     predictions.append((result1 + result2 + result3) / ensemble_divide_val)

In [None]:
# Fill the sample submission with the blended scores and write the final CSV.
# NOTE(review): assumes `predictions` follows the row order of
# sample_submission.csv - confirm against how `test` was loaded.
sample_submission_file = INPUT_DIR+'sample_submission.csv'
submission = pd.read_csv(sample_submission_file)
submission['score'] = predictions
display(submission.head())

output_columns = ['id', 'score']
submission[output_columns].to_csv('submission.csv', index=False)