In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import shutil
import string
import pickle
import random
import joblib
import itertools
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")
import scipy as sp
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
import torch
print(f"torch.__version__: {torch.__version__}")
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
# Global compute device used by every inference cell: GPU when available, else CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('begin')

# Competition data location (Kaggle layout) and where artifacts are written.
INPUT_DIR = '/kaggle/input/us-patent-phrase-to-phrase-matching/'
# INPUT_DIR = ''
OUTPUT_DIR = './'
# exist_ok avoids the check-then-create race of `if not exists: makedirs`.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
class Config:
    """Mutable bag of run settings shared by the cells of this notebook.

    Later cells overwrite ``MODEL_PATH`` and attach ``TOKENIZER`` and
    ``MAX_LENGTH`` to the shared ``CONFIG`` instance for each model family.
    """

    def __init__(self):
        super().__init__()
        defaults = {
            'DEBUG': False,          # when True only a slice of train.csv is read
            'SEED': 42,
            'MODEL_TYPE': 'electra_large',
            'MODEL_PATH': 'google/electra-large-discriminator',
            'BATCH_SIZE': 8,
            'DEVICE': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
            'LR': 1e-5,
            'N_WARMUP': 0,
            'EPOCHS': 5,
        }
        for name, value in defaults.items():
            setattr(self, name, value)

CONFIG = Config()

# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    """Return the Pearson correlation coefficient between labels and predictions.

    Args:
        y_true: 1-D sequence of reference values.
        y_pred: 1-D sequence of predicted values, same length as ``y_true``.

    Returns:
        The correlation coefficient (first element of ``pearsonr``'s result).
    """
    # Import the submodule explicitly: a bare `import scipy as sp` does not
    # guarantee that `sp.stats` has been loaded on older SciPy releases.
    from scipy.stats import pearsonr
    return pearsonr(y_true, y_pred)[0]

def get_logger(filename=OUTPUT_DIR + 'train'):
    """Return the module logger, writing bare messages to the console and a file.

    Args:
        filename: Path of the log file without the ``.log`` suffix.

    Returns:
        The logger named ``__name__`` at INFO level with exactly one stream
        handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger() hands back the same object on every call, so re-running this
    # cell used to stack handlers and print every message several times.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger
LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(seed=CONFIG.SEED)


# ====================================================
# Data Loading
# ====================================================
# DEBUG runs read only the first 1000 training rows; nrows=None reads the whole file.
train = pd.read_csv(INPUT_DIR + 'train.csv', nrows=1000 if CONFIG.DEBUG else None)
test = pd.read_csv(INPUT_DIR + 'test.csv')
submission = pd.read_csv(INPUT_DIR + 'sample_submission.csv')
print(f"train.shape: {train.shape}")
print(f"test.shape: {test.shape}")
print(f"submission.shape: {submission.shape}")

# ====================================================
# CPC Data
# ====================================================
def get_cpc_texts():
    """Build a mapping from CPC context code to a descriptive title string.

    Context codes (an uppercase letter followed by digits, e.g. ``A01``) are
    collected from the file names in the CPC scheme directory. For each code
    the value is ``"<section title>. <code title>"``, both read from the
    per-section title list files.

    Returns:
        dict mapping each context code to its combined title text.

    Raises:
        IndexError: if a section or context code has no line in its title file.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    code_pattern = r'[A-Z]\d+'
    contexts = sorted({
        code
        for file_name in os.listdir('../input/cpc-data/CPCSchemeXML202105')
        for code in re.findall(code_pattern, file_name)
    })
    results = {}
    for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        with open(f'../input/cpc-data/CPCTitleList202202/cpc-section-{cpc}_20220201.txt', encoding='utf-8') as f:
            s = f.read()
        section_pattern = f'{cpc}\t\t.+'
        # NOTE(review): str.lstrip() treats its argument as a *set of characters*,
        # not a prefix, so a title whose first letters are in that set loses them
        # (e.g. a title starting with the section letter). Kept as-is on purpose:
        # the saved checkpoints were presumably trained on exactly this text -
        # confirm before switching to a real prefix removal.
        cpc_result = re.findall(section_pattern, s)[0].lstrip(section_pattern)
        for context in (c for c in contexts if c[0] == cpc):
            context_pattern = f'{context}\t\t.+'
            context_title = re.findall(context_pattern, s)[0].lstrip(context_pattern)
            results[context] = cpc_result + ". " + context_title
    return results

cpc_texts = get_cpc_texts()
torch.save(cpc_texts, OUTPUT_DIR+"cpc_texts.pth")

# Same feature construction for both frames: look up the lower-cased CPC title
# for each row's context code, then join anchor / target / context into one
# string with the literal '[SEP]' between the fields.
for frame in (train, test):
    frame['context_text'] = frame['context'].map(cpc_texts).map(lambda s : s.lower())
    frame['text'] = 'anchor:' + frame['anchor'] + '[SEP]' + 'target:' + frame['target'] + '[SEP]' + 'context:' + frame['context_text']

In [None]:
# ====================================================
# tokenizer
# ====================================================
CONFIG.MODEL_PATH = '../input/deberta-v3-large/deberta-v3-large'
# Load the tokenizer for this model path, save a copy next to the outputs and
# publish it on CONFIG, where TrainDataset reads it.
tokenizer = AutoTokenizer.from_pretrained(CONFIG.MODEL_PATH)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CONFIG.TOKENIZER = tokenizer

# ====================================================
# Define max_len
# ====================================================
# Token counts (special tokens excluded) for every CPC context description,
# then for every training anchor and target phrase.
lengths_dict = {
    'context_text': [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(cpc_texts.values(), total=len(cpc_texts))
    ]
}
for text_col in ['anchor', 'target']:
    lengths_dict[text_col] = [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(train[text_col].fillna("").values, total=len(train))
    ]

# Longest anchor + longest target + longest context, plus the special tokens.
CONFIG.MAX_LENGTH = sum(
    max(lengths_dict[key]) for key in ('anchor', 'target', 'context_text')
) + 4  # CLS + SEP + SEP + SEP
LOGGER.info(f"max_len: {CONFIG.MAX_LENGTH}")

class TrainDataset(Dataset):
    """Tokenizes the ``text`` column of a dataframe into fixed-length model inputs.

    The tokenizer and the padded length are read from ``CONFIG.TOKENIZER`` and
    ``CONFIG.MAX_LENGTH`` when the dataset is constructed. Items carry no
    label, only ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, df):
        """Copy the texts of ``df['text']`` and capture the current CONFIG tokenizer settings."""
        self.texts = df['text'].values.tolist()
        self.tokenizer = CONFIG.TOKENIZER
        self.max_length = CONFIG.MAX_LENGTH

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return ``input_ids`` and ``attention_mask`` as 1-D int64 tensors of length ``max_length``."""
        tokenized = self.tokenizer.encode_plus(
            self.texts[index],
            add_special_tokens=True,
            max_length=self.max_length,
            # `pad_to_max_length=True` is deprecated; with max_length set this
            # is the supported spelling of the same padding behaviour.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch axis of size 1; drop only that
        # axis so the sequence axis survives whatever its length is.
        input_ids = tokenized['input_ids'].squeeze(0)
        attention_mask = tokenized['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids.long(),
            'attention_mask': attention_mask.long(),
        }



class Model(nn.Module):
    """Sequence-classification backbone with a single output and an attached loss.

    The backbone is loaded from ``CONFIG.MODEL_PATH`` with ``num_labels=1``.
    ``training_mode='MSE'`` attaches a mean-squared-error loss; any other value
    attaches binary cross-entropy on logits.
    """

    def __init__(self, training_mode = 'MSE'):
        super().__init__()

        self.bert = AutoModelForSequenceClassification.from_pretrained(CONFIG.MODEL_PATH, num_labels=1)

        loss_cls = nn.MSELoss if training_mode == 'MSE' else nn.BCEWithLogitsLoss
        self.loss = loss_cls(reduction="mean")

    def forward(self, input_ids, attention_mask, targets = None):
        """Return ``(logits, loss)``; ``loss`` is 0 when no ``targets`` are given."""
        logits = self.bert(input_ids=input_ids,
                           attention_mask=attention_mask).logits
        if targets is None:
            return logits, 0
        # Both sides are flattened to a single column before the loss is applied.
        return logits, self.loss(logits.reshape(-1,1), targets.reshape(-1,1))

def val_fn(model, valid_dataloader, mode = 'MSE'):
    """Run ``model`` over a dataloader without gradients and collect its predictions.

    Args:
        model: Module returning ``(logits, loss)`` for ``input_ids`` / ``attention_mask``.
        valid_dataloader: Yields dict batches with ``input_ids`` and ``attention_mask``.
        mode: ``'MSE'`` returns the raw outputs; any other value applies a sigmoid.

    Returns:
        1-D NumPy array with one prediction per example, in dataloader order.
    """
    model.eval()
    preds = []
    with torch.no_grad():
        for batch in tqdm(valid_dataloader,
                          total=len(valid_dataloader),
                          desc='validing'):
            # Batches are moved to the notebook-level `device`.
            b_input_ids = batch['input_ids'].to(device)
            b_attention_mask = batch['attention_mask'].to(device)
            # No targets are passed, so the returned loss is a placeholder.
            y_preds, _ = model(input_ids=b_input_ids, attention_mask=b_attention_mask)
            y_preds = y_preds.reshape(-1,)
            if mode != 'MSE':
                y_preds = y_preds.sigmoid()
            preds.append(y_preds.to('cpu').numpy())
    return np.concatenate(preds)
# The DataLoader does not shuffle, so predictions stay aligned with `test` rows.
valid_ds = TrainDataset(test)
valid_dl = DataLoader(valid_ds, batch_size=CONFIG.BATCH_SIZE*10)
torch.manual_seed(CONFIG.SEED)

# Each fold adds a quarter of its predictions, i.e. the result is the 4-fold mean.
predictions1 = 0
predictions2 = 0
for fold in range(4):
    model = Model()
    model = model.to(device)
    # map_location lets checkpoints saved on a GPU load when `device` is the CPU.
    model.load_state_dict(torch.load(f'../input/deberta-large-cv/deberta_v3_large_BCE_fold_{fold}.pt', map_location=device))
    predictions1 += val_fn(model, valid_dl, mode = 'BCE')/4
    model.load_state_dict(torch.load(f'../input/deberta-large-cv/deberta_v3_large_MSE_fold_{fold}.pt', map_location=device))
    predictions2 += val_fn(model, valid_dl, mode = 'MSE')/4

In [None]:
# ====================================================
# tokenizer
# ====================================================
CONFIG.MODEL_PATH = '../input/bert-for-patents'
# Load the tokenizer for this model path, save a copy next to the outputs and
# publish it on CONFIG, where TrainDataset reads it.
tokenizer = AutoTokenizer.from_pretrained(CONFIG.MODEL_PATH)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CONFIG.TOKENIZER = tokenizer

# ====================================================
# Define max_len
# ====================================================
# Token counts (special tokens excluded) for every CPC context description,
# then for every training anchor and target phrase.
lengths_dict = {
    'context_text': [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(cpc_texts.values(), total=len(cpc_texts))
    ]
}
for text_col in ['anchor', 'target']:
    lengths_dict[text_col] = [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(train[text_col].fillna("").values, total=len(train))
    ]

# Longest anchor + longest target + longest context, plus the special tokens.
CONFIG.MAX_LENGTH = sum(
    max(lengths_dict[key]) for key in ('anchor', 'target', 'context_text')
) + 4  # CLS + SEP + SEP + SEP
LOGGER.info(f"max_len: {CONFIG.MAX_LENGTH}")

class TrainDataset(Dataset):
    """Tokenizes the ``text`` column of a dataframe into fixed-length model inputs.

    The tokenizer and the padded length are read from ``CONFIG.TOKENIZER`` and
    ``CONFIG.MAX_LENGTH`` when the dataset is constructed. Items carry no
    label, only ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, df):
        """Copy the texts of ``df['text']`` and capture the current CONFIG tokenizer settings."""
        self.texts = df['text'].values.tolist()
        self.tokenizer = CONFIG.TOKENIZER
        self.max_length = CONFIG.MAX_LENGTH

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return ``input_ids`` and ``attention_mask`` as 1-D int64 tensors of length ``max_length``."""
        tokenized = self.tokenizer.encode_plus(
            self.texts[index],
            add_special_tokens=True,
            max_length=self.max_length,
            # `pad_to_max_length=True` is deprecated; with max_length set this
            # is the supported spelling of the same padding behaviour.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch axis of size 1; drop only that
        # axis so the sequence axis survives whatever its length is.
        input_ids = tokenized['input_ids'].squeeze(0)
        attention_mask = tokenized['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids.long(),
            'attention_mask': attention_mask.long(),
        }



class Model(nn.Module):
    """Sequence-classification backbone with a single output and an attached loss.

    The backbone is loaded from ``CONFIG.MODEL_PATH`` with ``num_labels=1``.
    ``training_mode='MSE'`` attaches a mean-squared-error loss; any other value
    attaches binary cross-entropy on logits.
    """

    def __init__(self, training_mode = 'MSE'):
        super().__init__()

        self.bert = AutoModelForSequenceClassification.from_pretrained(CONFIG.MODEL_PATH, num_labels=1)

        loss_cls = nn.MSELoss if training_mode == 'MSE' else nn.BCEWithLogitsLoss
        self.loss = loss_cls(reduction="mean")

    def forward(self, input_ids, attention_mask, targets = None):
        """Return ``(logits, loss)``; ``loss`` is 0 when no ``targets`` are given."""
        logits = self.bert(input_ids=input_ids,
                           attention_mask=attention_mask).logits
        if targets is None:
            return logits, 0
        # Both sides are flattened to a single column before the loss is applied.
        return logits, self.loss(logits.reshape(-1,1), targets.reshape(-1,1))

def val_fn(model, valid_dataloader, mode = 'MSE'):
    """Run ``model`` over a dataloader without gradients and collect its predictions.

    Args:
        model: Module returning ``(logits, loss)`` for ``input_ids`` / ``attention_mask``.
        valid_dataloader: Yields dict batches with ``input_ids`` and ``attention_mask``.
        mode: ``'MSE'`` returns the raw outputs; any other value applies a sigmoid.

    Returns:
        1-D NumPy array with one prediction per example, in dataloader order.
    """
    model.eval()
    preds = []
    with torch.no_grad():
        for batch in tqdm(valid_dataloader,
                          total=len(valid_dataloader),
                          desc='validing'):
            # Batches are moved to the notebook-level `device`.
            b_input_ids = batch['input_ids'].to(device)
            b_attention_mask = batch['attention_mask'].to(device)
            # No targets are passed, so the returned loss is a placeholder.
            y_preds, _ = model(input_ids=b_input_ids, attention_mask=b_attention_mask)
            y_preds = y_preds.reshape(-1,)
            if mode != 'MSE':
                y_preds = y_preds.sigmoid()
            preds.append(y_preds.to('cpu').numpy())
    return np.concatenate(preds)
# The DataLoader does not shuffle, so predictions stay aligned with `test` rows.
valid_ds = TrainDataset(test)
valid_dl = DataLoader(valid_ds, batch_size=CONFIG.BATCH_SIZE*10)
torch.manual_seed(CONFIG.SEED)

# Each fold adds a quarter of its predictions, i.e. the result is the 4-fold mean.
predictions3 = 0
predictions4 = 0
for fold in range(4):
    model = Model()
    model = model.to(device)
    # map_location lets checkpoints saved on a GPU load when `device` is the CPU.
    model.load_state_dict(torch.load(f'../input/bert-cv/bert_for_patents_BCE_fold_{fold}.pt', map_location=device))
    predictions3 += val_fn(model, valid_dl, mode = 'BCE')/4
    model.load_state_dict(torch.load(f'../input/bert-cv/bert_for_patents_MSE_fold_{fold}.pt', map_location=device))
    predictions4 += val_fn(model, valid_dl, mode = 'MSE')/4

In [None]:
# ====================================================
# tokenizer
# ====================================================
CONFIG.MODEL_PATH = '../input/deberta-v3-base/deberta-v3-base'
# Load the tokenizer for this model path, save a copy next to the outputs and
# publish it on CONFIG, where TrainDataset reads it.
tokenizer = AutoTokenizer.from_pretrained(CONFIG.MODEL_PATH)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CONFIG.TOKENIZER = tokenizer

# ====================================================
# Define max_len
# ====================================================
# Token counts (special tokens excluded) for every CPC context description,
# then for every training anchor and target phrase.
lengths_dict = {
    'context_text': [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(cpc_texts.values(), total=len(cpc_texts))
    ]
}
for text_col in ['anchor', 'target']:
    lengths_dict[text_col] = [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(train[text_col].fillna("").values, total=len(train))
    ]

# Longest anchor + longest target + longest context, plus the special tokens.
CONFIG.MAX_LENGTH = sum(
    max(lengths_dict[key]) for key in ('anchor', 'target', 'context_text')
) + 4  # CLS + SEP + SEP + SEP
LOGGER.info(f"max_len: {CONFIG.MAX_LENGTH}")

class TrainDataset(Dataset):
    """Tokenizes the ``text`` column of a dataframe into fixed-length model inputs.

    The tokenizer and the padded length are read from ``CONFIG.TOKENIZER`` and
    ``CONFIG.MAX_LENGTH`` when the dataset is constructed. Items carry no
    label, only ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, df):
        """Copy the texts of ``df['text']`` and capture the current CONFIG tokenizer settings."""
        self.texts = df['text'].values.tolist()
        self.tokenizer = CONFIG.TOKENIZER
        self.max_length = CONFIG.MAX_LENGTH

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return ``input_ids`` and ``attention_mask`` as 1-D int64 tensors of length ``max_length``."""
        tokenized = self.tokenizer.encode_plus(
            self.texts[index],
            add_special_tokens=True,
            max_length=self.max_length,
            # `pad_to_max_length=True` is deprecated; with max_length set this
            # is the supported spelling of the same padding behaviour.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch axis of size 1; drop only that
        # axis so the sequence axis survives whatever its length is.
        input_ids = tokenized['input_ids'].squeeze(0)
        attention_mask = tokenized['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids.long(),
            'attention_mask': attention_mask.long(),
        }



class Model(nn.Module):
    """Sequence-classification backbone with a single output and an attached loss.

    The backbone is loaded from ``CONFIG.MODEL_PATH`` with ``num_labels=1``.
    ``training_mode='MSE'`` attaches a mean-squared-error loss; any other value
    attaches binary cross-entropy on logits.
    """

    def __init__(self, training_mode = 'MSE'):
        super().__init__()

        self.bert = AutoModelForSequenceClassification.from_pretrained(CONFIG.MODEL_PATH, num_labels=1)

        loss_cls = nn.MSELoss if training_mode == 'MSE' else nn.BCEWithLogitsLoss
        self.loss = loss_cls(reduction="mean")

    def forward(self, input_ids, attention_mask, targets = None):
        """Return ``(logits, loss)``; ``loss`` is 0 when no ``targets`` are given."""
        logits = self.bert(input_ids=input_ids,
                           attention_mask=attention_mask).logits
        if targets is None:
            return logits, 0
        # Both sides are flattened to a single column before the loss is applied.
        return logits, self.loss(logits.reshape(-1,1), targets.reshape(-1,1))

def val_fn(model, valid_dataloader, mode = 'MSE'):
    """Run ``model`` over a dataloader without gradients and collect its predictions.

    Args:
        model: Module returning ``(logits, loss)`` for ``input_ids`` / ``attention_mask``.
        valid_dataloader: Yields dict batches with ``input_ids`` and ``attention_mask``.
        mode: ``'MSE'`` returns the raw outputs; any other value applies a sigmoid.

    Returns:
        1-D NumPy array with one prediction per example, in dataloader order.
    """
    model.eval()
    preds = []
    with torch.no_grad():
        for batch in tqdm(valid_dataloader,
                          total=len(valid_dataloader),
                          desc='validing'):
            # Batches are moved to the notebook-level `device`.
            b_input_ids = batch['input_ids'].to(device)
            b_attention_mask = batch['attention_mask'].to(device)
            # No targets are passed, so the returned loss is a placeholder.
            y_preds, _ = model(input_ids=b_input_ids, attention_mask=b_attention_mask)
            y_preds = y_preds.reshape(-1,)
            if mode != 'MSE':
                y_preds = y_preds.sigmoid()
            preds.append(y_preds.to('cpu').numpy())
    return np.concatenate(preds)
# The DataLoader does not shuffle, so predictions stay aligned with `test` rows.
valid_ds = TrainDataset(test)
valid_dl = DataLoader(valid_ds, batch_size=CONFIG.BATCH_SIZE*10)
torch.manual_seed(CONFIG.SEED)

# Each fold adds a quarter of its predictions, i.e. the result is the 4-fold mean.
predictions5 = 0
predictions6 = 0
for fold in range(4):
    model = Model()
    model = model.to(device)
    # map_location lets checkpoints saved on a GPU load when `device` is the CPU.
    model.load_state_dict(torch.load(f'../input/deberta-base-cv/deberta_v3_base_BCE_fold_{fold}.pt', map_location=device))
    predictions5 += val_fn(model, valid_dl, mode = 'BCE')/4
    model.load_state_dict(torch.load(f'../input/deberta-base-cv/deberta_v3_base_MSE_fold_{fold}.pt', map_location=device))
    predictions6 += val_fn(model, valid_dl, mode = 'MSE')/4

In [None]:
# ====================================================
# tokenizer
# ====================================================
CONFIG.MODEL_PATH = '../input/deberta-v3-small/deberta-v3-small'
# Load the tokenizer for this model path, save a copy next to the outputs and
# publish it on CONFIG, where TrainDataset reads it.
tokenizer = AutoTokenizer.from_pretrained(CONFIG.MODEL_PATH)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CONFIG.TOKENIZER = tokenizer

# ====================================================
# Define max_len
# ====================================================
# Token counts (special tokens excluded) for every CPC context description,
# then for every training anchor and target phrase.
lengths_dict = {
    'context_text': [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(cpc_texts.values(), total=len(cpc_texts))
    ]
}
for text_col in ['anchor', 'target']:
    lengths_dict[text_col] = [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(train[text_col].fillna("").values, total=len(train))
    ]

# Longest anchor + longest target + longest context, plus the special tokens.
CONFIG.MAX_LENGTH = sum(
    max(lengths_dict[key]) for key in ('anchor', 'target', 'context_text')
) + 4  # CLS + SEP + SEP + SEP
LOGGER.info(f"max_len: {CONFIG.MAX_LENGTH}")

class TrainDataset(Dataset):
    """Tokenizes the ``text`` column of a dataframe into fixed-length model inputs.

    The tokenizer and the padded length are read from ``CONFIG.TOKENIZER`` and
    ``CONFIG.MAX_LENGTH`` when the dataset is constructed. Items carry no
    label, only ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, df):
        """Copy the texts of ``df['text']`` and capture the current CONFIG tokenizer settings."""
        self.texts = df['text'].values.tolist()
        self.tokenizer = CONFIG.TOKENIZER
        self.max_length = CONFIG.MAX_LENGTH

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return ``input_ids`` and ``attention_mask`` as 1-D int64 tensors of length ``max_length``."""
        tokenized = self.tokenizer.encode_plus(
            self.texts[index],
            add_special_tokens=True,
            max_length=self.max_length,
            # `pad_to_max_length=True` is deprecated; with max_length set this
            # is the supported spelling of the same padding behaviour.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch axis of size 1; drop only that
        # axis so the sequence axis survives whatever its length is.
        input_ids = tokenized['input_ids'].squeeze(0)
        attention_mask = tokenized['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids.long(),
            'attention_mask': attention_mask.long(),
        }



class Model(nn.Module):
    """Sequence-classification backbone with a single output and an attached loss.

    The backbone is loaded from ``CONFIG.MODEL_PATH`` with ``num_labels=1``.
    ``training_mode='MSE'`` attaches a mean-squared-error loss; any other value
    attaches binary cross-entropy on logits.
    """

    def __init__(self, training_mode = 'MSE'):
        super().__init__()

        self.bert = AutoModelForSequenceClassification.from_pretrained(CONFIG.MODEL_PATH, num_labels=1)

        loss_cls = nn.MSELoss if training_mode == 'MSE' else nn.BCEWithLogitsLoss
        self.loss = loss_cls(reduction="mean")

    def forward(self, input_ids, attention_mask, targets = None):
        """Return ``(logits, loss)``; ``loss`` is 0 when no ``targets`` are given."""
        logits = self.bert(input_ids=input_ids,
                           attention_mask=attention_mask).logits
        if targets is None:
            return logits, 0
        # Both sides are flattened to a single column before the loss is applied.
        return logits, self.loss(logits.reshape(-1,1), targets.reshape(-1,1))

def val_fn(model, valid_dataloader, mode = 'MSE'):
    """Run ``model`` over a dataloader without gradients and collect its predictions.

    Args:
        model: Module returning ``(logits, loss)`` for ``input_ids`` / ``attention_mask``.
        valid_dataloader: Yields dict batches with ``input_ids`` and ``attention_mask``.
        mode: ``'MSE'`` returns the raw outputs; any other value applies a sigmoid.

    Returns:
        1-D NumPy array with one prediction per example, in dataloader order.
    """
    model.eval()
    preds = []
    with torch.no_grad():
        for batch in tqdm(valid_dataloader,
                          total=len(valid_dataloader),
                          desc='validing'):
            # Batches are moved to the notebook-level `device`.
            b_input_ids = batch['input_ids'].to(device)
            b_attention_mask = batch['attention_mask'].to(device)
            # No targets are passed, so the returned loss is a placeholder.
            y_preds, _ = model(input_ids=b_input_ids, attention_mask=b_attention_mask)
            y_preds = y_preds.reshape(-1,)
            if mode != 'MSE':
                y_preds = y_preds.sigmoid()
            preds.append(y_preds.to('cpu').numpy())
    return np.concatenate(preds)
# The DataLoader does not shuffle, so predictions stay aligned with `test` rows.
valid_ds = TrainDataset(test)
valid_dl = DataLoader(valid_ds, batch_size=CONFIG.BATCH_SIZE*10)
torch.manual_seed(CONFIG.SEED)

# Each fold adds a quarter of its predictions, i.e. the result is the 4-fold mean.
predictions7 = 0
for fold in range(4):
    model = Model()
    model = model.to(device)
    # map_location lets checkpoints saved on a GPU load when `device` is the CPU.
    model.load_state_dict(torch.load(f'../input/deberta-small-cv/deberta_v3_small_MSE_fold_{fold}.pt', map_location=device))
    predictions7 += val_fn(model, valid_dl, mode = 'MSE')/4

In [None]:
# ====================================================
# tokenizer
# ====================================================
CONFIG.MODEL_PATH = '../input/ernie-large'
# Load the tokenizer for this model path, save a copy next to the outputs and
# publish it on CONFIG, where TrainDataset reads it.
tokenizer = AutoTokenizer.from_pretrained(CONFIG.MODEL_PATH)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CONFIG.TOKENIZER = tokenizer

# ====================================================
# Define max_len
# ====================================================
# Token counts (special tokens excluded) for every CPC context description,
# then for every training anchor and target phrase.
lengths_dict = {
    'context_text': [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(cpc_texts.values(), total=len(cpc_texts))
    ]
}
for text_col in ['anchor', 'target']:
    lengths_dict[text_col] = [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(train[text_col].fillna("").values, total=len(train))
    ]

# Longest anchor + longest target + longest context, plus the special tokens.
CONFIG.MAX_LENGTH = sum(
    max(lengths_dict[key]) for key in ('anchor', 'target', 'context_text')
) + 4  # CLS + SEP + SEP + SEP
LOGGER.info(f"max_len: {CONFIG.MAX_LENGTH}")

class TrainDataset(Dataset):
    """Tokenizes the ``text`` column of a dataframe into fixed-length model inputs.

    The tokenizer and the padded length are read from ``CONFIG.TOKENIZER`` and
    ``CONFIG.MAX_LENGTH`` when the dataset is constructed. Items carry no
    label, only ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, df):
        """Copy the texts of ``df['text']`` and capture the current CONFIG tokenizer settings."""
        self.texts = df['text'].values.tolist()
        self.tokenizer = CONFIG.TOKENIZER
        self.max_length = CONFIG.MAX_LENGTH

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return ``input_ids`` and ``attention_mask`` as 1-D int64 tensors of length ``max_length``."""
        tokenized = self.tokenizer.encode_plus(
            self.texts[index],
            add_special_tokens=True,
            max_length=self.max_length,
            # `pad_to_max_length=True` is deprecated; with max_length set this
            # is the supported spelling of the same padding behaviour.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch axis of size 1; drop only that
        # axis so the sequence axis survives whatever its length is.
        input_ids = tokenized['input_ids'].squeeze(0)
        attention_mask = tokenized['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids.long(),
            'attention_mask': attention_mask.long(),
        }



class Model(nn.Module):
    """Sequence-classification backbone with a single output and an attached loss.

    The backbone is loaded from ``CONFIG.MODEL_PATH`` with ``num_labels=1``.
    ``training_mode='MSE'`` attaches a mean-squared-error loss; any other value
    attaches binary cross-entropy on logits.
    """

    def __init__(self, training_mode = 'MSE'):
        super().__init__()

        self.bert = AutoModelForSequenceClassification.from_pretrained(CONFIG.MODEL_PATH, num_labels=1)

        loss_cls = nn.MSELoss if training_mode == 'MSE' else nn.BCEWithLogitsLoss
        self.loss = loss_cls(reduction="mean")

    def forward(self, input_ids, attention_mask, targets = None):
        """Return ``(logits, loss)``; ``loss`` is 0 when no ``targets`` are given."""
        logits = self.bert(input_ids=input_ids,
                           attention_mask=attention_mask).logits
        if targets is None:
            return logits, 0
        # Both sides are flattened to a single column before the loss is applied.
        return logits, self.loss(logits.reshape(-1,1), targets.reshape(-1,1))

def val_fn(model, valid_dataloader, mode = 'MSE'):
    """Run ``model`` over a dataloader without gradients and collect its predictions.

    Args:
        model: Module returning ``(logits, loss)`` for ``input_ids`` / ``attention_mask``.
        valid_dataloader: Yields dict batches with ``input_ids`` and ``attention_mask``.
        mode: ``'MSE'`` returns the raw outputs; any other value applies a sigmoid.

    Returns:
        1-D NumPy array with one prediction per example, in dataloader order.
    """
    model.eval()
    preds = []
    with torch.no_grad():
        for batch in tqdm(valid_dataloader,
                          total=len(valid_dataloader),
                          desc='validing'):
            # Batches are moved to the notebook-level `device`.
            b_input_ids = batch['input_ids'].to(device)
            b_attention_mask = batch['attention_mask'].to(device)
            # No targets are passed, so the returned loss is a placeholder.
            y_preds, _ = model(input_ids=b_input_ids, attention_mask=b_attention_mask)
            y_preds = y_preds.reshape(-1,)
            if mode != 'MSE':
                y_preds = y_preds.sigmoid()
            preds.append(y_preds.to('cpu').numpy())
    return np.concatenate(preds)
# The DataLoader does not shuffle, so predictions stay aligned with `test` rows.
valid_ds = TrainDataset(test)
valid_dl = DataLoader(valid_ds, batch_size=CONFIG.BATCH_SIZE*10)
torch.manual_seed(CONFIG.SEED)

# Each fold adds a quarter of its predictions, i.e. the result is the 4-fold mean.
predictions8 = 0
predictions9 = 0
for fold in range(4):
    model = Model()
    model = model.to(device)
    # map_location lets checkpoints saved on a GPU load when `device` is the CPU.
    model.load_state_dict(torch.load(f'../input/ernie-large-cv/ernie_2.0_large_BCE_fold_{fold}.pt', map_location=device))
    predictions8 += val_fn(model, valid_dl, mode = 'BCE')/4
    model.load_state_dict(torch.load(f'../input/ernie-large-cv/ernie_2.0_large_MSE_fold_{fold}.pt', map_location=device))
    predictions9 += val_fn(model, valid_dl, mode = 'MSE')/4

In [None]:
# ====================================================
# tokenizer
# ====================================================
CONFIG.MODEL_PATH = '../input/funnel-large'
# Load the tokenizer for this model path, save a copy next to the outputs and
# publish it on CONFIG, where TrainDataset reads it.
tokenizer = AutoTokenizer.from_pretrained(CONFIG.MODEL_PATH)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CONFIG.TOKENIZER = tokenizer

# ====================================================
# Define max_len
# ====================================================
# Token counts (special tokens excluded) for every CPC context description,
# then for every training anchor and target phrase.
lengths_dict = {
    'context_text': [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(cpc_texts.values(), total=len(cpc_texts))
    ]
}
for text_col in ['anchor', 'target']:
    lengths_dict[text_col] = [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(train[text_col].fillna("").values, total=len(train))
    ]

# Longest anchor + longest target + longest context, plus the special tokens.
CONFIG.MAX_LENGTH = sum(
    max(lengths_dict[key]) for key in ('anchor', 'target', 'context_text')
) + 4  # CLS + SEP + SEP + SEP
LOGGER.info(f"max_len: {CONFIG.MAX_LENGTH}")

class TrainDataset(Dataset):
    """Tokenizes the ``text`` column of a dataframe into fixed-length model inputs.

    The tokenizer and the padded length are read from ``CONFIG.TOKENIZER`` and
    ``CONFIG.MAX_LENGTH`` when the dataset is constructed. Items carry no
    label, only ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, df):
        """Copy the texts of ``df['text']`` and capture the current CONFIG tokenizer settings."""
        self.texts = df['text'].values.tolist()
        self.tokenizer = CONFIG.TOKENIZER
        self.max_length = CONFIG.MAX_LENGTH

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return ``input_ids`` and ``attention_mask`` as 1-D int64 tensors of length ``max_length``."""
        tokenized = self.tokenizer.encode_plus(
            self.texts[index],
            add_special_tokens=True,
            max_length=self.max_length,
            # `pad_to_max_length=True` is deprecated; with max_length set this
            # is the supported spelling of the same padding behaviour.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch axis of size 1; drop only that
        # axis so the sequence axis survives whatever its length is.
        input_ids = tokenized['input_ids'].squeeze(0)
        attention_mask = tokenized['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids.long(),
            'attention_mask': attention_mask.long(),
        }



class Model(nn.Module):
    """Sequence-classification backbone with a single output and an attached loss.

    The backbone is loaded from ``CONFIG.MODEL_PATH`` with ``num_labels=1``.
    ``training_mode='MSE'`` attaches a mean-squared-error loss; any other value
    attaches binary cross-entropy on logits.
    """

    def __init__(self, training_mode = 'MSE'):
        super().__init__()

        self.bert = AutoModelForSequenceClassification.from_pretrained(CONFIG.MODEL_PATH, num_labels=1)

        loss_cls = nn.MSELoss if training_mode == 'MSE' else nn.BCEWithLogitsLoss
        self.loss = loss_cls(reduction="mean")

    def forward(self, input_ids, attention_mask, targets = None):
        """Return ``(logits, loss)``; ``loss`` is 0 when no ``targets`` are given."""
        logits = self.bert(input_ids=input_ids,
                           attention_mask=attention_mask).logits
        if targets is None:
            return logits, 0
        # Both sides are flattened to a single column before the loss is applied.
        return logits, self.loss(logits.reshape(-1,1), targets.reshape(-1,1))

def val_fn(model, valid_dataloader, mode = 'MSE'):
    """Run ``model`` over a dataloader without gradients and collect its predictions.

    Args:
        model: Module returning ``(logits, loss)`` for ``input_ids`` / ``attention_mask``.
        valid_dataloader: Yields dict batches with ``input_ids`` and ``attention_mask``.
        mode: ``'MSE'`` returns the raw outputs; any other value applies a sigmoid.

    Returns:
        1-D NumPy array with one prediction per example, in dataloader order.
    """
    model.eval()
    preds = []
    with torch.no_grad():
        for batch in tqdm(valid_dataloader,
                          total=len(valid_dataloader),
                          desc='validing'):
            # Batches are moved to the notebook-level `device`.
            b_input_ids = batch['input_ids'].to(device)
            b_attention_mask = batch['attention_mask'].to(device)
            # No targets are passed, so the returned loss is a placeholder.
            y_preds, _ = model(input_ids=b_input_ids, attention_mask=b_attention_mask)
            y_preds = y_preds.reshape(-1,)
            if mode != 'MSE':
                y_preds = y_preds.sigmoid()
            preds.append(y_preds.to('cpu').numpy())
    return np.concatenate(preds)
# The DataLoader does not shuffle, so predictions stay aligned with `test` rows.
valid_ds = TrainDataset(test)
valid_dl = DataLoader(valid_ds, batch_size=CONFIG.BATCH_SIZE*10)
torch.manual_seed(CONFIG.SEED)

# Each fold adds a quarter of its predictions, i.e. the result is the 4-fold mean.
predictions10 = 0
predictions11 = 0
for fold in range(4):
    model = Model()
    model = model.to(device)
    # map_location lets checkpoints saved on a GPU load when `device` is the CPU.
    model.load_state_dict(torch.load(f'../input/funnel-cv/funnel_large_BCE_fold_{fold}.pt', map_location=device))
    predictions10 += val_fn(model, valid_dl, mode = 'BCE')/4
    model.load_state_dict(torch.load(f'../input/funnel-cv/funnel_large_MSE_fold_{fold}.pt', map_location=device))
    predictions11 += val_fn(model, valid_dl, mode = 'MSE')/4

In [None]:
# ====================================================
# tokenizer
# ====================================================
CONFIG.MODEL_PATH = '../input/roberta-transformers-pytorch/roberta-large'
# Load the tokenizer for this model path, save a copy next to the outputs and
# publish it on CONFIG, where TrainDataset reads it.
tokenizer = AutoTokenizer.from_pretrained(CONFIG.MODEL_PATH)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CONFIG.TOKENIZER = tokenizer

# ====================================================
# Define max_len
# ====================================================
# Token counts (special tokens excluded) for every CPC context description,
# then for every training anchor and target phrase.
lengths_dict = {
    'context_text': [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(cpc_texts.values(), total=len(cpc_texts))
    ]
}
for text_col in ['anchor', 'target']:
    lengths_dict[text_col] = [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(train[text_col].fillna("").values, total=len(train))
    ]

# Longest anchor + longest target + longest context, plus the special tokens.
CONFIG.MAX_LENGTH = sum(
    max(lengths_dict[key]) for key in ('anchor', 'target', 'context_text')
) + 4  # CLS + SEP + SEP + SEP
LOGGER.info(f"max_len: {CONFIG.MAX_LENGTH}")

class TrainDataset(Dataset):
    """Tokenizes the ``text`` column of a dataframe into fixed-length model inputs.

    The tokenizer and the padded length are read from ``CONFIG.TOKENIZER`` and
    ``CONFIG.MAX_LENGTH`` when the dataset is constructed. Items carry no
    label, only ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, df):
        """Copy the texts of ``df['text']`` and capture the current CONFIG tokenizer settings."""
        self.texts = df['text'].values.tolist()
        self.tokenizer = CONFIG.TOKENIZER
        self.max_length = CONFIG.MAX_LENGTH

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return ``input_ids`` and ``attention_mask`` as 1-D int64 tensors of length ``max_length``."""
        tokenized = self.tokenizer.encode_plus(
            self.texts[index],
            add_special_tokens=True,
            max_length=self.max_length,
            # `pad_to_max_length=True` is deprecated; with max_length set this
            # is the supported spelling of the same padding behaviour.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch axis of size 1; drop only that
        # axis so the sequence axis survives whatever its length is.
        input_ids = tokenized['input_ids'].squeeze(0)
        attention_mask = tokenized['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids.long(),
            'attention_mask': attention_mask.long(),
        }



class Model(nn.Module):
    """Sequence-classification backbone with a single output and an attached loss.

    The backbone is loaded from ``CONFIG.MODEL_PATH`` with ``num_labels=1``.
    ``training_mode='MSE'`` attaches a mean-squared-error loss; any other value
    attaches binary cross-entropy on logits.
    """

    def __init__(self, training_mode = 'MSE'):
        super().__init__()

        self.bert = AutoModelForSequenceClassification.from_pretrained(CONFIG.MODEL_PATH, num_labels=1)

        loss_cls = nn.MSELoss if training_mode == 'MSE' else nn.BCEWithLogitsLoss
        self.loss = loss_cls(reduction="mean")

    def forward(self, input_ids, attention_mask, targets = None):
        """Return ``(logits, loss)``; ``loss`` is 0 when no ``targets`` are given."""
        logits = self.bert(input_ids=input_ids,
                           attention_mask=attention_mask).logits
        if targets is None:
            return logits, 0
        # Both sides are flattened to a single column before the loss is applied.
        return logits, self.loss(logits.reshape(-1,1), targets.reshape(-1,1))

def val_fn(model, valid_dataloader, mode = 'MSE'):
    """Run ``model`` over ``valid_dataloader`` and return flat predictions.

    Batches are moved to the notebook-level ``device``. With ``mode == 'MSE'``
    the raw model outputs are returned; for any other mode a sigmoid is applied
    first. The result is one 1-D numpy array covering all batches in order.
    """
    model.eval()
    collected = []
    progress = tqdm(enumerate(valid_dataloader),
                    total=len(valid_dataloader),
                    desc='validing')
    for _, batch in progress:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        with torch.no_grad():
            # The model also returns a loss placeholder; it is not needed here.
            outputs, _ = model(input_ids=ids, attention_mask=mask)
            flat = outputs.reshape(-1,)
            if mode != 'MSE':
                flat = flat.sigmoid()
            collected.append(flat.to('cpu').numpy())
    return np.concatenate(collected)
# Fold-averaged inference for the RoBERTa-large BCE checkpoints.
valid_ds = TrainDataset(test)
valid_dl = DataLoader(valid_ds, batch_size=CONFIG.BATCH_SIZE*10)
torch.manual_seed(CONFIG.SEED)

# NOTE(review): the blend cell visible further down has its `predictions12`
# term commented out; if nothing else reads it, this cell only costs runtime.
predictions12 = 0
for fold in range(4):
    model = Model()
    model = model.to(device)
    # map_location='cpu' keeps the checkpoint off the GPU (the model already
    # lives there) and lets the cell run when `device` falls back to CPU.
    fold_state = torch.load(f'../input/roberta-large-cv/roberta_large_BCE_fold_{fold}.pt',
                            map_location='cpu')
    model.load_state_dict(fold_state)
    predictions12 += val_fn(model, valid_dl, mode = 'BCE')/4
    # Release this fold's weights before the next fold is built.
    del model, fold_state
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# ====================================================
# tokenizer
# ====================================================
# Switch the shared CONFIG to the local ELECTRA checkpoint, keep a copy of its
# tokenizer under OUTPUT_DIR and publish it on CONFIG for the dataset class.
CONFIG.MODEL_PATH = '../input/electra-large-discriminator'
tokenizer = AutoTokenizer.from_pretrained(CONFIG.MODEL_PATH)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CONFIG.TOKENIZER = tokenizer

# ====================================================
# Define max_len
# ====================================================
# Token counts (without special tokens) per text source.
lengths_dict = {
    'context_text': [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(cpc_texts.values(), total=len(cpc_texts))
    ],
}

for text_col in ['anchor', 'target']:
    lengths_dict[text_col] = [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(train[text_col].fillna("").values, total=len(train))
    ]

# Longest anchor + longest target + longest context, plus the special tokens.
CONFIG.MAX_LENGTH = sum(
    max(lengths_dict[col]) for col in ('anchor', 'target', 'context_text')
) + 4  # CLS + SEP + SEP + SEP
LOGGER.info(f"max_len: {CONFIG.MAX_LENGTH}")

class TrainDataset(Dataset):
    """Tokenizes the ``text`` column of a dataframe into fixed-length tensors.

    The tokenizer and the padding length are read from the notebook-level
    ``CONFIG`` object at construction time (``CONFIG.TOKENIZER`` and
    ``CONFIG.MAX_LENGTH``). Despite the name, this cell uses the class for
    inference on the test frame; no labels are returned.
    """

    def __init__(self, df):
        """Store the texts of ``df['text']`` and snapshot tokenizer settings."""
        self.texts = df['text'].values.tolist()
        self.tokenizer = CONFIG.TOKENIZER
        self.max_length = CONFIG.MAX_LENGTH

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return ``input_ids`` and ``attention_mask`` (int64) for one text."""
        tokenized = self.tokenizer.encode_plus(
            self.texts[index],
            add_special_tokens=True,
            max_length=self.max_length,
            # `pad_to_max_length=True` is the deprecated spelling of this option.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )
        # Drop only the leading batch axis added by return_tensors='pt'; a bare
        # squeeze() would also remove the sequence axis when it has length 1.
        input_ids = tokenized['input_ids'].squeeze(0)
        attention_mask = tokenized['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids.long(),
            'attention_mask': attention_mask.long(),
        }



class Model(nn.Module):
    """Sequence-classification backbone with a single output and a loss.

    The backbone is loaded from ``CONFIG.MODEL_PATH`` with ``num_labels=1`` and
    stored under the attribute name ``bert`` (so checkpoint keys start with
    ``bert.``). ``training_mode`` only selects the loss: ``'MSE'`` gives
    ``nn.MSELoss``, anything else gives ``nn.BCEWithLogitsLoss``.
    """

    def __init__(self, training_mode = 'MSE'):
        super().__init__()

        self.bert = AutoModelForSequenceClassification.from_pretrained(CONFIG.MODEL_PATH, num_labels=1)

        criterion_cls = nn.MSELoss if training_mode == 'MSE' else nn.BCEWithLogitsLoss
        self.loss = criterion_cls(reduction="mean")

    def forward(self, input_ids, attention_mask, targets = None):
        """Return ``(logits, loss)``; ``loss`` is the int ``0`` without targets."""
        logits = self.bert(input_ids=input_ids,
                           attention_mask=attention_mask).logits
        if targets is None:
            return logits, 0
        # Both sides are flattened to a column so shapes always line up.
        return logits, self.loss(logits.reshape(-1,1), targets.reshape(-1,1))

def val_fn(model, valid_dataloader, mode = 'MSE'):
    """Run ``model`` over ``valid_dataloader`` and return flat predictions.

    Batches are moved to the notebook-level ``device``. With ``mode == 'MSE'``
    the raw model outputs are returned; for any other mode a sigmoid is applied
    first. The result is one 1-D numpy array covering all batches in order.
    """
    model.eval()
    collected = []
    progress = tqdm(enumerate(valid_dataloader),
                    total=len(valid_dataloader),
                    desc='validing')
    for _, batch in progress:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        with torch.no_grad():
            # The model also returns a loss placeholder; it is not needed here.
            outputs, _ = model(input_ids=ids, attention_mask=mask)
            flat = outputs.reshape(-1,)
            if mode != 'MSE':
                flat = flat.sigmoid()
            collected.append(flat.to('cpu').numpy())
    return np.concatenate(collected)
# Fold-averaged inference for the ELECTRA-large MSE checkpoints.
valid_ds = TrainDataset(test)
valid_dl = DataLoader(valid_ds, batch_size=CONFIG.BATCH_SIZE*10)
torch.manual_seed(CONFIG.SEED)

predictions13 = 0
for fold in range(4):
    model = Model()
    model = model.to(device)
    # map_location='cpu' keeps the checkpoint off the GPU (the model already
    # lives there) and lets the cell run when `device` falls back to CPU.
    fold_state = torch.load(f'../input/electra-large-cv/electra_large_MSE_fold_{fold}.pt',
                            map_location='cpu')
    model.load_state_dict(fold_state)
    predictions13 += val_fn(model, valid_dl, mode = 'MSE')/4
    # Release this fold's weights before the next fold is built.
    del model, fold_state
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Weighted blend of the per-model predictions computed in the cells above.
# NOTE(review): the active weights add up to roughly 0.826, not 1 (the terms
# for predictions7 and predictions12 are disabled). That is harmless for a
# correlation metric but matters if `final_predictions_1` is later mixed with
# predictions on a different scale — confirm against the downstream cell.
pred  = 0.07809745*predictions1 
pred += 0.12919135*predictions2
pred += 0.19441621*predictions3 
pred += 0.09985616*predictions4 
pred += 0.01198952*predictions5 
pred += 0.01414211*predictions6 
# pred += 0.00263129*predictions7 
pred += 0.05200046*predictions8 
pred += 0.01081633*predictions9 
pred += 0.08326132*predictions10
pred += 0.04504323*predictions11 
# pred += 0.01514399*predictions12 
pred += 0.10742007*predictions13

# Kept under a separate name so later cells can reuse `pred`.
final_predictions_1 = pred

'''print(pred[:5])

submission['score'] = pred
submission[['id', 'score']].to_csv('submission.csv', index=False)'''

# pred 2 CV:0.8839

In [None]:
# ====================================================
# Directory settings
# ====================================================
import os
import sys
# Rebinds INPUT_DIR (the first cell of the notebook used an absolute /kaggle
# path) to a path relative to the working directory.
INPUT_DIR = '../input/us-patent-phrase-to-phrase-matching/'
OUTPUT_DIR = './'
# exist_ok=True replaces the check-then-create pair: one call, no race, and
# safe to re-run.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
import os
import gc
import math
import time
import random
from sklearn import preprocessing
from sklearn import model_selection
from dataclasses import dataclass
from typing import Optional
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
import scipy.stats as sp

# Swap the preinstalled transformers/tokenizers for the offline wheels shipped
# in the pppm-pip-wheels-dataset input (no network: --no-index).
# NOTE(review): os.system return codes are ignored, so a failed install goes
# unnoticed. Also, the first cell of this notebook already imported both
# packages; Python caches imported modules, so the imports below presumably
# still return the originally loaded versions unless the kernel is restarted —
# confirm with the printed versions.
os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels-dataset transformers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels-dataset tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")

from transformers import AdamW, AutoConfig, AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForTokenClassification
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true


import warnings 
warnings.filterwarnings('ignore')

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    """Return the Pearson correlation coefficient between two sequences.

    Args:
        y_true: 1-D array-like of reference values.
        y_pred: 1-D array-like of predictions, same length as ``y_true``.

    Returns:
        The correlation statistic (first element of ``pearsonr``'s result).
    """
    # Imported here on purpose: this notebook binds the global `sp` to `scipy`
    # in its first cell and to `scipy.stats` in a later one, so the previous
    # `sp.stats.pearsonr` lookup depended on which cell ran last (and, in the
    # second case, on the deprecated `scipy.stats.stats` module).
    from scipy.stats import pearsonr

    return pearsonr(y_true, y_pred)[0]


def get_logger(filename=OUTPUT_DIR+'train'):
    """Return the module logger writing plain messages to stderr and a file.

    Args:
        filename: Path prefix of the log file; ``.log`` is appended. The
            default is computed once, when this cell defines the function.

    Returns:
        The ``logging.Logger`` registered under ``__name__`` at INFO level,
        with exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running the cell
    # (or calling this twice) used to stack handlers and print each message
    # several times. Drop and close the previous ones first.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger

# Notebook-wide logger; uses get_logger's default file name (OUTPUT_DIR + 'train').
LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every generator; also exported as the
            ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The four seeders are independent of each other, so a loop is enough.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed all RNGs once for this section of the notebook.
seed_everything(seed=42)

In [None]:
# ====================================================
# Data Loading
# ====================================================
# Read the test rows to score and the sample submission file, then print
# their shapes as a quick sanity check.
test = pd.read_csv(INPUT_DIR+'test.csv')
submission = pd.read_csv(INPUT_DIR+'sample_submission.csv')
print(f"test.shape: {test.shape}")
print(f"submission.shape: {submission.shape}")

In [None]:
# ====================================================
# CPC Data
# ====================================================
# Lookup used to turn each row's `context` value into `context_text`.
# NOTE: torch.load unpickles the file, so it must come from a trusted source.
# presumably a dict keyed by CPC code — TODO confirm against the dataset.
cpc_texts = torch.load('../input/pppm-debertav3large-baseline/cpc_texts.pth')
# Series.map leaves NaN for any context value missing from the lookup.
test['context_text'] = test['context'].map(cpc_texts)

In [None]:
# Single model input string: anchor, target and context text joined by a
# literal '[SEP]'. A NaN in any of the three columns makes the whole text NaN.
# NOTE(review): the same literal is fed to every tokenizer used below,
# including ones that may not treat '[SEP]' as a special token — presumably
# this matches how each model was trained; confirm.
test['text'] = test['anchor'] + '[SEP]' + test['target'] + '[SEP]'  + test['context_text']

In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenize ``text`` into fixed-length int64 tensors.

    Args:
        cfg: Object exposing ``tokenizer`` (callable) and ``max_len`` (int).
        text: The string to encode.

    Returns:
        The tokenizer's mapping with every value replaced by a
        ``torch.long`` tensor of length ``cfg.max_len``.
    """
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           # Without truncation a text longer than max_len gives
                           # a longer tensor, which breaks batch collation and
                           # any head sized from max_len.
                           truncation=True,
                           return_offsets_mapping=False)
    for key in list(inputs.keys()):
        inputs[key] = torch.tensor(inputs[key], dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    """Dataset over ``df['text']`` that tokenizes each row on access."""

    def __init__(self, cfg, df):
        """Keep the config and the raw text values of ``df``."""
        self.cfg, self.texts = cfg, df['text'].values

    def __len__(self):
        """Number of texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenized tensors for the text at position ``item``."""
        return prepare_input(self.cfg, self.texts[item])

In [None]:
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    """Predict over ``test_loader`` and return sigmoid-squashed outputs.

    The model is switched to eval mode and moved to ``device``; each batch
    (a mapping of tensors) is moved there too. Returns the per-batch outputs
    concatenated into one numpy array.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        for key in list(inputs.keys()):
            inputs[key] = inputs[key].to(device)
        with torch.no_grad():
            logits = model(inputs)
        batch_outputs.append(logits.sigmoid().to('cpu').numpy())
    return np.concatenate(batch_outputs)

In [None]:
# ====================================================
# inference(MSE)
# ====================================================
def inference_fn_mse(test_loader, model, device):
    """Predict over ``test_loader`` and return the raw model outputs.

    Same loop as ``inference_fn`` but without the sigmoid. The model is
    switched to eval mode and moved to ``device``; returns the per-batch
    outputs concatenated into one numpy array.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        for key in list(inputs.keys()):
            inputs[key] = inputs[key].to(device)
        with torch.no_grad():
            raw = model(inputs)
        batch_outputs.append(raw.to('cpu').numpy())
    return np.concatenate(batch_outputs)

In [None]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """Transformer backbone with attention pooling and a linear output head.

    Args:
        cfg: Settings object; ``fc_dropout`` and ``target_size`` are always
            read, ``model`` only when no ``config_path`` is given or when
            ``pretrained`` is true.
        config_path: Optional path to a saved config object. It is restored
            with ``torch.load`` (pickle), so only use trusted files.
        pretrained: Load pretrained backbone weights instead of building the
            backbone from the config alone.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        self.config = (
            torch.load(config_path) if config_path is not None
            else AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        )
        self.model = (
            AutoModel.from_pretrained(cfg.model, config=self.config) if pretrained
            else AutoModel.from_config(self.config)
        )
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        # Scores every token, then normalizes the scores across the sequence.
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        # A Sequential matches none of the branches in _init_weights, so this
        # call leaves the attention layers at their default initialization.
        self._init_weights(self.attention)

    def _init_weights(self, module):
        """Initialize one Linear, Embedding or LayerNorm module in place."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Attention-weighted sum of the backbone's first output over dim 1."""
        hidden = self.model(**inputs)[0]
        # feature = torch.mean(last_hidden_states, 1)
        token_weights = self.attention(hidden)
        return (token_weights * hidden).sum(dim=1)

    def forward(self, inputs):
        """Pooled feature -> dropout -> linear head."""
        return self.fc(self.fc_dropout(self.feature(inputs)))

In [None]:
class TransformerHead(nn.Module):
    """Transformer encoder followed by a per-position linear projection to 1.

    Args:
        in_features: Width of the incoming features (encoder ``d_model``).
        max_length: Stored as ``out_features`` for the caller to size the
            next layer; not otherwise used here.
        num_layers: Number of encoder layers.
        nhead: Attention heads per encoder layer.
        num_targets: Accepted for interface compatibility; unused.
    """

    def __init__(self, in_features, max_length, num_layers=1, nhead=8, num_targets=1):
        super().__init__()

        # NOTE(review): the encoder layer is built with its default
        # batch_first setting — confirm the expected axis order of `x`.
        encoder_layer = nn.TransformerEncoderLayer(d_model=in_features, nhead=nhead)
        self.transformer = nn.TransformerEncoder(encoder_layer=encoder_layer,
                                                 num_layers=num_layers)
        self.row_fc = nn.Linear(in_features, 1)
        self.out_features = max_length

    def forward(self, x):
        """Encode ``x`` and collapse the last axis to one value per position."""
        encoded = self.transformer(x)
        return self.row_fc(encoded).squeeze(-1)


# ====================================================
# Model
# ====================================================
class Th(nn.Module):
    """Backbone + ``TransformerHead`` + linear output over sequence positions.

    Args:
        cfg: Settings object; ``model``, ``max_len``, ``fc_dropout`` and
            ``target_size`` are read.
        config_path: Optional path to a saved config object. It is restored
            with ``torch.load`` (pickle), so only use trusted files.
        pretrained: Load pretrained backbone weights instead of building the
            backbone from the config alone.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        self.config = (
            torch.load(config_path) if config_path is not None
            else AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        )
        self.model = (
            AutoModel.from_pretrained(cfg.model, config=self.config) if pretrained
            else AutoModel.from_config(self.config)
        )

        # Only `classifier.in_features` is read from this second model, but it
        # stays registered as a submodule and therefore is part of state_dict.
        self.feature_extractor = AutoModelForTokenClassification.from_pretrained(cfg.model)
        head_width = self.feature_extractor.classifier.in_features
        self.attention = TransformerHead(in_features=head_width, max_length=cfg.max_len, num_layers=1, nhead=8, num_targets=1)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        # The head emits one value per position, so the final layer is sized
        # from the head's out_features (set from cfg.max_len).
        self.fc = nn.Linear(self.attention.out_features, self.cfg.target_size)
        # The second call is a no-op: TransformerHead matches no branch below.
        for submodule in (self.fc, self.attention):
            self._init_weights(submodule)

    def _init_weights(self, module):
        """Initialize one Linear, Embedding or LayerNorm module in place."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and pass its first output through the head."""
        hidden = self.model(**inputs)[0]
        # feature = torch.mean(last_hidden_states, 1)
        return self.attention(hidden)

    def forward(self, inputs):
        """Head output -> dropout -> linear layer."""
        # print(feature.shape)
        return self.fc(self.fc_dropout(self.feature(inputs)))

In [None]:
# ====================================================
# Model(MSE)
# ====================================================
class MSEModel(nn.Module):
    """Transformer backbone with masked mean pooling and a linear head.

    Args:
        cfg: Settings object; ``fc_dropout`` and ``target_size`` are always
            read, ``model`` only when no ``config_path`` is given or when
            ``pretrained`` is true.
        config_path: Optional path to a saved config object. It is restored
            with ``torch.load`` (pickle), so only use trusted files.
        pretrained: Load pretrained backbone weights instead of building the
            backbone from the config alone.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        # Not used by feature()/forward(), but kept so the module's parameter
        # names stay the same as before.
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        # A Sequential matches none of the branches in _init_weights (no-op).
        self._init_weights(self.attention)

    def _init_weights(self, module):
        """Initialize one Linear, Embedding or LayerNorm module in place."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Mean of the backbone's first output over non-padding positions."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        # Take the mask and give it a trailing axis so it broadcasts.
        attention_mask = inputs['attention_mask'].unsqueeze(2)
        # Zero out the padding positions.
        feature = last_hidden_states * attention_mask
        # Average over the non-padding positions only. The count is clamped to
        # at least 1 so an all-padding row yields zeros instead of NaN.
        token_count = torch.sum(attention_mask, dim=1).clamp(min=1)
        feature = torch.sum(feature, 1) / token_count
        return feature

    def forward(self, inputs):
        """Pooled feature -> dropout -> linear head."""
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [None]:
# ====================================================
# Model
# ====================================================
class ELECTRAModel(nn.Module):
    """Backbone with masked mean pooling, LayerNorm and a linear head.

    ``forward`` does not use ``fc_dropout``, ``attention``, ``linear`` or the
    ``feature`` method; they are still created so the module's parameter names
    stay as they are.

    Args:
        cfg: Settings object; ``fc_dropout`` and ``target_size`` are always
            read, ``model`` only when no ``config_path`` is given or when
            ``pretrained`` is true.
        config_path: Optional path to a saved config object. It is restored
            with ``torch.load`` (pickle), so only use trusted files.
        pretrained: Load pretrained backbone weights instead of building the
            backbone from the config alone.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        self.config = (
            torch.load(config_path) if config_path is not None
            else AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        )
        self.model = (
            AutoModel.from_pretrained(cfg.model, config=self.config) if pretrained
            else AutoModel.from_config(self.config)
        )
        hidden_size = self.config.hidden_size
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        self.layer_norm1 = nn.LayerNorm(hidden_size)
        # A Sequential matches none of the branches in _init_weights (no-op).
        self._init_weights(self.attention)
        self.linear = nn.Linear(hidden_size, 1)

    def _init_weights(self, module):
        """Initialize one Linear, Embedding or LayerNorm module in place."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Attention-weighted sum of the backbone's first output over dim 1."""
        hidden = self.model(**inputs)[0]
        # feature = torch.mean(last_hidden_states, 1)
        token_weights = self.attention(hidden)
        return (token_weights * hidden).sum(dim=1)

    def forward(self, inputs):
        """Masked mean of the hidden states -> LayerNorm -> linear head."""
        hidden = self.model(**inputs)[0]
        mask = inputs["attention_mask"].unsqueeze(-1).expand(hidden.size()).float()
        # Clamp the token count so an all-padding row cannot divide by zero.
        pooled = (hidden * mask).sum(1) / mask.sum(1).clamp(min=1e-9)

        return self.fc(self.layer_norm1(pooled))

In [None]:
# ====================================================
# Model
# ====================================================
class LUKEModel(nn.Module):
    """LUKE backbone with plain mean pooling and a linear head.

    Args:
        cfg: Settings object; ``fc_dropout`` and ``target_size`` are always
            read, ``model`` only when no ``config_path`` is given or when
            ``pretrained`` is true.
        config_path: Optional path to a saved config object. It is restored
            with ``torch.load`` (pickle), so only use trusted files.
        pretrained: Load pretrained backbone weights instead of building the
            backbone from the config alone.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        # The import cells visible in this notebook only pull in the Auto*
        # classes, so bring the LUKE classes into scope here; otherwise
        # instantiating this class raises NameError.
        from transformers import LukeConfig, LukeModel
        self.cfg = cfg
        if config_path is None:
            self.config = LukeConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = LukeModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = LukeModel(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)

    def feature(self, inputs):
        """Unmasked mean of the backbone's first output over dim 1."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        # Padding positions are included in this average.
        feature = torch.mean(last_hidden_states, 1)
        return feature

    def forward(self, inputs):
        """Pooled feature -> dropout -> linear head."""
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [None]:
# ====================================================
# Model
# ====================================================
class BARTModel(nn.Module):
    """BART backbone with plain mean pooling and a linear head.

    Args:
        cfg: Settings object; ``fc_dropout`` and ``target_size`` are always
            read, ``model`` only when no ``config_path`` is given or when
            ``pretrained`` is true.
        config_path: Optional path to a saved config object. It is restored
            with ``torch.load`` (pickle), so only use trusted files.
        pretrained: Load pretrained backbone weights instead of building the
            backbone from the config alone.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        # The import cells visible in this notebook only pull in the Auto*
        # classes, so bring the BART classes into scope here; otherwise
        # instantiating this class raises NameError.
        from transformers import BartConfig, BartModel
        self.cfg = cfg
        if config_path is None:
            self.config = BartConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = BartModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = BartModel(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)

    def feature(self, inputs):
        """Unmasked mean of the backbone's first output over dim 1."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        # Padding positions are included in this average.
        feature = torch.mean(last_hidden_states, 1)
        return feature

    def forward(self, inputs):
        """Pooled feature -> dropout -> linear head."""
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

# 1.deberta-v3-small-4folds transformer head

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the deberta-v3-small transformer-head folds.

    Every model section of this notebook redefines ``CFG``; the definition
    that ran last is the one the following cells see.
    """
    num_workers=4  # DataLoader worker processes
    # Saved config object; the model class restores it with torch.load.
    config_path='../input/uspppmdebertasmalltransformer-head08440/USPPPM-deberta-small-transformer_head-0.8440/outputconfig.pth'
    # Backbone directory; Th passes it to AutoModelForTokenClassification.
    model="../input/deberta-v3-small/deberta-v3-small"
    # NOTE(review): this path lives in the deberta-large transformer-head
    # dataset — presumably the small model shares that tokenizer; confirm.
    tokenizer_path='../input/uspppmdebertaltransformer-head08624/outputtokenizer'
    batch_size=32
    fc_dropout=0.2  # dropout probability in front of the final linear layer
    target_size=1  # output width of the final linear layer
    max_len=133  # padding length; Th also sizes its final layer from it
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]  # folds whose checkpoints are loaded and averaged

In [None]:
# ====================================================
# tokenizer
# ====================================================
# Attach the tokenizer to CFG; prepare_input reads it as cfg.tokenizer.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [None]:
# Fold-averaged predictions for model 1 (deberta-v3-small, transformer head).
test_dataset = TestDataset(CFG, test)
# shuffle=False / drop_last=False keep predictions aligned with `test` rows.
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = Th(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location="cpu": the model is still on the CPU here (inference_fn
    # moves it), so restore the checkpoint there too. This also keeps the cell
    # working when `device` falls back to CPU.
    state = torch.load(f"../input/uspppmdebertasmalltransformer-head08440/USPPPM-deberta-small-transformer_head-0.8440/output-tmp-Allen-Pycharm-input-deberta-v3-small_fold{fold}_best.pth",
                       map_location="cpu")
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold before the next one is built.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()
pred1 = np.mean(predictions, axis=0)

# 3.bert for patents 4folds

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the bert-for-patents folds.

    No ``model`` attribute is defined: the cell below builds CustomModel with
    a config_path and pretrained=False, a path that never reads ``cfg.model``.
    """
    num_workers=4  # DataLoader worker processes
    # Saved config object; the model class restores it with torch.load.
    config_path='../input/uspppmbertforpatents/bert-for-patents/outputconfig.pth'
    tokenizer_path='../input/uspppmbertforpatents/bert-for-patents/outputtokenizer/'
    #model="microsoft/deberta-v3-large"
    batch_size=32
    fc_dropout=0.2  # dropout probability in front of the final linear layer
    target_size=1  # output width of the final linear layer
    max_len=117  # padding length used by prepare_input
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]  # folds whose checkpoints are loaded and averaged

In [None]:
# Attach the tokenizer to CFG; prepare_input reads it as cfg.tokenizer.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [None]:
# Fold-averaged predictions for model 3 (bert-for-patents).
test_dataset = TestDataset(CFG, test)
# shuffle=False / drop_last=False keep predictions aligned with `test` rows.
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location="cpu": the model is still on the CPU here (inference_fn
    # moves it), so restore the checkpoint there too. This also keeps the cell
    # working when `device` falls back to CPU.
    state = torch.load(f"../input/uspppmbertforpatents/bert-for-patents/output-tmp-pycharm_project_926-input-bert-for-patents_fold{fold}_best.pth",
                       map_location="cpu")
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold before the next one is built.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()
pred3 = np.mean(predictions, axis=0)

# 4.Deberta large

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the deberta-v3-large folds.

    No ``model`` attribute is defined: the cell below builds CustomModel with
    a config_path and pretrained=False, a path that never reads ``cfg.model``.
    """
    num_workers=4  # DataLoader worker processes
    # Saved config object; the model class restores it with torch.load.
    config_path='../input/uspppmdebertav3large4foldscv08612/uspppm-deberta-large-4folds-0.8612/outputconfig.pth'
    #model="microsoft/deberta-v3-large"
    # Tokenizer directory taken from the transformer-head dataset.
    tokenizer_path='../input/uspppmdebertaltransformer-head08624/outputtokenizer'
    batch_size=32
    fc_dropout=0.2  # dropout probability in front of the final linear layer
    target_size=1  # output width of the final linear layer
    max_len=133  # padding length used by prepare_input
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]  # folds whose checkpoints are loaded and averaged

In [None]:
# Attach the tokenizer to CFG; prepare_input reads it as cfg.tokenizer.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [None]:
# Fold-averaged predictions for model 4 (deberta-v3-large).
test_dataset = TestDataset(CFG, test)
# shuffle=False / drop_last=False keep predictions aligned with `test` rows.
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu': the model is still on the CPU here (inference_fn
    # moves it), so restore the checkpoint there too. This also keeps the cell
    # working when `device` falls back to CPU.
    state = torch.load(f'../input/uspppmdebertav3large4foldscv08612/uspppm-deberta-large-4folds-0.8612/output-tmp-Allen-Pycharm-input-deberta-v3-large_fold{fold}_best.pth',
                       map_location='cpu')
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold before the next one is built.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()
pred4 = np.mean(predictions, axis=0)

# 6. Deberta-large-transformer_head

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the deberta-v3-large transformer-head folds."""
    num_workers=4  # DataLoader worker processes
    # Saved config object; the model class restores it with torch.load.
    config_path='../input/uspppmdebertaltransformer-head08624/outputconfig.pth'
    # Backbone directory; Th passes it to AutoModelForTokenClassification.
    model='../input/deberta-v3-large/deberta-v3-large'
    tokenizer_path='../input/uspppmdebertaltransformer-head08624/outputtokenizer'
    batch_size=32
    fc_dropout=0.2  # dropout probability in front of the final linear layer
    target_size=1  # output width of the final linear layer
    max_len=133  # padding length; Th also sizes its final layer from it
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]  # folds whose checkpoints are loaded and averaged

In [None]:
# Attach the tokenizer to CFG; prepare_input reads it as cfg.tokenizer.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [None]:
# Fold-averaged predictions for model 6 (deberta-v3-large, transformer head).
test_dataset = TestDataset(CFG, test)
# shuffle=False / drop_last=False keep predictions aligned with `test` rows.
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = Th(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu': the model is still on the CPU here (inference_fn
    # moves it), so restore the checkpoint there too. This also keeps the cell
    # working when `device` falls back to CPU.
    state = torch.load(f'../input/uspppmdebertaltransformer-head08624/USPPPM-deberta-large-transformer_head-0.8624/output-tmp-Allen-Pycharm-input-deberta-v3-large_fold{fold}_best.pth',
                       map_location='cpu')
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold before the next one is built.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()
pred6 = np.mean(predictions, axis=0)

# 7.deberta-base-transformer_head-0.8503

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the deberta-v3-base transformer-head folds."""
    num_workers=4  # DataLoader worker processes
    # Saved config object; the model class restores it with torch.load.
    config_path='../input/uspppmdebertabasetransformer-head08503/USPPPM-deberta-base-transformer_head-0.8503/outputconfig.pth'
    # Backbone directory; Th passes it to AutoModelForTokenClassification.
    model='../input/deberta-v3-base/deberta-v3-base'
    # NOTE(review): this path lives in the deberta-large transformer-head
    # dataset — presumably the base model shares that tokenizer; confirm.
    tokenizer_path='../input/uspppmdebertaltransformer-head08624/outputtokenizer'
    batch_size=32
    fc_dropout=0.2  # dropout probability in front of the final linear layer
    target_size=1  # output width of the final linear layer
    max_len=133  # padding length; Th also sizes its final layer from it
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]  # folds whose checkpoints are loaded and averaged

In [None]:
# Attach the tokenizer to CFG; prepare_input reads it as cfg.tokenizer.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [None]:
# Fold-averaged predictions for model 7 (deberta-v3-base, transformer head).
test_dataset = TestDataset(CFG, test)
# shuffle=False / drop_last=False keep predictions aligned with `test` rows.
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = Th(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu': the model is still on the CPU here (inference_fn
    # moves it), so restore the checkpoint there too. This also keeps the cell
    # working when `device` falls back to CPU.
    state = torch.load(f'../input/uspppmdebertabasetransformer-head08503/USPPPM-deberta-base-transformer_head-0.8503/output-tmp-Allen-Pycharm-input-deberta-v3-base_fold{fold}_best.pth',
                       map_location='cpu')
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold before the next one is built.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()
pred7 = np.mean(predictions, axis=0)

# 8.bert for patents Transformer_head-0.8493

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the bert-for-patents transformer-head folds."""
    num_workers=4  # DataLoader worker processes
    # Same value as config_path; not read by the inference cell that follows.
    path="../input/uspppmberttransformer-head08493/USPPPM-bert-for-patents-transformer_head--0.8493/outputconfig.pth"
    # Saved config object; the model class restores it with torch.load.
    config_path='../input/uspppmberttransformer-head08493/USPPPM-bert-for-patents-transformer_head--0.8493/outputconfig.pth'
    # Backbone directory; Th passes it to AutoModelForTokenClassification.
    model='../input/bert-for-patents'
    tokenizer_path='../input/uspppmberttransformer-head08493/USPPPM-bert-for-patents-transformer_head--0.8493/outputtokenizer'
    batch_size=32
    fc_dropout=0.2  # dropout probability in front of the final linear layer
    target_size=1  # output width of the final linear layer
    max_len=117  # padding length; Th also sizes its final layer from it
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]  # folds whose checkpoints are loaded and averaged

In [None]:
# Attach the tokenizer to CFG; prepare_input reads it as cfg.tokenizer.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [None]:
# Fold-averaged predictions for model 8 (bert-for-patents, transformer head).
test_dataset = TestDataset(CFG, test)
# shuffle=False / drop_last=False keep predictions aligned with `test` rows.
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = Th(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu': the model is still on the CPU here (inference_fn
    # moves it), so restore the checkpoint there too. This also keeps the cell
    # working when `device` falls back to CPU.
    state = torch.load(f'../input/uspppmberttransformer-head08493/USPPPM-bert-for-patents-transformer_head--0.8493/output-tmp-Allen-Pycharm-input-bert-for-patents_fold{fold}_best.pth',
                       map_location='cpu')
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold before the next one is built.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()
pred8 = np.mean(predictions, axis=0)

# 10.Roberta large

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the RoBERTa-large checkpoints (4 folds)."""

    # Saved artefacts.
    config_path = '../input/uspppmrobertalarge08406/roberta/config.pth'
    tokenizer_path = '../input/uspppmrobertalarge08406/roberta/tokenizer'
    # model = "microsoft/deberta-v3-large"   (disabled: CFG has no `model` attribute here)

    # DataLoader settings read by the inference cell.
    num_workers = 4
    batch_size = 32

    # Not read in the inference cell itself; presumably consumed by TestDataset /
    # the model class, which both receive CFG -- verify there.
    max_len = 175
    fc_dropout = 0.2
    target_size = 1
    seed = 42

    # One checkpoint is loaded per entry of trn_fold.
    n_fold = 4
    trn_fold = list(range(n_fold))

In [None]:
# Load the tokenizer from CFG.tokenizer_path and attach it to CFG; CFG is what gets
# passed to TestDataset in the inference cell.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [None]:
# Test loader for the RoBERTa-large model. shuffle=False and drop_last=False keep the
# dataset order and every row.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location: deserialize onto the CPU so loading does not depend on the device
    # the checkpoint was saved from. Without it a CUDA-saved checkpoint cannot be
    # loaded when CUDA is unavailable, even though `device` falls back to the CPU.
    state = torch.load(f'../input/uspppmrobertalarge08406/roberta/-tmp-Allen-input-roberta-large_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's model and checkpoint before the next one is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()
# Fold-averaged prediction for this model; used by the ensemble cell.
pred10 = np.mean(predictions, axis=0)

# 11.mnli-deberta-large

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the mnli-deberta-large checkpoints (4 folds)."""

    # Saved artefacts.
    config_path = '../input/uspppmmnlidebertalarge08504/USPPPM-mnli-deberta-large/config.pth'
    tokenizer_path = '../input/uspppmmnlidebertalarge08504/USPPPM-mnli-deberta-large/tokenizer'
    # model = "microsoft/deberta-v3-large"   (disabled: CFG has no `model` attribute here)

    # DataLoader settings read by the inference cell.
    num_workers = 4
    batch_size = 32

    # Not read in the inference cell itself; presumably consumed by TestDataset /
    # the model class, which both receive CFG -- verify there.
    max_len = 175
    fc_dropout = 0.2
    target_size = 1
    seed = 42

    # One checkpoint is loaded per entry of trn_fold.
    n_fold = 4
    trn_fold = list(range(n_fold))

In [None]:
# Load the tokenizer from CFG.tokenizer_path and attach it to CFG; CFG is what gets
# passed to TestDataset in the inference cell.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [None]:
# Test loader for the mnli-deberta-large model. shuffle=False and drop_last=False keep
# the dataset order and every row.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location: deserialize onto the CPU so loading does not depend on the device
    # the checkpoint was saved from. Without it a CUDA-saved checkpoint cannot be
    # loaded when CUDA is unavailable, even though `device` falls back to the CPU.
    state = torch.load(f'../input/uspppmmnlidebertalarge08504/USPPPM-mnli-deberta-large/-root-autodl-tmp-input-deberta-large-mnli_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's model and checkpoint before the next one is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()
# Fold-averaged prediction for this model; used by the ensemble cell.
pred11 = np.mean(predictions, axis=0)

# 12.roberta-base

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the roberta-base checkpoints (4 folds)."""

    # Saved artefacts.
    config_path = '../input/uspppmrobertabase08063/USPPPM-roberta-base/config.pth'
    tokenizer_path = '../input/uspppmrobertabase08063/USPPPM-roberta-base/tokenizer'
    # model = "microsoft/deberta-v3-large"   (disabled: CFG has no `model` attribute here)

    # DataLoader settings read by the inference cell.
    num_workers = 4
    batch_size = 32

    # Not read in the inference cell itself; presumably consumed by TestDataset /
    # the model class, which both receive CFG -- verify there.
    max_len = 175
    fc_dropout = 0.2
    target_size = 1
    seed = 42

    # One checkpoint is loaded per entry of trn_fold.
    n_fold = 4
    trn_fold = list(range(n_fold))

In [None]:
# Load the tokenizer from CFG.tokenizer_path and attach it to CFG; CFG is what gets
# passed to TestDataset in the inference cell.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [None]:
# Test loader for the roberta-base model. shuffle=False and drop_last=False keep the
# dataset order and every row.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location: deserialize onto the CPU so loading does not depend on the device
    # the checkpoint was saved from. Without it a CUDA-saved checkpoint cannot be
    # loaded when CUDA is unavailable, even though `device` falls back to the CPU.
    state = torch.load(f'../input/uspppmrobertabase08063/USPPPM-roberta-base/-root-autodl-tmp-input-roberta-base_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's model and checkpoint before the next one is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()
# Fold-averaged prediction for this model; used by the ensemble cell.
pred12 = np.mean(predictions, axis=0)

# 13.deberta-mnli-th

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the deberta-large-mnli transformer-head checkpoints (4 folds)."""

    # Saved artefacts and backbone location.
    config_path = '../input/uspppmmnlith08477/USPPPM-mnli-th-0.8477/config.pth'
    tokenizer_path = '../input/uspppmmnlith08477/USPPPM-mnli-th-0.8477/tokenizer'
    model = "../input/microsoft-deberta-large-mnli"

    # DataLoader settings read by the inference cell.
    num_workers = 4
    batch_size = 32

    # Not read in the inference cell itself; presumably consumed by TestDataset /
    # the model class, which both receive CFG -- verify there.
    max_len = 175
    fc_dropout = 0.2
    target_size = 1
    seed = 42

    # One checkpoint is loaded per entry of trn_fold.
    n_fold = 4
    trn_fold = list(range(n_fold))

In [None]:
# Load the tokenizer from CFG.tokenizer_path and attach it to CFG; CFG is what gets
# passed to TestDataset in the inference cell.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [None]:
# Test loader for the deberta-large-mnli transformer-head model. shuffle=False and
# drop_last=False keep the dataset order and every row.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = Th(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location: deserialize onto the CPU so loading does not depend on the device
    # the checkpoint was saved from. Without it a CUDA-saved checkpoint cannot be
    # loaded when CUDA is unavailable, even though `device` falls back to the CPU.
    state = torch.load(f'../input/uspppmmnlith08477/USPPPM-mnli-th-0.8477/-root-autodl-tmp-input-deberta-large-mnli_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's model and checkpoint before the next one is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()
# Fold-averaged prediction for this model; used by the ensemble cell.
pred13 = np.mean(predictions, axis=0)

# 16. MSE Deberta Large

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the MSE deberta-large checkpoints (4 folds)."""

    # Saved artefacts.
    config_path = '../input/uspppmmsedebertav3large/MSE-deberta-large-0.8457/config.pth'
    tokenizer_path = '../input/uspppmmsedebertav3large/MSE-deberta-large-0.8457/tokenizer'
    # model = "../input/microsoft-deberta-large-mnli"   (disabled: CFG has no `model` attribute here)

    # DataLoader settings read by the inference cell.
    num_workers = 4
    batch_size = 32

    # Not read in the inference cell itself; presumably consumed by TestDataset /
    # the model class, which both receive CFG -- verify there.
    max_len = 133
    fc_dropout = 0.2
    target_size = 1
    seed = 42

    # One checkpoint is loaded per entry of trn_fold.
    n_fold = 4
    trn_fold = list(range(n_fold))

In [None]:
# Load the tokenizer from CFG.tokenizer_path and attach it to CFG; CFG is what gets
# passed to TestDataset in the inference cell.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [None]:
# Test loader for the MSE deberta-large model. shuffle=False and drop_last=False keep
# the dataset order and every row.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = MSEModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location: deserialize onto the CPU so loading does not depend on the device
    # the checkpoint was saved from. Without it a CUDA-saved checkpoint cannot be
    # loaded when CUDA is unavailable, even though `device` falls back to the CPU.
    state = torch.load(f'../input/uspppmmsedebertav3large/MSE-deberta-large-0.8457/-tmp-Allen-input-deberta-v3-large_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    # This section uses the MSE-specific inference routine, not inference_fn.
    prediction = inference_fn_mse(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's model and checkpoint before the next one is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()
# Fold-averaged prediction for this model; used by the ensemble cell.
pred16 = np.mean(predictions, axis=0)

# 18.patentSBERTa

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the patentSBERTa checkpoints (4 folds)."""

    # Saved artefacts.
    # NOTE(review): tokenizer_path sits one directory above config_path (no
    # 'patentSBERTa/' component) -- confirm this matches the dataset layout.
    config_path = '../input/uspppmpatentsberta08313/patentSBERTa/config.pth'
    tokenizer_path = '../input/uspppmpatentsberta08313/tokenizer'
    # model = "../input/microsoft-deberta-large-mnli"   (disabled: CFG has no `model` attribute here)

    # DataLoader settings read by the inference cell.
    num_workers = 4
    batch_size = 32

    # Not read in the inference cell itself; presumably consumed by TestDataset /
    # the model class, which both receive CFG -- verify there.
    max_len = 125
    fc_dropout = 0.2
    target_size = 1
    seed = 42

    # One checkpoint is loaded per entry of trn_fold.
    n_fold = 4
    trn_fold = list(range(n_fold))

In [None]:
# Load the tokenizer from CFG.tokenizer_path and attach it to CFG; CFG is what gets
# passed to TestDataset in the inference cell.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [None]:
# Test loader for the patentSBERTa model. shuffle=False and drop_last=False keep the
# dataset order and every row.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location: deserialize onto the CPU so loading does not depend on the device
    # the checkpoint was saved from. Without it a CUDA-saved checkpoint cannot be
    # loaded when CUDA is unavailable, even though `device` falls back to the CPU.
    state = torch.load(f'../input/uspppmpatentsberta08313/patentSBERTa/-root-autodl-tmp-input-PatentSBERTa_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's model and checkpoint before the next one is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()
# Fold-averaged prediction for this model; used by the ensemble cell.
pred18 = np.mean(predictions, axis=0)

# 19.Electra

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the ELECTRA-large checkpoints (4 folds)."""

    # Saved artefacts.
    config_path = '../input/uspppmelectralarge/electra-large-0.8452/config.pth'
    tokenizer_path = '../input/uspppmelectralarge/electra-large-0.8452/tokenizer'
    # model = "../input/microsoft-deberta-large-mnli"   (disabled: CFG has no `model` attribute here)

    # DataLoader settings read by the inference cell.
    num_workers = 4
    batch_size = 32

    # Not read in the inference cell itself; presumably consumed by TestDataset /
    # the model class, which both receive CFG -- verify there.
    max_len = 125
    fc_dropout = 0.2
    target_size = 1
    seed = 42

    # One checkpoint is loaded per entry of trn_fold.
    n_fold = 4
    trn_fold = list(range(n_fold))

In [None]:
# Load the tokenizer from CFG.tokenizer_path and attach it to CFG; CFG is what gets
# passed to TestDataset in the inference cell.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [None]:
# Test loader for the ELECTRA-large folds: dataset order is kept and no rows are dropped.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

predictions = []
for fold in CFG.trn_fold:
    model = ELECTRAModel(CFG, config_path=CFG.config_path, pretrained=False)
    # The checkpoint is deserialized onto the CPU before its weights are copied in.
    state = torch.load(
        f'../input/uspppmelectralarge/electra-large-0.8452/-root-autodl-tmp-input-electra-large_fold{fold}_best.pth',
        map_location=torch.device('cpu'),
    )
    model.load_state_dict(state['model'])
    predictions.append(inference_fn(test_loader, model, device))
    # Release this fold's model and checkpoint before the next one is loaded.
    del model, state
    gc.collect()
    torch.cuda.empty_cache()

# Fold-averaged prediction for this model; used by the ensemble cell.
pred19 = np.mean(predictions, axis=0)

# 20. ernie

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the ernie-large checkpoints (4 folds)."""

    # Saved artefacts.
    config_path = '../input/uspppmernielarge/ernie-large-ml=125/config.pth'
    tokenizer_path = '../input/uspppmernielarge/ernie-large-ml=125/tokenizer'
    # model = "../input/microsoft-deberta-large-mnli"   (disabled: CFG has no `model` attribute here)

    # DataLoader settings read by the inference cell.
    num_workers = 4
    batch_size = 32

    # Not read in the inference cell itself; presumably consumed by TestDataset /
    # the model class, which both receive CFG -- verify there.
    max_len = 125
    fc_dropout = 0.2
    target_size = 1
    seed = 42

    # One checkpoint is loaded per entry of trn_fold.
    n_fold = 4
    trn_fold = list(range(n_fold))

In [None]:
# Load the tokenizer from CFG.tokenizer_path and attach it to CFG; CFG is what gets
# passed to TestDataset in the inference cell.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [None]:
# Test loader for the ernie-large model. shuffle=False and drop_last=False keep the
# dataset order and every row.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    # NOTE(review): the ernie checkpoints are loaded into ELECTRAModel -- confirm that
    # class is backbone-agnostic (built from config_path) rather than ELECTRA-specific.
    model = ELECTRAModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location: deserialize onto the CPU so loading does not depend on the device
    # the checkpoint was saved from. Without it a CUDA-saved checkpoint cannot be
    # loaded when CUDA is unavailable, even though `device` falls back to the CPU.
    state = torch.load(f'../input/uspppmernielarge/ernie-large-ml=125/-root-autodl-tmp-input-ernie-large_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's model and checkpoint before the next one is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()
# Fold-averaged prediction for this model; used by the ensemble cell.
pred20 = np.mean(predictions, axis=0)

# 21.funnel

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the funnel checkpoints (4 folds)."""

    # Saved artefacts.
    config_path = '../input/uspppmfunnel/funnel-ml=125/config.pth'
    tokenizer_path = '../input/uspppmfunnel/funnel-ml=125/tokenizer'
    # model = "../input/microsoft-deberta-large-mnli"   (disabled: CFG has no `model` attribute here)

    # DataLoader settings read by the inference cell.
    num_workers = 4
    batch_size = 32

    # Not read in the inference cell itself; presumably consumed by TestDataset /
    # the model class, which both receive CFG -- verify there.
    max_len = 125
    fc_dropout = 0.2
    target_size = 1
    seed = 42

    # One checkpoint is loaded per entry of trn_fold.
    n_fold = 4
    trn_fold = list(range(n_fold))

In [None]:
# Load the tokenizer from CFG.tokenizer_path and attach it to CFG; CFG is what gets
# passed to TestDataset in the inference cell.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [None]:
# Test loader for the funnel model. shuffle=False and drop_last=False keep the dataset
# order and every row.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    # NOTE(review): the funnel checkpoints are loaded into ELECTRAModel -- confirm that
    # class is backbone-agnostic (built from config_path) rather than ELECTRA-specific.
    model = ELECTRAModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location: deserialize onto the CPU so loading does not depend on the device
    # the checkpoint was saved from. Without it a CUDA-saved checkpoint cannot be
    # loaded when CUDA is unavailable, even though `device` falls back to the CPU.
    state = torch.load(f'../input/uspppmfunnel/funnel-ml=125/-root-autodl-tmp-input-funnel_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's model and checkpoint before the next one is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()
# Fold-averaged prediction for this model; used by the ensemble cell.
pred21 = np.mean(predictions, axis=0)

# 22.luke

In [None]:
from transformers import LukeTokenizer, LukeModel, LukeConfig

# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the luke-large checkpoints (4 folds)."""

    # Saved artefacts.
    config_path = '../input/uspppmluke/luke-large-ml=175/config.pth'
    tokenizer_path = '../input/uspppmluke/luke-large-ml=175/tokenizer'
    # model = "../input/microsoft-deberta-large-mnli"   (disabled: CFG has no `model` attribute here)

    # DataLoader settings read by the inference cell.
    num_workers = 4
    batch_size = 32

    # Not read in the inference cell itself; presumably consumed by TestDataset /
    # the model class, which both receive CFG -- verify there.
    max_len = 175
    fc_dropout = 0.2
    target_size = 1
    seed = 42

    # One checkpoint is loaded per entry of trn_fold.
    n_fold = 4
    trn_fold = list(range(n_fold))

In [None]:
# Load the tokenizer from CFG.tokenizer_path and attach it to CFG. This section uses
# the model-specific LukeTokenizer class instead of AutoTokenizer.
CFG.tokenizer = LukeTokenizer.from_pretrained(CFG.tokenizer_path)

In [None]:
# Test loader for the luke-large model. shuffle=False and drop_last=False keep the
# dataset order and every row.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = LUKEModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location: deserialize onto the CPU so loading does not depend on the device
    # the checkpoint was saved from. Without it a CUDA-saved checkpoint cannot be
    # loaded when CUDA is unavailable, even though `device` falls back to the CPU.
    state = torch.load(f'../input/uspppmluke/luke-large-ml=175/studio-ousia-luke-large_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's model and checkpoint before the next one is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()
# Fold-averaged prediction for this model; used by the ensemble cell.
pred22 = np.mean(predictions, axis=0)

# 23.bart large

In [None]:
from transformers import BartTokenizer, BartModel, BartConfig

# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the bart-large checkpoints (4 folds)."""

    # Saved artefacts.
    config_path = '../input/uspppmbartlarge/bart-ml=174/config.pth'
    tokenizer_path = '../input/uspppmbartlarge/bart-ml=174/tokenizer'
    # model = "../input/microsoft-deberta-large-mnli"   (disabled: CFG has no `model` attribute here)

    # DataLoader settings read by the inference cell.
    num_workers = 4
    batch_size = 32

    # Not read in the inference cell itself; presumably consumed by TestDataset /
    # the model class, which both receive CFG -- verify there.
    max_len = 174
    fc_dropout = 0.2
    target_size = 1
    seed = 42

    # One checkpoint is loaded per entry of trn_fold.
    n_fold = 4
    trn_fold = list(range(n_fold))

In [None]:
# Load the tokenizer from CFG.tokenizer_path and attach it to CFG. This section uses
# the model-specific BartTokenizer class instead of AutoTokenizer.
CFG.tokenizer = BartTokenizer.from_pretrained(CFG.tokenizer_path)

In [None]:
# Test loader for the bart-large model. shuffle=False and drop_last=False keep the
# dataset order and every row.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = BARTModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location: deserialize onto the CPU so loading does not depend on the device
    # the checkpoint was saved from. Without it a CUDA-saved checkpoint cannot be
    # loaded when CUDA is unavailable, even though `device` falls back to the CPU.
    state = torch.load(f'../input/uspppmbartlarge/bart-ml=174/-root-autodl-tmp-input-bart-large_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's model and checkpoint before the next one is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()
# Fold-averaged prediction for this model; used by the ensemble cell.
pred23 = np.mean(predictions, axis=0)

# Pred2 Ensemble

In [None]:
# Blend weights for the second ensemble: wN multiplies predN_mm in the weighted sum.
# Commented-out entries belong to models that are not part of this blend.
# The active weights add up to about 0.938 and are used as given (no renormalisation).
w1 = 0.03605261     # deberta-small-transformer-head
# w2 = 0.01577283   # deberta-base
w3 = 0.18962274     # bert for patents
w4 = 0.11517471     # deberta-large
# w5 = 0.01215114   # deberta-xsmall transformer_head
w6 = 0.13769397     # deberta-large-transformer_head
w7 = 0.01575894     # deberta-base-transformer_head
w8 = 0.08336175     # bert for patents transformer_head
# w9 = 0.01323456   # roberta-large-th
w10 = 0.02831863    # roberta-large
w11 = 0.05526383    # mnli-deberta-large
w12 = 0.00992199    # roberta-base
w13 = 0.02017179    # mnli-th
# w14 = -0.03159355 # mse-deberta-small
# w15 = 0.02173322  # mse-deberta-base
w16 = 0.03500124    # mse-deberta-large
# w17 = 0.00915298  # mse-bert
w18 = 0.0057002     # patentSBERTa
w19 = 0.06964565    # electra
w20 = 0.03321846    # ernie
w21 = 0.07020563    # funnel
w22 = 0.02133232    # luke
w23 = 0.01152879    # bart large
# NOTE(review): the bare string below is this cell's displayed value; it is presumably
# the validation score obtained with these weights -- confirm.
'''
0.8838650207947062
'''

In [None]:
from sklearn.preprocessing import MinMaxScaler

MMscaler = MinMaxScaler()


def _minmax_1d(pred):
    """Min-max scale one model's prediction vector; MMscaler is re-fitted on every call."""
    return MMscaler.fit_transform(pred.reshape(-1, 1)).reshape(-1)


# Each model's predictions are rescaled independently. Models 2, 5, 9, 14, 15 and 17
# are not part of this blend, so no *_mm vector is built for them.
pred1_mm = _minmax_1d(pred1)
pred3_mm = _minmax_1d(pred3)
pred4_mm = _minmax_1d(pred4)
pred6_mm = _minmax_1d(pred6)
pred7_mm = _minmax_1d(pred7)
pred8_mm = _minmax_1d(pred8)
pred10_mm = _minmax_1d(pred10)
pred11_mm = _minmax_1d(pred11)
pred12_mm = _minmax_1d(pred12)
pred13_mm = _minmax_1d(pred13)
pred16_mm = _minmax_1d(pred16)
pred18_mm = _minmax_1d(pred18)
pred19_mm = _minmax_1d(pred19)
pred20_mm = _minmax_1d(pred20)
pred21_mm = _minmax_1d(pred21)
pred22_mm = _minmax_1d(pred22)
pred23_mm = _minmax_1d(pred23)

# (scaled prediction, weight) pairs, listed in the order they are summed.
_weighted_members = [
    (pred1_mm, w1), (pred3_mm, w3), (pred4_mm, w4), (pred6_mm, w6),
    (pred7_mm, w7), (pred8_mm, w8), (pred10_mm, w10), (pred11_mm, w11),
    (pred12_mm, w12), (pred13_mm, w13), (pred16_mm, w16), (pred18_mm, w18),
    (pred19_mm, w19), (pred20_mm, w20), (pred21_mm, w21), (pred22_mm, w22),
    (pred23_mm, w23),
]

# Weighted sum, accumulated left to right starting from the first product.
final_predictions_2 = _weighted_members[0][0] * _weighted_members[0][1]
for _scaled, _weight in _weighted_members[1:]:
    final_predictions_2 = final_predictions_2 + _scaled * _weight

In [None]:
# Final blend: a1 weights the first ensemble, a2 the second.
a1, a2 = 0.10, 0.90

final_predictions = a1 * final_predictions_1 + a2 * final_predictions_2

# Submission

In [None]:
# Read the competition's sample submission; it serves as the template whose `score`
# column is overwritten with the blended predictions.
PATH = '../input/us-patent-phrase-to-phrase-matching'
sub = pd.read_csv(os.path.join(PATH, 'sample_submission.csv'))

In [None]:
# Put the blended predictions into the `score` column and write the file without the
# DataFrame index.
# NOTE(review): this relies on final_predictions being in the same row order as
# sample_submission.csv -- confirm against how `test` is loaded.
sub['score'] = final_predictions
sub.to_csv('submission.csv', index=False)

In [None]:
# Show the first rows of the submission as a quick visual check.
sub.head()