train code: https://www.kaggle.com/code/columbia2131/uspppm-roberta-base-colab-baseline-train/notebook

In [None]:
import os
from glob import glob
import torch

class CFG_exp001:
    """Inference settings for experiment 001 (roberta-base, 'baseline_' prefix).

    Every value is a class attribute evaluated once, when the class body runs.
    The main script additionally attaches a ``tokenizer`` attribute to this
    class before running inference.
    """

    # Use the GPU when torch can see one, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Length passed to the tokenizer as ``max_length`` (pad/truncate target).
    max_len = 64
    batch_size = 128
    # Directory of the pretrained backbone (config, weights, tokenizer files).
    MODEL_PATH = '../input/roberta-base'
    # Directory that holds the fine-tuned ``*.pth`` checkpoints.
    WEIGHT_PATH = '../input/usp-exp001-roberta-base-epoch10/model'
    model_prefix = 'baseline_'
    # glob() yields paths in arbitrary, filesystem-dependent order; sorting
    # makes the checkpoint order (and so the float accumulation order of the
    # averaged predictions) reproducible between runs.
    model_weights = sorted(glob(os.path.join(WEIGHT_PATH, '*.pth')))

In [None]:
# ========================================
# Library
# ========================================
import os
import gc
import sys
import joblib
import random
import warnings
import itertools
warnings.filterwarnings('ignore')
from ast import literal_eval
from tqdm.auto import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer

In [None]:
# =====================
# Dataset, Model
# =====================
def processing_features(df):
    """Split the ``context`` code into two integer feature columns.

    A context such as ``'A47'`` yields ``alp_context`` (its first character
    mapped to 0-7 for ``A``-``H``) and ``num_context`` (the remaining
    characters parsed as an int).

    Args:
        df: DataFrame with a string ``context`` column. Modified in place.

    Returns:
        The same ``df`` object, with ``alp_context`` and ``num_context`` set.

    Raises:
        ValueError: If a context starts with a character outside ``A``-``H``
            or the rest of the code is not an integer literal.
    """
    # All ids are real integers (the original table stored 'H' as the string
    # '7' and relied on the trailing astype(int) to coerce it).
    section_to_id = {letter: idx for idx, letter in enumerate('ABCDEFGH')}

    sections = df['context'].map(lambda code: code[0])
    unknown = sorted(set(sections) - set(section_to_id))
    if unknown:
        # Fail with a readable message instead of an opaque NaN -> int cast
        # error coming from astype(int) further down.
        raise ValueError(f"unknown context section(s): {unknown}")

    numbers = df['context'].map(lambda code: int(code[1:]))

    df['alp_context'] = sections.map(section_to_id).astype(int)
    df['num_context'] = numbers
    return df


class TestDataset(Dataset):
    """Inference dataset: tokenized anchor/target pair plus two context ids.

    Indexing returns ``(inputs, alps, nums)``. ``inputs`` is a dict holding
    ``input_ids`` and ``attention_mask`` as long tensors; ``alps`` and
    ``nums`` are long tensors built from the ``alp_context`` and
    ``num_context`` values at that index.
    """

    def __init__(self, cfg, df):
        # cfg has to provide ``tokenizer`` and ``max_len`` (used when an item
        # is tokenized in prepare_input).
        self.cfg = cfg
        # Columns are pulled out as NumPy arrays once, up front.
        self.anchor = df['anchor'].to_numpy()
        self.target = df['target'].to_numpy()
        self.alp_context = df['alp_context'].to_numpy()
        self.num_context = df['num_context'].to_numpy()

    def __len__(self):
        return len(self.anchor)

    def __getitem__(self, index):
        encoded = self.prepare_input(
            self.cfg, self.anchor[index], self.target[index]
        )
        alp_id = self._to_long(self.alp_context[index])
        num_id = self._to_long(self.num_context[index])
        return encoded, alp_id, num_id

    @staticmethod
    def _to_long(value):
        # Single place that fixes the dtype of every tensor this class emits.
        return torch.tensor(value, dtype=torch.long)

    @staticmethod
    def prepare_input(cfg, anchor_text, target_text):
        """Tokenize one (anchor, target) pair to a fixed ``cfg.max_len``.

        Only ``input_ids`` and ``attention_mask`` are kept; anything else the
        tokenizer returns is dropped.
        """
        encoded = cfg.tokenizer(
            anchor_text,
            target_text,
            add_special_tokens=True,
            max_length=cfg.max_len,
            padding="max_length",
            truncation=True,
            return_offsets_mapping=False,
        )
        return {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask')
        }

def collatte(inputs, labels=None):
    """Cut a padded batch down to the largest attention-mask row sum.

    Args:
        inputs: Mapping with 2-D ``input_ids`` and ``attention_mask``
            tensors; any other keys are not carried over.
        labels: Optional 2-D tensor sliced along its second axis the same
            way as the inputs.

    Returns:
        ``(inputs, mask_len)`` when ``labels`` is None, otherwise
        ``(inputs, labels, mask_len)``, where ``mask_len`` is the number of
        leading columns that were kept.
    """
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    trimmed = {
        key: inputs[key][:, :mask_len]
        for key in ("input_ids", "attention_mask")
    }

    if labels is None:
        return trimmed, mask_len
    return trimmed, labels[:, :mask_len], mask_len


class Exp001Model(nn.Module):
    """Pretrained encoder + two context embeddings feeding an MLP head.

    The first-position vector of the encoder's last hidden state is
    concatenated with an embedding of ``alps`` and an embedding of ``nums``,
    then passed through a three-layer SELU MLP that emits one value per row.

    Args:
        cfg: Config object; ``cfg.MODEL_PATH`` locates the pretrained model.
        num_alp: Number of rows in the ``alps`` embedding table.
        emb_alp: Width of the ``alps`` embedding.
        num_num: Number of rows in the ``nums`` embedding table.
        emb_num: Width of the ``nums`` embedding.
    """

    def __init__(self, cfg, num_alp=9, emb_alp=8, num_num=100, emb_num=8):
        super().__init__()
        self.cfg = cfg

        # Encoder and its config are both loaded from cfg.MODEL_PATH.
        self.config = AutoConfig.from_pretrained(
            cfg.MODEL_PATH, output_hidden_states=True
        )
        self.backbone = AutoModel.from_pretrained(
            cfg.MODEL_PATH, config=self.config
        )

        # Lookup tables for the two integer context features.
        self.embedding_alp = nn.Embedding(
            num_embeddings=num_alp, embedding_dim=emb_alp
        )
        self.embedding_num = nn.Embedding(
            num_embeddings=num_num, embedding_dim=emb_num
        )

        # Regression head over [encoder vector | alp embedding | num embedding].
        head_in = self.config.hidden_size + emb_alp + emb_num
        head_width = 1024
        self.linear1 = nn.Sequential(
            nn.Linear(head_in, head_width),
            nn.SELU(),
            nn.Linear(head_width, head_width),
            nn.SELU(),
            nn.Linear(head_width, 1),
        )

    def forward(self, inputs, alps, nums):
        """Return a flat tensor with one prediction per batch row."""
        hidden = self.backbone(**inputs)["last_hidden_state"]
        first_token = hidden[:, 0, :]
        features = torch.cat(
            [first_token, self.embedding_alp(alps), self.embedding_num(nums)],
            dim=1,
        )
        return self.linear1(features).flatten()

In [None]:
def inferring(cfg, test, custom_model):
    """Average the predictions of every checkpoint in ``cfg.model_weights``.

    Args:
        cfg: Config object providing ``model_weights`` (checkpoint paths),
            ``batch_size`` and ``device``, plus whatever ``TestDataset`` and
            ``custom_model`` read from it.
        test: DataFrame accepted by ``TestDataset``.
        custom_model: Callable that takes ``cfg`` and returns an
            ``nn.Module`` whose forward accepts ``(inputs, alps, nums)`` and
            returns one value per row.

    Returns:
        float32 NumPy array of length ``len(test)`` with the mean prediction
        over all checkpoints.

    Raises:
        ValueError: If ``cfg.model_weights`` is empty. Without this check the
            function would return an all-zero array, which would be written
            out as if it were a real prediction.
    """
    if not cfg.model_weights:
        raise ValueError(
            'cfg.model_weights is empty: no checkpoint found to run inference with'
        )
    n_models = len(cfg.model_weights)

    # The data does not depend on the checkpoint, so build the loader once
    # instead of once per checkpoint.
    test_loader = DataLoader(
        dataset=TestDataset(cfg, test),
        batch_size=cfg.batch_size,
        shuffle=False,  # keep row order aligned with `test`
        pin_memory=True,
    )

    sub_pred = np.zeros(len(test), dtype=np.float32)

    for model_weight in cfg.model_weights:
        model = custom_model(cfg)
        # map_location='cpu' lets checkpoints saved from a GPU load on a
        # CPU-only host as well; the model is moved to cfg.device afterwards.
        # NOTE(review): torch.load unpickles the file - only load checkpoints
        # you trust.
        state_dict = torch.load(model_weight, map_location='cpu')
        model.load_state_dict(state_dict)
        model = model.to(cfg.device)
        model.eval()

        tmp_pred = []
        with torch.no_grad():
            for inputs, alps, nums in tqdm(test_loader, total=len(test_loader)):
                inputs = {k: v.to(cfg.device) for k, v in inputs.items()}
                alps = alps.to(cfg.device)
                nums = nums.to(cfg.device)
                with autocast():
                    output = model(inputs, alps, nums)
                # Under autocast the output may be half precision; widen it
                # before it takes part in the ensemble average.
                tmp_pred.append(output.float().cpu().numpy())

        sub_pred += np.concatenate(tmp_pred) / n_models

    return sub_pred

In [None]:
# =====================
# Main
# =====================
# Competition files. `train` is loaded and featurized here but is not consumed
# by the inference code in this section.
train = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/train.csv')
test = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/test.csv')
sub = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/sample_submission.csv')

# Adds the integer `alp_context` / `num_context` columns the dataset reads.
train = processing_features(train)
test = processing_features(test)

# (config class, model class) pairs that take part in the ensemble.
experiments = [
    (CFG_exp001, Exp001Model),
]

exp_ensemble = []
for cfg_model, custom_model in experiments:
    # TestDataset takes the tokenizer from the config, so attach it first.
    cfg_model.tokenizer = AutoTokenizer.from_pretrained(cfg_model.MODEL_PATH)
    exp_pred = inferring(cfg_model, test, custom_model)
    exp_ensemble.append(exp_pred)

# Mean over experiments. With the single experiment above this equals that
# experiment's predictions; taking element [0] instead would silently drop
# every additional experiment appended to the list.
exp_ensemble = np.mean(exp_ensemble, axis=0)
sub['score'] = exp_ensemble
sub.to_csv('submission.csv', index=False)