In [None]:
import os
import gc
import sys
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import get_cosine_schedule_with_warmup,get_linear_schedule_with_warmup,AdamW
from transformers import AutoModel,AutoModelForSequenceClassification,AutoTokenizer

# Gradient scaler from torch.cuda.amp; it is created here but not referenced
# by any other cell visible in this notebook.
scaler = torch.cuda.amp.GradScaler() # Speed-up on the GPU.

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # automatically pick CPU or GPU
device

In [None]:
# Parameter settings shared by the cells below.
config = {
    'lr': 2e-5,  # presumably the fine-tuning learning rate; not read in the visible cells
    'wd':0.01,  # presumably weight decay; not read in the visible cells
    'batch_size':64,  # batch size of the test DataLoader built in inference()
    'valid_step':10,  # presumably a validation interval used in training; not read in the visible cells
    'max_len':128,  # token length every text is padded / truncated to in USPPPMDataset
    'epochs':5,  # presumably the number of training epochs; not read in the visible cells
    'nfolds':5,  # number of per-fold checkpoints loaded and averaged in inference()
    'seed':42,  # value passed to random_seed()
    'model_dir':"../input/roberta-transformers-pytorch/roberta-base",  # path given to the tokenizer and model from_pretrained calls
}

In [None]:
def random_seed(SEED):
    """Seed every random number generator used here and make cuDNN deterministic.

    Args:
        SEED (int): seed applied to ``random``, NumPy and PyTorch (CPU and all
            CUDA devices); it is also exported as ``PYTHONHASHSEED``.
    """
    random.seed(SEED)
    # Stored as a string because environment variables must be strings.
    os.environ['PYTHONHASHSEED'] = str(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner can select different kernels from one run to the next,
    # which undermines the deterministic flag above, so benchmarking stays off.
    torch.backends.cudnn.benchmark = False

# Apply the seed configured in config["seed"].
random_seed(config["seed"])

In [None]:
# Directory containing the per-fold checkpoints that inference() loads.
MODEL_DIR = "../input/uspppm-roberta-finetune/"
test = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")
submit = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/sample_submission.csv")
titles = pd.read_csv("../input/cpc-codes/titles.csv")

# Left join: every test row has to survive even when its context code has no
# entry in titles.csv. The default inner join would silently drop such rows and
# the submission would end up with fewer rows than test.csv.
test = test.merge(titles, how='left', left_on='context', right_on='code')
# A missing title becomes an empty string so the concatenation below never yields NaN.
test['title'] = test['title'].fillna('')
# NOTE(review): '[SEP]' is inserted as plain text; presumably this mirrors the
# input format the checkpoints were trained on — confirm against the training notebook.
test['text'] = test['anchor'] + '[SEP]' + test['target'] + '[SEP]'  + test['title']
test = test[["id","anchor","target","context","text"]]

In [None]:
class USPPPMDataset(Dataset):
    """Torch dataset that tokenizes the ``text`` column of a dataframe up front.

    Args:
        df: dataframe with a ``text`` column, plus a ``score`` column when
            ``include_labels`` is true.
        model_name: name or path handed to ``AutoTokenizer.from_pretrained``.
        include_labels: when true, every item also carries the row's ``score``.
    """

    def __init__(self, df, model_name, include_labels=True):
        # The tokenizer is only needed during construction, so it is not stored.
        tok = AutoTokenizer.from_pretrained(model_name)

        self.df = df
        self.include_labels = include_labels
        self.text = df["text"].tolist()

        # Encode all rows once; each one is padded or truncated to
        # config["max_len"] tokens.
        self.encoded = tok.batch_encode_plus(
            self.text,
            padding='max_length',
            max_length=config["max_len"],
            truncation=True,
            return_attention_mask=True,
        )

        if include_labels:
            self.labels = df["score"].values

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for row ``idx``, followed by a
        float label when labels are enabled."""
        ids = torch.tensor(self.encoded['input_ids'][idx])
        mask = torch.tensor(self.encoded['attention_mask'][idx])

        if not self.include_labels:
            return ids, mask

        return ids, mask, torch.tensor(self.labels[idx]).float()

In [None]:
class USPPPMModel(nn.Module):
    """Sequence-classification transformer configured with a single output label.

    Args:
        model_name: name or path handed to
            ``AutoModelForSequenceClassification.from_pretrained``.
    """

    def __init__(self,model_name):
        super().__init__()

        # num_labels=1 requests a head with exactly one output per sample.
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=1)

    def forward(self, input_ids, attention_mask):
        """Return the raw (pre-sigmoid) output of every sample as a 1-D tensor.

        Args:
            input_ids: token-id tensor forwarded to the wrapped model.
            attention_mask: attention-mask tensor forwarded to the wrapped model.

        Returns:
            The first element of the wrapped model's output with its trailing
            axis removed, i.e. one value per sample. No sigmoid is applied here.
        """
        logits = self.model(input_ids=input_ids, attention_mask=attention_mask)[0]
        # Drop only the trailing label axis. A bare squeeze() would also remove
        # the batch axis of a one-sample batch and return a 0-d tensor, which
        # cannot be concatenated with the results of the other batches.
        return logits.squeeze(-1)

In [None]:
def inference():
    """Predict test scores with every fold checkpoint and average them.

    Reads the module-level ``test`` dataframe, ``config``, ``MODEL_DIR`` and
    ``device``.

    Returns:
        numpy.ndarray: one score per row of ``test`` — the mean, over the
        ``config["nfolds"]`` models, of the sigmoid of each model's output.
    """
    test_dataset = USPPPMDataset(test, config["model_dir"], include_labels=False)
    test_loader = DataLoader(
        test_dataset, batch_size=config["batch_size"], shuffle=False, num_workers=4, pin_memory=True
    )

    predictions = []
    for fold in range(config["nfolds"]):
        print(f"========== model: RoBerta fold: {fold} inference ==========")
        model = USPPPMModel(config["model_dir"])
        model.to(device)
        # map_location lets a checkpoint that was saved from a GPU be loaded on
        # a CPU-only machine as well.
        check_point = torch.load(MODEL_DIR + f"model{fold}/model{fold}.pth", map_location=device)
        model.load_state_dict(check_point["model"])
        model.eval()

        preds = []
        with torch.no_grad():
            for input_ids, attention_mask in tqdm(test_loader, total=len(test_loader)):
                y_preds = model(input_ids.to(device), attention_mask.to(device))
                # reshape(-1) keeps the result 1-D even when the last batch holds
                # a single sample; np.concatenate rejects 0-d arrays.
                preds.append(y_preds.sigmoid().reshape(-1).to("cpu").numpy())
        predictions.append(np.concatenate(preds))

        # Release this fold's weights before the next model is built so two
        # models are never held in memory at the same time.
        del model, check_point
        gc.collect()
        torch.cuda.empty_cache()

    # Average the per-fold predictions element-wise.
    return np.mean(predictions, axis=0)

In [None]:
# Inference
# Runs every fold model over the test set; the result is the fold-averaged score array.
predictions = inference()

In [None]:
# Display the averaged predictions as the cell output.
predictions

In [None]:
# submission
# Attach the averaged scores to the test rows they were predicted for.
test["score"] = predictions

# Re-read the raw test file, join the scores onto it by id, keep only the two
# submission columns and write them out.
submit = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")
submit = submit.merge(test, on="id")[["id", "score"]]
submit.to_csv("./" + "submission.csv", index=False)

In [None]:
# Display the final submission frame as the cell output.
submit