In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
import shutil
import json
import random
import time
import datetime

from torch.utils.data import DataLoader, Dataset
import datasets, transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

os.environ["WANDB_DISABLED"] = "true"

In [2]:
class CFG:
    input_path = '../input/us-patent-phrase-to-phrase-matching/'
    model_path = '../input/uspppm-debertv3large-5folds-v2/'
    
    learning_rate = 2e-5
    weight_decay = 0.01
    num_fold = 5
    epochs = 5
    batch_size = 64

In [4]:
test_df = pd.read_csv(f"{CFG.input_path}test.csv")
titles = pd.read_csv('../input/cpc-codes/titles.csv')
test_df = test_df.merge(titles, left_on='context', right_on='code')

In [5]:
test_df['input'] = test_df['title']+'[SEP]'+test_df['anchor']

In [6]:
tokenizer = AutoTokenizer.from_pretrained(f'{CFG.model_path}uspppm_0')

In [7]:
class InferDataset(Dataset):
    def __init__(self, df):
        self.inputs = df['input'].values.astype(str)
        self.targets = df['target'].values.astype(str)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        inputs = self.inputs[item]
        targets = self.targets[item]
        
        return {
        **tokenizer( inputs, targets )
    }

In [8]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.reshape(len(predictions))
    return {
        'pearson': np.corrcoef(predictions, labels)[0][1]
    }

In [9]:
predictions = []

for fold in range(CFG.num_fold):
    te_dataset = InferDataset(test_df)
    model = AutoModelForSequenceClassification.from_pretrained(f'{CFG.model_path}uspppm_{fold}', num_labels=1)
    trainer = Trainer(
            model,
            tokenizer=tokenizer
        )

    outputs = trainer.predict(te_dataset)
    prediction = outputs.predictions.reshape(-1)
    predictions.append(prediction)
    
predictions = np.mean(predictions, axis=0)
submission = datasets.Dataset.from_dict({
    'id': test_df['id'],
    'score': predictions,
})

submission.to_csv('submission.csv', index=False)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running Prediction *****
  Num examples = 36
  Batch size = 8
100%|██████████| 5/5 [00:00<00:00, 15.04it/s]loading configuration file ../input/uspppm-debertv3large-5folds-v2/uspppm_1\config.json
Model config DebertaV2Config {
  "_name_or_path": "../input/uspppm-debertv3large-5folds-v2/uspppm_1",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 

1054

100%|██████████| 5/5 [00:17<00:00, 17.10it/s]