# BERT for Patents Baseline

- [kfold strategy](https://www.kaggle.com/code/abhishek/phrase-matching-folds)
- Utilize [Cooperative Patent Classification Codes Meaning](https://www.kaggle.com/datasets/xhlulu/cpc-codes)
- Reference: [phantivia's notebook](https://www.kaggle.com/code/phantivia/uspppm-huggingface-train-inference-baseline)
- [BERT for Patents](https://www.kaggle.com/datasets/ksork6s4/bert-for-patents) from [huggingface page](https://huggingface.co/anferico/bert-for-patents)


|version|model|val strategy|CV|LB|
|---|---|---|---|---|
|02| BERT for Patents| hold out| 0.85 | 0.815 |
|05| BERT for Patents| 5folds| 0.853 | 0.825 |
|07| deberta-v3-large| 5folds| 0.867 | 0.833 |
|09| deberta-v3-large| 5folds| 0.862 | 0.836 |
|12| deberta-v3-large| 5folds| 0.862 | 0.837 |

### Please refer to [Training Notebook](https://www.kaggle.com/code/ksork6s4/uspppm-bert-for-patents-baseline-train) as well.


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
import shutil

from torch.utils.data import DataLoader, Dataset
import datasets, transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

os.environ["WANDB_DISABLED"] = "true"

# Config

In [None]:
class CFG:
    """Notebook-wide settings, read as plain class attributes (never instantiated)."""

    # --- Locations on disk ---
    # Competition data directory; ``test.csv`` is read from here.
    input_path = '../input/us-patent-phrase-to-phrase-matching/'
    # Directory holding one saved checkpoint per fold, named ``uspppm_<fold>``.
    model_path = '../input/uspppm-debertv3large-5folds-v2/'

    # --- Ensemble / batching ---
    # Number of fold checkpoints to load and combine.
    num_fold = 5
    batch_size = 64

    # --- Optimisation hyperparameters ---
    # NOTE(review): these look like leftovers from the training notebook;
    # confirm whether anything at inference time reads them.
    epochs = 5
    learning_rate = 2e-5
    weight_decay = 0.01

# Preproc

In [None]:
# Load the test pairs and attach the CPC title text for each row's context code.
test_df = pd.read_csv(f"{CFG.input_path}test.csv")
titles = pd.read_csv('../input/cpc-codes/titles.csv')
# Use a left join so every test row survives the merge: the default inner join
# silently drops rows whose context code is absent from titles.csv, which would
# leave the submission with fewer rows than test.csv. Unmatched rows simply get
# a missing title instead.
test_df = test_df.merge(titles, how='left', left_on='context', right_on='code')

In [None]:
# First text segment fed to the model: "<CPC title>[SEP]<anchor phrase>".
test_df['input'] = test_df['title'].str.cat(test_df['anchor'], sep='[SEP]')

# Tokenizer

In [None]:
# Load the tokenizer stored with the fold-0 checkpoint; this single tokenizer
# object is shared by the dataset and by every fold's Trainer below.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path + 'uspppm_0')

# Dataset

In [None]:
class InferDataset(Dataset):
    """Map-style dataset that tokenizes (input, target) text pairs on access.

    Tokenization is done lazily in ``__getitem__`` using the module-level
    ``tokenizer``, which must be defined before any item is read.
    """

    def __init__(self, df):
        """Store the ``input`` and ``target`` columns of *df* as string arrays."""
        first_segments = df['input'].values
        second_segments = df['target'].values
        self.inputs = first_segments.astype(str)
        self.targets = second_segments.astype(str)

    def __len__(self):
        """Return the number of text pairs."""
        return len(self.inputs)

    def __getitem__(self, item):
        """Tokenize the pair at position *item* and return it as a plain dict."""
        encoded = tokenizer(self.inputs[item], self.targets[item])
        # Copy the tokenizer output into a plain dict.
        return dict(encoded)

# Inference

In [None]:
def compute_metrics(eval_pred):
    """Return the Pearson correlation between predictions and labels.

    *eval_pred* is unpacked as ``(predictions, labels)``. The predictions are
    flattened to one value per row before the correlation is computed, so they
    must contain exactly one value per example.
    """
    raw_preds, labels = eval_pred
    flat_preds = raw_preds.reshape(len(raw_preds))
    # Off-diagonal entry of the 2x2 correlation matrix is the Pearson r.
    corr_matrix = np.corrcoef(flat_preds, labels)
    return {'pearson': corr_matrix[0][1]}

In [None]:
# Score the test set with every fold checkpoint, combine the folds with an
# element-wise median, and write the result to submission.csv.
predictions = []

# The test set is the same for every fold, so build the dataset once rather
# than once per loop iteration.
te_dataset = InferDataset(test_df)

# Make the configured batch size take effect: a Trainer created without
# arguments uses its own default evaluation batch size and ignores
# CFG.batch_size. 'tmp_trainer' is the output directory Trainer picks by
# itself when no arguments are given, so that side effect is unchanged.
infer_args = TrainingArguments(
    output_dir='tmp_trainer',
    per_device_eval_batch_size=CFG.batch_size,
)

for fold in range(CFG.num_fold):
    model = AutoModelForSequenceClassification.from_pretrained(f'{CFG.model_path}uspppm_{fold}', num_labels=1)
    trainer = Trainer(
        model=model,
        args=infer_args,
        tokenizer=tokenizer,
    )

    outputs = trainer.predict(te_dataset)
    # The model has a single output per row (num_labels=1); flatten to 1-D.
    prediction = outputs.predictions.reshape(-1)
    predictions.append(prediction)

# Median across folds for each test row.
predictions = np.median(predictions, axis=0)
submission = datasets.Dataset.from_dict({
    'id': test_df['id'],
    'score': predictions,
})

submission.to_csv('submission.csv', index=False)