# original notebook

https://www.kaggle.com/code/ksork6s4/uspppm-bert-for-patents-baseline-inference

If you find this useful, please consider upvoting the original author's notebook as well.

Thanks to https://www.kaggle.com/ksork6s4 for sharing.



# BERT for Patents Baseline

- [kfold strategy](https://www.kaggle.com/code/abhishek/phrase-matching-folds)
- Utilize [Cooperative Patent Classification Codes Meaning](https://www.kaggle.com/datasets/xhlulu/cpc-codes)
- Reference: [phantivia's notebook](https://www.kaggle.com/code/phantivia/uspppm-huggingface-train-inference-baseline)
- [BERT for Patents](https://www.kaggle.com/datasets/ksork6s4/bert-for-patents) from [huggingface page](https://huggingface.co/anferico/bert-for-patents)

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
import shutil

from torch.utils.data import DataLoader, Dataset
import datasets, transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Set before any Trainer is created: transformers checks WANDB_DISABLED and,
# when it is "true", turns off its Weights & Biases logging integration.
os.environ["WANDB_DISABLED"] = "true"

## Config

In [None]:
class CFG:
    """Notebook-wide settings, read directly as class attributes.

    Only ``input_path`` and ``model_path`` are referenced by the visible
    inference code; the remaining values look like training hyperparameters
    carried over from the training notebook.
    """

    # --- Locations (relative Kaggle input paths) ---
    # Directory that holds the competition's test.csv.
    input_path = "../input/us-patent-phrase-to-phrase-matching/"
    # Directory the tokenizer and the fine-tuned model are loaded from.
    model_path = "../input/patentsberta-finetune-5fold/uspppm_4"

    # --- Cross-validation / schedule ---
    num_fold = 5
    epochs = 5
    batch_size = 64

    # --- Optimizer ---
    learning_rate = 2e-5
    weight_decay = 0.01

## Preproc

In [None]:
# Load the test pairs and the CPC code -> title lookup table.
test_df = pd.read_csv(f"{CFG.input_path}test.csv")
titles = pd.read_csv('../input/cpc-codes/titles.csv')
# Attach the title of each row's CPC context code.
# A left join keeps every test row, in its original order, even when a context
# code has no entry in titles.csv. The default inner join would silently drop
# such rows, and their ids would then be missing from the submission.
test_df = test_df.merge(titles, left_on='context', right_on='code', how='left')

In [None]:
# Build the first text segment for the tokenizer: the context title followed by
# the anchor phrase, separated by a single space. The target phrase is passed
# to the tokenizer separately as the second segment.
context_title = test_df['title']
anchor_phrase = test_df['anchor']
test_df['input'] = context_title + ' ' + anchor_phrase

## Tokenizer

In [None]:
# Load the tokenizer from CFG.model_path, the same directory the model is
# loaded from further down. InferDataset reads this module-level name.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

## Dataset

In [None]:
class InferDataset(Dataset):
    """Dataset that tokenizes one (input, target) text pair per row on access.

    Tokenization uses the module-level ``tokenizer``, so that name must be
    defined before items are requested.
    """

    def __init__(self, df):
        """Store the ``input`` and ``target`` columns of *df* as string arrays."""
        self.inputs, self.targets = (
            df[column].values.astype(str) for column in ('input', 'target')
        )

    def __len__(self):
        """Return the number of rows held by the dataset."""
        return self.inputs.shape[0]

    def __getitem__(self, item):
        """Return the tokenizer output for row *item* plus a placeholder label."""
        encoded = dict(tokenizer(self.inputs[item], self.targets[item]))
        # No ground truth exists at inference time; -1 is a dummy label.
        encoded['label'] = -1
        return encoded

## Inference

In [None]:
def compute_metrics(eval_pred):
    """Compute the Pearson correlation between predictions and labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` must be
            reshapeable to one value per row (for example shape ``(n, 1)``).

    Returns:
        A dict with the single key ``'pearson'``.
    """
    raw_scores, gold = eval_pred
    # Collapse predictions to a 1-D vector with one entry per example.
    flat_scores = raw_scores.reshape(len(raw_scores))
    correlation = np.corrcoef(flat_scores, gold)
    # The off-diagonal entry of the 2x2 matrix is the scores-vs-labels coefficient.
    return {'pearson': correlation[0, 1]}

In [None]:
# Wrap the prepared test frame and load the fine-tuned model with a
# single-output head (num_labels=1).
te_dataset = InferDataset(test_df)
model = AutoModelForSequenceClassification.from_pretrained(
    CFG.model_path,
    num_labels=1,
)
# No TrainingArguments are supplied, so the Trainer falls back to its defaults.
trainer = Trainer(model=model, tokenizer=tokenizer)

# Predict, then flatten the raw model outputs into a 1-D score vector.
outputs = trainer.predict(te_dataset)
predictions = outputs.predictions.reshape(-1)

# Clamp the scores into [0, 1] in place.
np.clip(predictions, 0, 1.0, out=predictions)

# Pair every id with its score and write the submission file.
submission = datasets.Dataset.from_dict(
    {'id': test_df['id'], 'score': predictions}
)
submission.to_csv('submission.csv', index=False)