This problem is similar to natural language inference, but on patent phrases instead of sentences. It is a hard problem for two reasons: the dataset is relatively small, and determining the semantic similarity between two phrases requires encoding information about the domain context. Fortunately, there is a pretrained BERT model trained on 100M+ patents, released by Google: [https://cloud.google.com/blog/products/ai-machine-learning/how-ai-improves-patent-analysis](https://cloud.google.com/blog/products/ai-machine-learning/how-ai-improves-patent-analysis). With this model, we can cast our problem as a classification problem in which the model predicts one of 5 classes corresponding to the 5 levels of matching (scores 0, 0.25, 0.5, 0.75 and 1), and the final score is the probability-weighted average of those 5 level scores. The input to the model is simply the concatenation of [Cooperative Patent Classification, anchor, target].

In [None]:
!pip install datasets

In [None]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
import datasets
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# One checkpoint name shared by the classifier and its tokenizer so the two
# cannot drift apart.
checkpoint = 'anferico/bert-for-patents'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Sequence-classification model configured with 5 output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=5,
    local_files_only=False,
)

In [None]:
!ls /root/.cache/huggingface/transformers/

In [None]:
def encode_row(row, test=False):
    """Tokenize one example as a sentence pair and optionally attach a label.

    The first segment is ``row['context'][0]`` joined to ``row['anchor']`` by
    a single space; the second segment is ``row['target']``.

    Args:
        row: Mapping with 'context', 'anchor' and 'target' entries, plus
            'score' when ``test`` is false.
        test: When true, no 'label' entry is added and 'score' is not read.

    Returns:
        The tokenizer output, with a 'label' entry added unless ``test``.
    """
    # NOTE(review): only the first element of the context is used
    # (presumably the leading character of the CPC code) - confirm intent.
    first_segment = ' '.join((row['context'][0], row['anchor']))
    encoded = tokenizer(first_segment, row['target'])
    if test:
        return encoded

    # Five evenly spaced edges over [0, 1]; digitize returns a 1-based bin
    # position, so subtract 1 to get a 0-based class index.
    score_edges = np.linspace(0, 1, 5)
    encoded['label'] = np.digitize(row['score'], bins=score_edges) - 1
    return encoded

# Load the training CSV, tokenize every row with encode_row, drop the raw
# columns, and hold out 10% of the rows as the "test" split.
train_csv = '/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv'
raw_train_columns = ['id', 'anchor', 'target', 'context', 'score']

data = datasets.Dataset.from_csv(train_csv)
dataset = data.map(encode_row, remove_columns=raw_train_columns)
dataset = dataset.train_test_split(test_size=0.1)

In [None]:
def compute_metrics(eval_pred):
    """Return the Pearson correlation between expected and gold scores.

    The logits are turned into class probabilities with a softmax, and each
    example's predicted score is the probability-weighted average of the
    five level scores ``np.linspace(0, 1, 5)``. Integer labels are mapped to
    the same level scores before correlating.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is a 2-D
            array-like of logits with one column per class (5 columns);
            ``labels`` holds the integer class index of each example.

    Returns:
        A dict with the single key 'pearson'.
    """
    predictions, labels = eval_pred
    levels = np.linspace(0, 1, 5)

    # Softmax is invariant to a per-row shift, so subtract the row maximum
    # before exponentiating. Without it, large logits make np.exp overflow
    # to inf and the normalisation produces NaN.
    logits = np.asarray(predictions, dtype=np.float64)
    logits = logits - np.max(logits, axis=1, keepdims=True)
    prob = np.exp(logits)
    prob = prob / np.sum(prob, axis=1, keepdims=True)

    # Expected score under the predicted class distribution.
    pred = np.sum(prob * levels, axis=1)

    labels = levels[labels]

    return {'pearson': np.corrcoef(pred, labels)[0][1]}


# Training configuration: evaluate and checkpoint once per epoch, then
# reload the checkpoint with the best "pearson" metric when training ends.
train_args = TrainingArguments(
    output_dir="patent_phrase",
    num_train_epochs=3,
    learning_rate=3e-5,
    weight_decay=1e-3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="pearson",
)

# Wire the model, arguments, both dataset splits, the tokenizer and the
# metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configured Trainer. Left as a bare expression so
# the notebook cell still displays the value returned by train().
trainer.train()

In [None]:
# Load the test CSV and tokenize it the same way as the training data, but
# without attaching labels.
test_set = datasets.Dataset.from_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv')


def test_process(row):
    """Tokenize a test row without attaching a label."""
    return encode_row(row, test=True)


encoded_test = test_set.map(test_process, remove_columns=['id', 'anchor', 'target', 'context'])
outputs = trainer.predict(encoded_test)

# Numerically stable softmax: subtracting the per-row maximum leaves the
# probabilities unchanged but stops np.exp from overflowing to inf, which
# would otherwise put NaN scores into the submission.
logits = np.asarray(outputs.predictions, dtype=np.float64)
logits = logits - np.max(logits, axis=1, keepdims=True)
prob = np.exp(logits)
prob = prob / np.sum(prob, axis=1, keepdims=True)

# Final score = probability-weighted average of the five level scores.
pred = np.sum(prob * np.linspace(0, 1, 5), axis=1)

submit = pd.DataFrame({'id': test_set['id'], 'score': pred})
submit.to_csv('submission.csv', index=False)

In [None]:
# Persist the trainer's model to disk.
# NOTE(review): Kaggle notebooks usually keep only files written under
# /kaggle/working - confirm that this output path is intended.
model_output_dir = '/kaggle/output/bert-for-patents'
trainer.save_model(model_output_dir)