This problem is similar to natural language inference, but on patent phrases instead of sentences. It is a hard problem because the dataset is relatively small, while determining the semantic similarity between phrases requires encoding information about the domain context. Fortunately, Google has released a BERT model pretrained on 100M+ patents: [https://cloud.google.com/blog/products/ai-machine-learning/how-ai-improves-patent-analysis](https://cloud.google.com/blog/products/ai-machine-learning/how-ai-improves-patent-analysis). With this model, we can cast the problem as a classification problem in which the model predicts one of 5 classes corresponding to 5 levels of matching, and the final score is the probability-weighted average over the 5 levels. The input to the model is simply the concatenation of [Cooperative Patent Classification, anchor, target].

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os
# Set before any Trainer is constructed; intended to switch off the
# Weights & Biases logging integration (NOTE(review): confirm the installed
# transformers version still honours this variable).
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Single source of truth for the checkpoint location: the classifier weights
# and the tokenizer files are both read from this directory (presumably a
# fine-tuned Trainer checkpoint -- verify against the attached dataset).
CHECKPOINT_DIR = '../input/patent-phrase-matching/patent_phrase/checkpoint-1026'

# Tokenizer that accompanies the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)

# Sequence classifier with a 5-way head, one class per similarity level.
model = AutoModelForSequenceClassification.from_pretrained(
    CHECKPOINT_DIR,
    num_labels=5,
    local_files_only=False,
)

In [None]:
class MyDataset(Dataset):
    """Map-style dataset that exposes an already-built indexable collection.

    The wrapped object is stored as ``self.data`` and is neither copied nor
    transformed; length and item lookups are delegated straight to it.
    """

    def __init__(self, data):
        """Store ``data``, which must support ``len()`` and indexing."""
        self.data = data

    def __len__(self):
        """Return the number of items in the wrapped collection."""
        item_count = len(self.data)
        return item_count

    def __getitem__(self, idx):
        """Return the item the wrapped collection holds at ``idx``."""
        item = self.data[idx]
        return item

In [None]:
def encode_row(row, test=False):
    """Tokenise one data row into a model input, optionally with a class label.

    The first tokenizer segment is the first element of ``row['context']``
    (the first character when the context is a string -- presumably the CPC
    section letter; confirm this matches how the checkpoint was trained),
    a space, and ``row['anchor']``. The second segment is ``row['target']``.

    Args:
        row: Mapping-like record with 'context', 'anchor' and 'target' keys,
            plus 'score' when ``test`` is falsy.
        test: When truthy, no label is attached (the row has no score to use).

    Returns:
        The tokenizer output; when ``test`` is falsy it also carries a
        'label' entry holding the class index derived from ``row['score']``.
    """
    first_segment = row['context'][0] + ' ' + row['anchor']
    encoded = tokenizer(first_segment, row['target'])

    if test:
        return encoded

    # Bin edges 0, 0.25, 0.5, 0.75, 1: digitize yields 1..5 for scores that
    # sit on those edges, so subtracting 1 gives class indices 0..4.
    score_bins = np.linspace(0, 1, 5)
    encoded['label'] = np.digitize(row['score'], bins=score_bins) - 1
    return encoded

# Load the test split; the frame is kept around because its 'id' column is
# needed again when the submission file is written.
test_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")

# Encode every row without a label (test rows are encoded with test=True,
# which skips the score lookup) and wrap the results for the Trainer.
test_rows = (row for _, row in test_df.iterrows())
test_data = [encode_row(row, test=True) for row in test_rows]
testset = MyDataset(test_data)

In [None]:
# Inference-only Trainer: no TrainingArguments or datasets are supplied, so
# the library defaults apply. The tokenizer is handed over as well; per the
# Trainer documentation this lets the default collator pad each batch
# (the encoded rows are not padded beforehand).
trainer = Trainer(model=model, tokenizer=tokenizer)

In [None]:
# Run inference on the encoded test set. `outputs.predictions` is treated
# below as a 2-D array of raw class logits with the classes on axis 1.
outputs = trainer.predict(testset)

# Softmax over the class axis. The row-wise maximum is subtracted before
# exponentiating: this leaves the probabilities mathematically unchanged but
# keeps np.exp from overflowing to inf (which would turn the normalised
# probabilities into NaN) when a logit is large.
logits = outputs.predictions
shifted_logits = logits - np.max(logits, axis=1, keepdims=True)
prob = np.exp(shifted_logits)
prob = prob / np.sum(prob, axis=1, keepdims=True)

# Expected score under the predicted distribution: class k stands for the
# score k / 4, i.e. 0, 0.25, 0.5, 0.75 and 1 (the same grid encode_row uses
# as bin edges when it builds labels).
pred = prob * np.linspace(0, 1, 5)
pred = np.sum(pred, axis=1)

# Write the submission file: one row per test id with its predicted score.
submit = pd.DataFrame({'id': test_df['id'], 'score': pred})
submit.to_csv('submission.csv', index=False)