In [None]:
# Importing required packages
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Loading Data
# Read the three input CSVs in a fixed order and bind them to
# train_df, test_df and submission_df respectively.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/usppp-kfold/usppp_kfold.csv",
        "../input/us-patent-phrase-to-phrase-matching/test.csv",
        "../input/us-patent-phrase-to-phrase-matching/sample_submission.csv",
    )
)

In [None]:
# The tokenizer and the model are both loaded from the same path,
# tokenizer first, then the model.
tokenizer, model = (
    loader.from_pretrained('../input/patentsberta/PatentSBERTa')
    for loader in (AutoTokenizer, AutoModel)
)

In [None]:
def cls_pooling(model_output, attention_mask):
    """Pool a model output down to its position-0 vectors.

    Takes the first element of ``model_output`` and returns its slice at
    index 0 along the second axis (for every entry along the first axis).
    Presumably this is the [CLS] token embedding of each sequence, as the
    function name suggests -- confirm against the model in use.

    Args:
        model_output: Indexable whose element 0 supports ``[:, 0]`` slicing.
        attention_mask: Accepted but not used by this pooling strategy.

    Returns:
        The ``[:, 0]`` slice of ``model_output[0]``.
    """
    token_embeddings = model_output[0]
    return token_embeddings[:, 0]

In [None]:
def get_similarity(sent1, sent2):
    """Return the cosine similarity between the pooled embeddings of two texts.

    Both texts are tokenized together as one padded, truncated batch, passed
    through the module-level ``model`` with gradient tracking disabled,
    reduced to one vector each by ``cls_pooling`` and compared with
    ``cosine_similarity``.

    Args:
        sent1: First text of the pair.
        sent2: Second text of the pair.

    Returns:
        The single entry of the 1x1 similarity matrix for the two vectors.
    """
    batch = tokenizer([sent1, sent2], padding=True, truncation=True, return_tensors='pt')

    # Inference only, so no autograd bookkeeping is needed.
    with torch.no_grad():
        outputs = model(**batch)

    pooled = cls_pooling(outputs, batch['attention_mask'])

    # cosine_similarity is given 2-D inputs, hence one (1, dim) row per text.
    first_vec = pooled[0].reshape(1, -1)
    second_vec = pooled[1].reshape(1, -1)

    similarity_matrix = cosine_similarity(first_vec, second_vec)
    return similarity_matrix[0][0]

In [None]:
# Score every test row with the similarity between its 'anchor' and 'target'
# texts; progress_apply is the progress-bar variant of apply set up by tqdm.pandas().
test_df['score'] = test_df.progress_apply(
    lambda pair: get_similarity(pair['anchor'], pair['target']),
    axis=1,
)

In [None]:
# Remove the text and context columns; every other column of test_df
# (including the computed 'score') is kept.
final_submission = test_df.drop(columns=['anchor', 'target', 'context'])

In [None]:
# Write the submission file without the DataFrame index column.
final_submission.to_csv(path_or_buf='submission.csv', index=False)