In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filepath in (os.path.join(dirname, name) for name in filenames):
        print(filepath)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q sentence_transformers --no-index --find-links /kaggle/input/all-distilroberta-v1-train/site_packages

In [None]:
import numpy as np 
import pandas as pd

# Load the test pairs and build the text that each anchor is compared against:
# the target phrase followed by its context code, separated by a single space.
test = (
    pd.read_csv('../input/us-patent-phrase-to-phrase-matching/test.csv')
    # Vectorized string concatenation instead of a per-row Python-level ' '.join.
    .assign(target_context=lambda df: df['target'].str.cat(df['context'], sep=' '))
    # Only `anchor` and `target_context` are used by the cells below.
    .drop(columns=['id', 'target', 'context'])
)
test.head()

In [None]:
import sys 
# Directory of the all-distilroberta-v1 checkpoint inside the attached dataset.
# The same path is handed to AutoTokenizer/AutoModel.from_pretrained and to SentenceTransformer below.
model_path = '../input/all-distilroberta-v1-train/all-distilroberta-v1'

In [None]:
# Preview only the first few anchors; dumping the whole column floods the cell output.
test['anchor'].head(10).tolist()

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over axis 1, ignoring masked-out positions.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings (assumed to be laid out as batch x tokens x hidden).
        attention_mask: Mask with one entry per token; positions with 0 do not
            contribute to the average.

    Returns:
        One pooled vector per row of the batch.
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the embedding dimension so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * mask).sum(dim=1)
    # Clamp the token count so a fully masked row divides by 1e-9 instead of 0.
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_count


# Sentences we want sentence embeddings for
sentences = test.anchor.tolist()

# Load tokenizer and model from the local checkpoint directory (no Hub download involved)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)

# Encode in fixed-size batches so peak memory does not grow with the size of the test set
batch_size = 32
embedding_batches = []
for start in range(0, len(sentences), batch_size):
    # Tokenize this batch, padding only up to its own longest sentence
    encoded_input = tokenizer(
        sentences[start:start + batch_size],
        padding=True,
        truncation=True,
        return_tensors='pt',
    )

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling (mask-aware, so the per-batch padding does not affect the result)
    embedding_batches.append(mean_pooling(model_output, encoded_input['attention_mask']))

# Stitch the batches back together and normalize each embedding to unit L2 length
sentence_embeddings = F.normalize(torch.cat(embedding_batches), p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)


In [None]:
from sentence_transformers import SentenceTransformer, util
# Embed both sides of every pair with the SentenceTransformer checkpoint.
# Note: this rebinds `model`, replacing the AutoModel instance loaded above.
model = SentenceTransformer(model_path)
anchor_vec = model.encode(test.anchor.tolist())
target_vec = model.encode(test.target_context.tolist())

# Cosine similarity of row i of `anchor_vec` with row i of `target_vec`, computed in one
# vectorized call instead of building a 1x1 similarity matrix per pair in a Python loop.
pair_scores = torch.nn.functional.cosine_similarity(
    torch.as_tensor(anchor_vec), torch.as_tensor(target_vec), dim=1
)

# Keep `cos_sim` a list of 0-dim tensors so code that indexes it and calls .item() keeps working.
cos_sim = list(pair_scores)

In [None]:
# Unwrap every similarity into a plain Python number, preserving order.
cos_sim_score = [similarity.item() for similarity in cos_sim]

In [None]:
# Preview only the first few scores; the full list is written to submission.csv below.
cos_sim_score[:10]

In [None]:
# Read the sample submission file; its `id` column is reused when the submission is assembled.
sample_path = '../input/us-patent-phrase-to-phrase-matching/sample_submission.csv'
sample = pd.read_csv(sample_path)
sample.head()

In [None]:
# Pair each id from the sample submission with the score at the same position,
# then write the two-column file to the working directory.
submission = sample[['id']].assign(score=cos_sim_score)
submission.to_csv('submission.csv', index=False)
submission.head()