In [None]:
import os
from pathlib import Path
import numpy as np 
import pandas as pd 
from typing import List, Dict, Union
from typing import Any, TypeVar

Pathable = Union[str, Path]

In [None]:
# Exploratory: show the entries of the parent directory.
os.listdir('..')

In [None]:
# Exploratory: show what is available under ../input/.
os.listdir('../input/')

In [None]:
# Root folder of the dataset; every CSV below it (searched recursively) is collected.
HOME_PATH: str = '../input/us-patent-phrase-to-phrase-matching'
_files = [csv_path for csv_path in Path(HOME_PATH).rglob("*.csv")]

In [None]:
# Display the CSV paths that were discovered.
_files

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

### Brief

- [BERT base uncased](https://huggingface.co/bert-base-uncased)
- [Cosine similarity intuition](https://stats.stackexchange.com/questions/256778/cosine-similarity-intuition/476695#476695)
- [Cosine similarity definition](https://en.wikipedia.org/wiki/Cosine_similarity)

In [None]:
# Tokenizer and encoder are loaded from the same checkpoint. The model is asked
# to return all hidden states because the embedding code reads `hidden_states`.
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME, output_hidden_states=True)

In [None]:
# Prefer the GPU, but fall back to the CPU instead of raising in this cell when
# CUDA is unavailable. Any tensors fed to the model must be placed on the same
# device as the model.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [None]:

def get_embeddings(text, token_length):
    """Embed ``text`` as the mean of the model's last hidden layer.

    The text is tokenized with the module-level ``tokenizer``, padded or
    truncated to ``token_length`` tokens, and passed through the module-level
    ``model``. Token vectors of the last hidden layer are then averaged.

    Padding positions are excluded from the average by weighting with the
    attention mask. Because every input is padded to ``token_length``, an
    unmasked mean would be dominated by padding vectors for short texts.

    Args:
        text: The text to embed (a single string in this notebook).
        token_length: Number of tokens the input is padded/truncated to.

    Returns:
        A NumPy array with one row per input text and one column per hidden
        dimension of the model.
    """
    encoded = tokenizer(
        text,
        max_length=token_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    # Follow whatever device the model lives on instead of assuming CUDA.
    model_device = next(model.parameters()).device
    input_ids = encoded['input_ids'].to(model_device)
    attention_mask = encoded['attention_mask'].to(model_device)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        last_hidden = model(input_ids, attention_mask=attention_mask).hidden_states[-1]

    # Masked mean: sum the real-token vectors and divide by the real-token count.
    weights = attention_mask.unsqueeze(-1).to(last_hidden.dtype)
    summed = (last_hidden * weights).sum(dim=1)
    token_counts = weights.sum(dim=1).clamp(min=1.0)  # guard against division by zero
    return (summed / token_counts).cpu().numpy()

In [None]:
def calculate_similarity(text1: str, text2: str, token_length=20):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is embedded with ``get_embeddings`` using the same
    ``token_length``. The ``[0][0]`` entry of the similarity matrix computed
    from the two embeddings is returned.
    """
    first_vec, second_vec = (
        get_embeddings(phrase, token_length=token_length)
        for phrase in (text1, text2)
    )
    return cosine_similarity(first_vec, second_vec)[0][0]
    

## Let's read the training dataset

In [None]:
import pandas as pd

In [None]:
# NOTE(review): `_files` is built with `rglob`, whose result order is not
# guaranteed to be stable across filesystems. Position 1 is presumably the
# training CSV — confirm, or select the file by name instead of by index.
train = pd.read_csv(_files[1])

In [None]:
# Container for the similarity scores, one entry per scored row.
res: List = list()

In [None]:
from torch import Tensor

In [None]:
from tqdm import tqdm

In [None]:
# Score every (anchor, target) pair of the training frame.
# `res` is rebuilt from scratch here, so re-running this cell cannot append a
# second copy of the scores. Pairs are iterated positionally, so the loop does
# not depend on `train` having a 0..n-1 index.
res = [
    calculate_similarity(anchor, target)
    for anchor, target in tqdm(zip(train.anchor, train.target), total=len(train))
]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# `sns.displot` is a figure-level function: it creates its own figure, so a
# preceding `plt.figure(figsize=...)` only leaves an empty figure behind and
# does not size the plot. Size it through `height`/`aspect` instead
# (height=8, aspect=1 -> an 8x8 inch facet).
facet = sns.displot(res, height=8, aspect=1)
facet.ax.set_title('Distribution of predicted values.')
facet.ax.grid()

In [None]:
# Start the output frame from the `id` column of the training data.
_sub = train.id.to_frame()

In [None]:
# Attach the scores positionally. Assigning a plain array (rather than a
# DataFrame) skips index alignment and raises if the number of scores does not
# match the number of rows, instead of silently truncating or filling with NaN.
_sub['pred'] = np.asarray(res)

In [None]:
# Sanity check: inspect the column dtypes of the output frame.
_sub.dtypes

In [None]:
# Write the `id`/`pred` frame to submission.csv, floats formatted to 3 decimals.
# NOTE(review): a later cell writes to the same 'submission.csv' path, so this
# file gets overwritten — confirm whether this first write is still needed.
_sub.to_csv('submission.csv', float_format='%.3f', index= False )

In [None]:
import datasets

In [None]:
# Build the submission dataset: ids plus the predictions exposed as `score`.
submission_columns = {'id': _sub.id, 'score': _sub.pred}
submission = datasets.Dataset.from_dict(submission_columns)

In [None]:
# Write the `submission` dataset (columns `id` and `score`) to submission.csv.
submission.to_csv('submission.csv', index=False)