In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from numpy.linalg import norm

import os

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag

In [None]:
!cp -r /kaggle/input/train-baseline-bert-w-o-context/sentence-transformers /tmp/sentence-transformers
!pip install /tmp/sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

# Location of the saved model inside the attached Kaggle input dataset.
modelPath = "/kaggle/input/train-baseline-bert-w-o-context/bert-base/"

# Load the SentenceTransformer from that path; `model` is used further down to embed the phrases.
model = SentenceTransformer(modelPath)

In [None]:
# Shared lemmatizer instance used by clean_text.
wnl = WordNetLemmatizer()
# English stopwords held as a set; clean_text checks tokens against it when remove_stop_words is True.
stop_words = set(stopwords.words('english'))

## Functions

In [None]:
def clean_text(corpus, remove_stop_words = True):
    '''
    Function to clean a given corpus - lower the words, strip surrounding whitespace,
    optionally drop stopwords, and lemmatize every remaining token.
    Args:
        corpus: the text to be cleaned
        remove_stop_words: whether to remove stopwords
    Returns:
        filtered_sentence: the cleaned tokens joined by single spaces
    '''
    corpus = corpus.lower().strip()
    # Tokenize once and reuse the tokens for POS tagging.
    word_tokens = word_tokenize(corpus)

    lemmas = []
    # Tagging runs over the full token sequence; stopwords are filtered only afterwards.
    for token, tag in pos_tag(word_tokens):
        if remove_stop_words and token in stop_words:
            continue
        pos = tag[0].lower()
        # NOTE(review): pos_tag presumably yields Penn Treebank tags, where adjectives start
        # with 'J'; if so the 'a' case never matches and adjectives take the default path below.
        # Left unchanged because the encoder may have been trained on text cleaned this way -
        # confirm before changing.
        if pos in ('a', 'n', 'v'):
            lemmas.append(wnl.lemmatize(token, pos))
        else:
            lemmas.append(wnl.lemmatize(token))

    filtered_sentence = " ".join(lemmas)
    return filtered_sentence

def cosine(a,b):
    '''
    Function to calculate cosine similarity of two vectors
    Args:
        a,b: vectors to calculate cosine between
    Returns:
        cosine similarity of the given vectors; 0.0 when either vector has zero norm
    '''
    denominator = norm(a) * norm(b)
    # A zero-norm vector has no direction: return 0.0 instead of dividing by zero,
    # which would otherwise put NaN into the results.
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [None]:
# Read the competition test set; later cells use its "anchor", "target" and "context" columns.
test_df = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv")

In [None]:
# Preview the first rows of the test data as loaded.
test_df.head()

In [None]:
# Normalize both phrase columns with clean_text, overwriting them in place.
# Stopwords are kept (remove_stop_words=False); the keyword is forwarded by Series.apply.
test_df["anchor"] = test_df["anchor"].apply(clean_text, remove_stop_words=False)
test_df["target"] = test_df["target"].apply(clean_text, remove_stop_words=False)

In [None]:
# Pull the cleaned phrases out of the frame as plain lists, in row order.
anchors = test_df["anchor"].to_list()
targets = test_df["target"].to_list()

# Embed each side separately with the same settings; entry i of both results
# comes from row i of test_df.
anchor_embed = model.encode(anchors, batch_size=256, show_progress_bar=True)
target_embed = model.encode(targets, batch_size=256, show_progress_bar=True)

In [None]:
# Cosine similarity between each anchor embedding and the target embedding of the same row
sims = [
    cosine(anchor_vec, target_vec)
    for anchor_vec, target_vec in zip(anchor_embed, target_embed)
]

In [None]:
# Load the sample submission; it is only previewed below (presumably as a format reference).
# Uses the absolute /kaggle/input path, like the test.csv read, so the cell does not depend
# on the notebook's working directory.
df = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv")

In [None]:
# Preview the first rows of the sample submission.
df.head()

In [None]:
# Start the submission frame from the test data minus the phrase and context columns.
submission_df = test_df.drop(["anchor", "target", "context"], axis="columns")

In [None]:
# Attach the similarities as the predicted score; sims follows the row order of test_df.
submission_df["score"] = sims

In [None]:
# Write the submission file, leaving out the DataFrame index.
submission_df.to_csv("submission.csv",index=False)