# In this notebook we attempt a solution as a baseline 

### We will be using BERT as the model and computing the cosine similarity
### between the anchor and target phrases without considering the context

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from numpy.linalg import norm
from scipy.stats import pearsonr

import os
# Walk the /kaggle/input tree and print the full path of every file found,
# so the dataset paths used in later cells can be checked.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag

In [None]:
!pip install -U sentence-transformers

In [None]:
# Shared lemmatizer instance; clean_text() calls wnl.lemmatize on each token.
wnl = WordNetLemmatizer()
# English stopwords held in a set; clean_text() tests token membership against it.
stop_words = set(stopwords.words('english'))

In [None]:
# Read the competition CSVs; the "id" column is discarded right after loading.
train_df = pd.read_csv(
    "/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv"
).drop("id", axis=1)
test_df = pd.read_csv(
    "/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv"
).drop("id", axis=1)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Number of distinct anchors, contexts and targets in the training frame,
# displayed as a 3-tuple in that order.
train_df["anchor"].nunique(), train_df["context"].nunique(), train_df["target"].nunique()

## Functions:

In [None]:
def clean_text(corpus, remove_stop_words = True):
    '''
    Function to clean a given corpus - lower the words, strip of the spaces,
    optionally remove stopwords and lemmatize the corpus.

    Every token is POS-tagged so it can be lemmatized with the matching
    WordNet part of speech. pos_tag emits Penn Treebank tags, in which
    adjectives start with 'J' (JJ, JJR, JJS) rather than 'A', so the first
    letter of the tag is translated to the WordNet code before lemmatizing.
    Tokens whose tag is not an adjective, noun or verb fall back to the
    lemmatizer's default part of speech.

    Args:
        corpus: the text to be cleaned
        remove_stop_words: whether to remove stopwords
    Returns:
        filtered_sentence: cleaned corpus (lemmas joined by single spaces)
    '''
    # First letter of the (lower-cased) Penn Treebank tag -> WordNet POS code.
    penn_to_wordnet = {'j': 'a', 'n': 'n', 'v': 'v'}

    corpus = corpus.lower().strip()
    # Tokenize and tag once; tagging sees the full sentence (stopwords
    # included) and filtering happens afterwards.
    tagged_tokens = pos_tag(word_tokenize(corpus))

    lemmas = []
    for token, tag in tagged_tokens:
        if remove_stop_words and token in stop_words:
            continue
        wordnet_pos = penn_to_wordnet.get(tag[0].lower())
        if wordnet_pos is None:
            lemmas.append(wnl.lemmatize(token))
        else:
            lemmas.append(wnl.lemmatize(token, wordnet_pos))

    filtered_sentence = " ".join(lemmas)
    return filtered_sentence

def cosine(a,b):
    '''
    Function to calculate cosine similarity of two vectors
    Args:
        a,b: vectors to calculate cosine between a and b
    Returns:
        cosine similarity of the given vectors, or 0.0 when either vector
        has zero norm (the similarity is undefined in that case)
    '''
    denominator = norm(a)*norm(b)
    # A zero-norm vector would turn the division into 0/0, yielding nan and a
    # RuntimeWarning; return a neutral similarity instead.
    if denominator == 0:
        return 0.0
    return np.dot(a,b)/denominator

In [None]:
# Clean both phrase columns, keeping stopwords. The columns are overwritten
# in place, so re-running this cell cleans already-cleaned text again.
train_df["anchor"] = train_df["anchor"].apply(clean_text, remove_stop_words=False)
train_df["target"] = train_df["target"].apply(clean_text, remove_stop_words=False)

In [None]:
from sentence_transformers import SentenceTransformer

# Location where a copy of the model is written.
modelPath = "/kaggle/working/bert-base"

# Load the pretrained sentence-embedding model by name, then save it to modelPath.
model = SentenceTransformer('bert-base-nli-mean-tokens')
model.save(modelPath)
# The saved copy can be loaded instead with: SentenceTransformer(modelPath)

In [None]:
# Embed every anchor phrase in batches of 128.
anchors = train_df["anchor"].tolist()
# TODO: explore the normalize embeddings param of encode().
anchor_embed = model.encode(anchors, batch_size=128, show_progress_bar=True)

In [None]:
# Embed every target phrase in batches of 128.
targets = train_df["target"].tolist()
# TODO: explore the normalize embeddings param of encode().
target_embed = model.encode(targets, batch_size=128, show_progress_bar=True)

In [None]:
# Cosine similarity between each anchor embedding and the target embedding
# at the same position.
sims = [
    cosine(anchor_vec, target_vec)
    for anchor_vec, target_vec in zip(anchor_embed, target_embed)
]

In [None]:
# Min-max scale the similarities so the smallest becomes 0 and the largest 1.
# NOTE(review): if every similarity is identical, max_val - min_val is 0 and
# the division below is undefined.
max_val = max(sims)
min_val = min(sims)
# `sims` is a Python list; the subtraction presumably works because min_val is
# a NumPy scalar, which broadcasts over the list — TODO confirm.
sim_norm = (sims-min_val)/(max_val-min_val)
# Snap each value down to a multiple of 0.25 (0, 0.25, 0.5, 0.75, 1).
# NOTE(review): with floor, only the single largest similarity maps to 1.0;
# confirm that floor (rather than rounding) is intended.
sim_norm = np.floor(sim_norm*4)/4

In [None]:
# Ground-truth scores as a standalone NumPy array.
y = train_df["score"].to_numpy(copy=True)

In [None]:
# Pearson correlation between the true scores and the quantized similarities;
# the second value returned by pearsonr (the p-value) is discarded.
corr,_ = pearsonr(y,sim_norm)
print(corr)

## Although the obtained correlation isn't that good, it's good enough for a baseline solution
## without context, and we can build up from this point

### The following lines fetch the sentence-transformers package as a GitHub repo

In [None]:
# Clone the sentence-transformers source repository via a shell command.
!git clone https://github.com/UKPLab/sentence-transformers.git

In [None]:
# Show the entries now present in /kaggle/working/.
os.listdir("/kaggle/working/")