In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath /kaggle/input.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -U sentence-transformers

In [None]:
!pip install datasets

# Imports

In [None]:
import datasets
from datasets import load_dataset, Dataset, DatasetDict

from torch.utils.data import DataLoader

from sentence_transformers import SentenceTransformer, InputExample, losses

from sentence_transformers import  util
from sentence_transformers import evaluation

from sklearn.model_selection import train_test_split

In [None]:
# Directory holding the competition CSV files; defined once so that both reads
# below stay in sync if the data location changes.
DATA_DIR = '../input/us-patent-phrase-to-phrase-matching'

# Load the train and test tables into DataFrames.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Dataset prep

In [None]:
# Split `train` into a training part and a held-out validation part.
# The last column is treated as the label (presumably `score` -- it is read
# under that name further down) and every other column as a feature.
# No test_size is passed, so scikit-learn's default proportion applies;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train.iloc[:, :-1],
    train.iloc[:, -1:],
    random_state=42,
)

# Glue features and label back together for each part. reset_index() (without
# drop=True) keeps the original row labels in a new 'index' column.
train_ds_df, val_ds_df = (
    pd.concat([features, labels], axis=1).reset_index()
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

# Preview the first rows of both frames.
print(train_ds_df.head())
print(val_ds_df.head())

In [None]:
# Wrap the training DataFrame in a Hugging Face `Dataset`.
# The bare name on the last line makes the notebook display its summary.
train_ds = Dataset.from_pandas(df=train_ds_df)
train_ds

In [None]:
# Drop the columns that are not used when building training examples
# ('index' is the column produced by reset_index() on the DataFrame).
# Only columns that are still present are removed, so re-running this cell
# after the columns are already gone does not raise.
train_ds = train_ds.remove_columns(
    [col for col in ('id', 'context', 'index') if col in train_ds.column_names]
)
train_ds

In [None]:
# Wrap the validation DataFrame in a Hugging Face `Dataset`.
# The bare name on the last line makes the notebook display its summary.
val_ds = Dataset.from_pandas(df=val_ds_df)
val_ds

In [None]:
# Drop the unused columns from the validation dataset
# ('index' is the column produced by reset_index() on the DataFrame).
# Only columns that are still present are removed, so re-running this cell
# after the columns are already gone does not raise.
val_ds = val_ds.remove_columns(
    [col for col in ('id', 'context', 'index') if col in val_ds.column_names]
)
val_ds

In [None]:
# Build one InputExample per training row: the (anchor, target) phrase pair is
# the input text pair and the row's score is the label.
train_examples = [
    InputExample(texts=[example['anchor'], example['target']], label=example['score'])
    for example in train_ds
]

# Train sentence Transformer

In [None]:
# Training batches: 16 examples per batch, with shuffling enabled.
train_dataloader = DataLoader(train_examples, batch_size=16, shuffle=True)

# Evaluator over the held-out pairs: anchors vs. targets, compared against the
# gold scores. write_csv=True is passed explicitly, as in the original setup.
evaluator = evaluation.EmbeddingSimilarityEvaluator(
    sentences1=list(val_ds_df['anchor']),
    sentences2=list(val_ds_df['target']),
    scores=list(val_ds_df['score']),
    write_csv=True,
)

# Pretrained checkpoint that will be fine-tuned.
model = SentenceTransformer('AI-Growth-Lab/PatentSBERTa')

# Training objective, bound to the model created above.
train_loss = losses.CosineSimilarityLoss(model)

In [None]:
# Fine-tune the model on the training pairs for 6 epochs with 100 warm-up
# steps; the evaluator is run every 500 training steps.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=6,
    warmup_steps=100,
    evaluation_steps=500,
)

# Pearson correlation on validation data

In [None]:
# Collect the validation anchors and targets as plain Python lists, in row
# order. Series.tolist() converts each column in one step, which avoids the
# slow row-by-row DataFrame.iterrows() loop and yields the same lists.
val_anchors = val_ds_df['anchor'].tolist()
val_targets = val_ds_df['target'].tolist()

In [None]:
# Embed the validation anchors first, then the validation targets, with the
# fine-tuned model; row i of each result belongs to validation pair i.
queries_embedding, passages_embedding = (
    model.encode(sentences) for sentences in (val_anchors, val_targets)
)

In [None]:
# Similarity score for each (anchor, target) validation pair.
# Cosine similarity is used instead of the raw dot product: the model is
# trained with CosineSimilarityLoss, and a dot product of embeddings that are
# not explicitly normalised also reflects their norms, which would distort
# the correlation computed afterwards.
# Each list entry is the tensor returned by util.cos_sim for a single pair.
scores = [
    util.cos_sim(query_vec, passage_vec)
    for query_vec, passage_vec in zip(queries_embedding, passages_embedding)
]

In [None]:
# Convert every per-pair score to a NumPy array, stack them along axis 0 and
# flatten the result to a 1-D vector with one entry per stacked row.
stacked_scores = np.concatenate(
    [pair_score.numpy() for pair_score in scores],
    axis=0,
)
scores_nd = stacked_scores.reshape((len(stacked_scores),))
print(scores_nd.shape)

In [None]:
# Ground-truth similarity scores of the validation rows, as a NumPy array.
val_scores = np.array(val_ds_df.loc[:, 'score'])

In [None]:
# Pearson correlation helper
def corr(eval_pred):
    """Return the Pearson correlation of the sequences packed in `eval_pred`.

    `eval_pred` is unpacked directly into `np.corrcoef`; it is called below
    with a `(predictions, targets)` pair. The off-diagonal entry of the
    resulting correlation matrix is returned under the key 'pearson'.
    """
    correlation_matrix = np.corrcoef(*eval_pred)
    pearson = correlation_matrix[0][1]
    return {'pearson': pearson}

In [None]:
# Pearson correlation between the model's similarity scores and the
# ground-truth validation scores (displayed as the cell output).
corr(eval_pred=(scores_nd, val_scores))