In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
label_encode = LabelEncoder()
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import transformers
from transformers import RobertaTokenizer, TFRobertaModel
transformers.logging.set_verbosity_error()
import re

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition train and test tables from the Kaggle input directory.
traindf=pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv")
testdf=pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv")
# Quick sanity check of (rows, columns) for both frames.
print(traindf.shape)
print(testdf.shape)

In [None]:
traindf["score"].value_counts().plot(kind="bar")

In [None]:
traindf.head()

In [None]:
# Encode each distinct `score` value as an integer class id.
# NOTE(review): `Score_class` is not read again in the visible notebook; the
# model is trained on the raw `score` column.
traindf['Score_class'] = label_encode.fit_transform(traindf['score'])

In [None]:
# `cate` is the first character of the context code; it is used further down
# as the join key for the broad title lookup.
traindf["cate"]=traindf['context'].apply(lambda x: x[0])
testdf["cate"]=testdf['context'].apply(lambda x: x[0])
traindf.head()

In [None]:
testdf.head()

Getting supporting data from the CPC codes dataset: https://www.kaggle.com/datasets/xhlulu/cpc-codes


In [None]:
# Load the CPC lookup table; its `code` and `title` columns are used below to
# attach a descriptive title to every context code.
cpc = pd.read_csv('../input/cpc-codes/titles.csv')
cpc.head()

In [None]:
def clean(x):
    """Normalise a title string.

    The text is lower-cased, the characters ``[ ] ( ) { } ; , :`` are removed
    and every ``/`` or ``-`` is turned into a single space.
    """
    # Every rule targets a single character and none of the replacements
    # produces another target, so one translation pass covers all of them.
    removals = dict.fromkeys("[;,]:(){}", None)
    table = str.maketrans({**removals, "/": " ", "-": " "})
    return x.lower().translate(table)

# Normalise every CPC title in place; `clean` is passed directly as the mapper.
cpc['title'] = cpc['title'].apply(clean)

In [None]:
cpc.head()

#### Getting the broader category titles related to the CONTEXT of ANCHOR and TARGET

In [None]:
# Rename the lookup key so it matches the `context` column of the train/test
# frames, then left-join the title so that every original row is kept.
# Note: `cpc` is reassigned here and the next cell renames it again, so these
# two cells only work when run once, in order.
cpc = cpc.rename(columns = {"code" : "context"})
traindf = pd.merge(traindf, cpc[["context","title"]], on ="context", how = "left")
testdf = pd.merge(testdf, cpc[["context","title"]], on ="context", how = "left")

In [None]:
# Reuse the same lookup table keyed on `cate` (the first character of the
# context code) to attach a second, broader title as `broad_title`.
cpc = cpc.rename(columns = {"context" : "cate","title":"broad_title"})
traindf = pd.merge(traindf, cpc[["cate","broad_title"]], on ="cate", how = "left")
testdf = pd.merge(testdf, cpc[["cate","broad_title"]], on ="cate", how = "left")

In [None]:
traindf.head(5)

In [None]:
# Shuffle the training rows once. A fixed seed makes the order reproducible
# across "Restart & Run All"; this matters because model.fit below uses
# validation_split=0.2, which Keras takes from the end of the arrays, so the
# row order decides which rows end up in the validation set.
traindf = traindf.sample(frac = 1, random_state = 42).reset_index(drop=True)
traindf.head(5)

In [None]:
traindf["cate"].value_counts().plot(kind="bar")

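* `anchor_title` - anchor joined with the title of its context code
* `anchor_broad_title` - anchor joined with the broad title (looked up from the first letter of the context code)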

In [None]:
# Concatenate anchor text and context title into one string per row.
# `astype(str)` means a row without a matched title gets the literal text "nan".
traindf["anchor_title"] = traindf["anchor"].astype(str) + " " + traindf["title"].astype(str)

In [None]:
# Peek at the 8th column of the first row (presumably `title` - confirm against
# the column order). Row and column are given together to `iloc` so the lookup
# is explicitly positional; an integer key on the string-labelled row Series
# relies on a deprecated positional fallback.
traindf.iloc[0, 7]

In [None]:
# Same concatenation as `anchor_title`, but using the broad title.
traindf["anchor_broad_title"] = traindf["anchor"].astype(str) + " " + traindf["broad_title"].astype(str)

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer()

# Cache for the stopword set so the corpus is read once, not once per word.
_STOPWORDS_BY_LANG = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    if "english" not in _STOPWORDS_BY_LANG:
        _STOPWORDS_BY_LANG["english"] = frozenset(stopwords.words('english'))
    return _STOPWORDS_BY_LANG["english"]


def lemma_stopwords(sent):
    """Drop English stopwords from `sent` and lemmatise the remaining words as verbs.

    Parameters
    ----------
    sent : str
        Text to process. It is split on single spaces; matching against the
        stopword list is case-sensitive.

    Returns
    -------
    str
        The kept words, each passed through ``lemmatizer.lemmatize(word, pos="v")``,
        joined with single spaces.
    """
    # The original evaluated stopwords.words('english') for every word of every
    # sentence and searched the resulting list; a cached set gives the same
    # membership answers without the repeated work.
    stop_set = _english_stopwords()
    return " ".join(
        lemmatizer.lemmatize(word, pos="v")
        for word in sent.split(" ")
        if word not in stop_set
    )

In [None]:
lemma_stopwords('combustion engines hot gas or combustion product engine plants')

In [None]:
# Run stopword removal + lemmatisation over both combined text columns.
for text_col in ("anchor_title", "anchor_broad_title"):
    traindf[text_col] = traindf[text_col].apply(lemma_stopwords)

In [None]:
traindf.head()

### Tokenization and model inputs

In [None]:
# Fixed token length: create_data pads/truncates every pair to this size and
# build_model uses it as the width of both input layers.
max_len = 128

In [None]:
def create_data(id_, anchor_title, target, score, train=True):
    """Tokenize (anchor_title, target) text pairs into fixed-length model inputs.

    Relies on the notebook-level ``tokenizer`` and ``max_len`` being defined
    before the function is called.

    Parameters
    ----------
    id_ : sequence
        Row identifiers, one per pair; returned as a list under ``"ids"``.
    anchor_title, target : sequence of str
        First and second text of each pair, consumed in positional order.
    score : sequence or None
        Target values; only read when ``train`` is true.
    train : bool, default True
        When false no labels are collected and the returned label list is empty.

    Returns
    -------
    tuple (dict, list)
        A dict with the keys ``"input_ids"``, ``"attention_mask"`` and
        ``"ids"`` (each a list with one entry per pair) and the list of labels.
    """
    # Materialise everything positionally. Indexing a pandas Series with
    # ``series[i]`` looks up the *label* i, which misaligns ids/labels with the
    # tokenized pairs (or raises KeyError) whenever the index is not 0..n-1,
    # e.g. after filtering or shuffling without reset_index.
    pairs = list(zip(anchor_title, target))
    ids = list(id_)
    labels = list(score) if train else []

    # Each pair is encoded as two segments, padded/truncated to max_len tokens.
    tok_txt = tokenizer.batch_encode_plus(
        pairs,
        max_length=max_len,
        padding='max_length',
        truncation=True)

    return {"input_ids": list(tok_txt['input_ids']),
            "attention_mask": list(tok_txt['attention_mask']),
            "ids": ids,
            }, labels

In [None]:
# Load the RoBERTa tokenizer from the local roberta-base input directory;
# create_data reads this notebook-level `tokenizer`.
tokenizer = RobertaTokenizer.from_pretrained("../input/roberta-base/")

In [None]:
# Tokenize the (anchor_title, target) pairs of the training frame and collect
# the raw `score` values as regression labels.
train_data, train_labels = create_data(traindf['id'], traindf['anchor_title'], 
                                       traindf['target'], traindf['score'], train=True)


In [None]:
def build_model():
    """Build and compile the RoBERTa-based regression model.

    Two int32 inputs of width ``max_len`` (token ids and attention mask) feed a
    TFRobertaModel loaded from ``../input/roberta-base/``. Its last hidden
    state is average-pooled over the sequence axis and mapped to one output by
    a single Dense unit. The model is compiled with a default-constructed Adam
    optimizer and MSE as both loss and metric.

    Returns
    -------
    tf.keras.Model
        The compiled model.
    """
    ids_in = Input(shape=(max_len, ), dtype=tf.int32)
    mask_in = Input(shape=(max_len, ), dtype=tf.int32)

    encoder = TFRobertaModel.from_pretrained("../input/roberta-base/")
    encoded = encoder(input_ids=ids_in, attention_mask=mask_in)

    # NOTE(review): the attention mask is not passed to the pooling layer, so
    # the average runs over all max_len positions including padding - confirm
    # this is intended.
    pooled = tf.keras.layers.GlobalAveragePooling1D()(encoded.last_hidden_state)
    prediction = Dense(1)(pooled)

    regressor = tf.keras.Model(inputs=[ids_in, mask_in], outputs=prediction)
    regressor.compile(optimizer=tf.keras.optimizers.Adam(),
                      loss="mse",
                      metrics=["mse"])
    return regressor

In [None]:
def scheduler(epoch):
    """Learning rate for the given epoch number.

    Epoch 0 gets 5% of the 2e-5 base rate; every other epoch gets the base
    rate multiplied by 0.9 raised to the epoch number.
    """
    base_lr = 2e-5
    return base_lr * 0.05 if epoch == 0 else base_lr * (0.9**epoch)
    
# Wrap `scheduler` in a Keras callback; it is passed to model.fit below.
callback_lr = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [None]:
# Build the model and train it on the tokenized pairs.
# - validation_split=0.2 holds out part of the training arrays; the callbacks
#   monitor `val_mse` on that held-out part.
# - EarlyStopping stops after 3 epochs without improvement and restores the
#   best weights.
# - ModelCheckpoint writes only the best weights to 'roberta_uspppm.h5'.
# - callback_lr applies the per-epoch learning rate from `scheduler`.
model = build_model()
model.fit((np.array(train_data['input_ids']),
           np.array(train_data['attention_mask'])),
        np.array(train_labels).ravel(), 
        epochs = 10,
        shuffle=True,
        callbacks = [EarlyStopping(monitor='val_mse', patience=3, restore_best_weights=True), 
                     ModelCheckpoint('roberta_uspppm.h5', monitor='val_mse', 
                                     save_best_only=True, save_weights_only=True), 
                     callback_lr],                     
        batch_size = 16,
        validation_split=0.2 )

In [None]:
# Build the same combined text columns for the test frame as for training:
# anchor + title text, then stopword removal and lemmatisation.
for title_col, joined_col in (("title", "anchor_title"),
                              ("broad_title", "anchor_broad_title")):
    combined = testdf["anchor"].astype(str) + " " + testdf[title_col].astype(str)
    testdf[joined_col] = combined.apply(lemma_stopwords)

In [None]:
testdf.head()

In [None]:
# Tokenize the test pairs; with train=False no labels are collected, so
# `test_labels` is an empty list.
test_data, test_labels = create_data(testdf['id'], testdf['anchor_title'], 
                                     testdf['target'], None, train=False)

In [None]:
# Predict on the test set, feeding token ids and attention masks in the same
# order as during training.
test_preds = model.predict((np.array(test_data['input_ids']),
                            np.array(test_data['attention_mask'])))

In [None]:
submission = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv")
# The model has a single Dense(1) output, so predictions come back with a
# trailing axis of size 1; flatten to a 1-D float64 vector before assigning.
# NOTE(review): the assignment is positional - it assumes sample_submission
# rows are in the same order as testdf. Confirm.
submission['score'] = np.asarray(test_preds, dtype="float64").ravel()
# Clamp predictions into the [0, 1] range in one vectorised step.
submission['score'] = submission['score'].clip(lower=0, upper=1)


In [None]:
submission.describe()

In [None]:
def convert_pred(num):
    """Map a continuous prediction onto the score grid {0, 0.25, 0.5, 0.75, 1}.

    Buckets (upper bounds inclusive): ``<= 0.15 -> 0``, ``<= 0.35 -> 0.25``,
    ``<= 0.65 -> 0.5``, ``<= 0.85 -> 0.75``, anything larger ``-> 1``.

    Values below 0 map to 0; the original lower bound ``num >= 0`` sent them
    to the final ``else`` branch and returned 1. NaN fails every comparison
    and still returns 1, as before.
    """
    if num <= 0.15:
        return 0
    elif num <= 0.35:
        return 0.25
    elif num <= 0.65:
        return 0.5
    elif num <= 0.85:
        return 0.75
    else:
        return 1

In [None]:
# Snap every prediction to its score bucket, then preview the result.
bucketed_scores = submission["score"].apply(convert_pred)
submission["score"] = bucketed_scores
submission.head(10)

In [None]:
# Write the submission file to the current working directory, without the
# index column.
submission.to_csv('submission.csv',index=False)

# In Progress

* https://towardsdatascience.com/bert-for-measuring-text-similarity-eec91c6bf9e1
* https://www.analyticsvidhya.com/blog/2021/05/measuring-text-similarity-using-bert/
* https://www.kaggle.com/code/junjitakeshima/uspppm-simple-roberta-starter-eng