In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer

import os
# Walk the Kaggle input mount and print the full path of every file found,
# so the available datasets are visible at the top of the notebook.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# Config

In [None]:
# Path to the pretrained checkpoint; used for both the tokenizer and the transformer backbone.
model_name = "../input/debertalarge"
# Token sequence length: the tokenizer pads/truncates to it and the model inputs are this wide.
max_len = 40
# Maximum number of epochs passed to model.fit (early stopping may end training sooner).
NUM_EPOCHS = 10
# Mini-batch size passed to model.fit.
batch_size=8
# Base learning rate used by the Adam optimizer and by the per-epoch schedule.
LR = 1e-5

# Reading the training and test datasets

In [None]:
# Load the competition training pairs and preview the first rows.
train = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv")
train.head()

In [None]:
# Load the competition test pairs and preview the first rows.
test = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")
test.head()

# Importing *Cooperative Patent Classification Codes Meaning* dataset
More details regarding the dataset can be found [here](https://www.kaggle.com/datasets/xhlulu/cpc-codes)

In [None]:
# Load the CPC code lookup table; its `code` and `title` columns are joined onto train/test below.
title = pd.read_csv("../input/cpc-codes/titles.csv")
title.head()

# Merging the CPC codes dataset with both the training and test datasets

In [None]:
# Attach the CPC code and its title to each row by matching `context` against `code`.
# A left join is used so rows whose context has no match in the lookup are kept.
cpc_lookup = title[["code", "title"]]

train = train.merge(cpc_lookup, how='left', left_on="context", right_on="code")
test = test.merge(cpc_lookup, how='left', left_on="context", right_on="code")

In [None]:
# Preview the training frame after the merge (it now carries the `code` and `title` columns).
train.head()

In [None]:
# Preview the test frame after the merge (it now carries the `code` and `title` columns).
test.head()

# Performing data pre-processing on the training and test set data
#### Dropping the *context* column

In [None]:
# `context` duplicates the merged `code` column, so remove it from the training frame.
train = train.drop(columns=["context"])
train.head()

In [None]:
# `context` duplicates the merged `code` column, so remove it from the test frame.
test = test.drop(columns=["context"])
test.head()

**Transform the *anchor* and *target* features into lowercase.**

In [None]:
# Lower-case both phrase columns in the training and the test frame.
for frame in (train, test):
    for column in ('anchor', 'target'):
        frame[column] = frame[column].str.lower()
train.head()

**Removing punctuation from the *anchor* and *target* features**

In [None]:
# Remove every character that is neither a word character nor whitespace.
# The pattern is written as a raw string so that \w and \s reach the regex engine
# as-is; in a normal string literal they are invalid escape sequences and trigger
# a DeprecationWarning (SyntaxWarning on recent Python versions).
train['anchor'] = train['anchor'].str.replace(r'[^\w\s]', '', regex=True)
train['target'] = train['target'].str.replace(r'[^\w\s]', '', regex=True)
test['anchor'] = test['anchor'].str.replace(r'[^\w\s]', '', regex=True)
test['target'] = test['target'].str.replace(r'[^\w\s]', '', regex=True)
train.head()

**Removing punctuation from the *title* feature**

In [None]:
import re
# Strip semicolons and commas from the CPC titles.
# The vectorised .str accessor is used instead of apply(re.sub): it leaves missing
# titles (NaN from an unmatched left merge) untouched, whereas re.sub would raise
# a TypeError on them.
train["title"] = train["title"].str.replace('[;,]', '', regex=True)
test["title"] = test["title"].str.replace('[;,]', '', regex=True)

In [None]:
# Preview the training frame after text clean-up.
train.head()

In [None]:
# Preview the test frame after text clean-up.
test.head()

# K-fold cross validation
We assign stratified K-fold cross-validation folds to the training set. Because `score` is a numeric target, it is binned first and the folds are stratified on the bins. The approach is adapted from the Kaggle notebook linked [here](https://www.kaggle.com/code/abhishek?scriptVersionId=90918173&cellId=2).

In [None]:
from sklearn import model_selection

def create_kfold_dataset(dataset, num_splits, random_state=42):
    """Shuffle ``dataset`` and assign each row to a stratified fold.

    The numeric ``score`` column is cut into equal-width bins (bin count from
    Sturges' rule) and the folds are stratified on those bins, so every fold
    sees a similar score distribution.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a ``score`` column. It is not modified.
    num_splits : int
        Number of folds.
    random_state : int, optional
        Seed used for both the row shuffle and the fold split, making the
        fold assignment reproducible across runs.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy of ``dataset`` with a fresh 0..n-1 index and an
        integer ``kfold`` column holding the validation fold of each row.
    """
    # sample() returns a new frame, so the caller's frame is left untouched;
    # the seed keeps the shuffle (and therefore the folds) reproducible.
    shuffled = dataset.sample(frac=1, random_state=random_state).reset_index(drop=True)
    shuffled["kfold"] = -1

    # Sturges' rule (floored) for the number of bins.
    num_bins = int(np.floor(1 + np.log2(len(shuffled))))

    # Bin the targets; kept as a standalone Series so no temporary column is needed.
    score_bins = pd.cut(shuffled["score"], bins=num_bins, labels=False)

    kf = model_selection.StratifiedKFold(
        n_splits=num_splits, shuffle=True, random_state=random_state
    )

    # Stratify on the bins rather than on the raw scores. The index was reset
    # above, so the positional indices returned by split() are valid labels for .loc.
    for fold, (_, val_idx) in enumerate(kf.split(X=shuffled, y=score_bins.values)):
        shuffled.loc[val_idx, "kfold"] = fold

    return shuffled

In [None]:
# Preview the training frame before folds are assigned.
train.head()

In [None]:
# Assign every training row to one of 5 stratified folds. A copy is passed in and the
# returned frame (shuffled, re-indexed, with a `kfold` column) replaces `train`.
train = create_kfold_dataset(train.copy(), num_splits=5)
train.head()

In [None]:
# Count the rows in each fold to check that the folds are of similar size.
train.kfold.value_counts()

# Initializing the tokenizer
**We use the deberta v2 large model from huggingface**

In [None]:
# Initialize the tokenizer from the same checkpoint directory that the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Display the tokenizer's repr to inspect its configuration.
tokenizer

In [None]:
# Tokenize a single target phrase (the row labelled 0) to show what the tokenizer returns.
encoded_input = tokenizer(train['target'][0])
print(encoded_input)

In [None]:
# Optionally save the tokenizer to the output folder (currently disabled)
#tokenizer.save_pretrained("./roberta_base/")

In [None]:
# Preview the training frame (now including the `kfold` column) before building model inputs.
train.head()

# Creating Phrase dataset for the model training

In [None]:
def create_data(id_, anchor, target, code, title, score, tokenizer, max_len, train_status=True):
    """Tokenize phrase pairs and package them as model inputs.

    Each example is encoded as a sentence pair: the first segment is the anchor
    followed by a space and the CPC title, the second segment is the target.
    Sequences are padded/truncated to ``max_len`` tokens.

    Parameters
    ----------
    id_ : list
        Row identifiers, returned unchanged under the ``"ids"`` key.
    anchor, target, title : list
        Parallel lists with the anchor phrase, target phrase and CPC title.
    code : list
        Unused; kept so existing positional calls keep working.
    score : list or None
        Labels; only read when ``train_status`` is true.
    tokenizer : object
        Tokenizer exposing ``batch_encode_plus`` whose result maps
        ``"input_ids"`` and ``"attention_mask"`` to per-example sequences.
    max_len : int
        Fixed sequence length.
    train_status : bool, optional
        When false no labels are collected and an empty list is returned.

    Returns
    -------
    tuple(dict, list)
        ``{"input_ids", "attention_mask", "ids"}`` and the list of labels.
    """
    def _first_segment(anchor_text, title_text):
        # A title may be missing (e.g. NaN, a float, after an unmatched left
        # merge); fall back to the anchor alone instead of failing on str + float.
        if isinstance(title_text, str):
            return anchor_text + " " + title_text
        return anchor_text

    pairs = [
        (_first_segment(anchor_text, title_text), target_text)
        for anchor_text, target_text, title_text in zip(anchor, target, title)
    ]
    encoded = tokenizer.batch_encode_plus(
        pairs, max_length=max_len, padding="max_length", truncation=True
    )

    n_rows = len(anchor)
    features = {
        "input_ids": list(encoded["input_ids"])[:n_rows],
        "attention_mask": list(encoded["attention_mask"])[:n_rows],
        "ids": list(id_)[:n_rows],
    }
    labels = list(score)[:n_rows] if train_status else []
    return features, labels

# Creating the training Model
We first create a basic deep learning model. More explanation will be given later.

In [None]:
import tensorflow as tf

def build_model(model_name, MAX_LEN, learning_rate=None):
    """Build and compile a transformer-based regression model.

    Token ids and the attention mask are passed through the pretrained
    transformer loaded from ``model_name``; its last hidden state is averaged
    over the sequence axis, passed through dropout (rate 0.3) and projected to
    a single linear output. The model is compiled with Adam and MSE loss.

    Parameters
    ----------
    model_name : str
        Name or path handed to ``TFAutoModel.from_pretrained``.
    MAX_LEN : int
        Length of the token-id and attention-mask inputs.
    learning_rate : float, optional
        Learning rate for Adam. Defaults to the notebook-level ``LR`` so
        existing two-argument calls behave as before.

    Returns
    -------
    tf.keras.Model
        Compiled model taking ``[input_ids, attention_mask]``.
    """
    if learning_rate is None:
        # Fall back to the notebook-wide configuration value.
        learning_rate = LR

    input__ids = tf.keras.Input(shape=(MAX_LEN, ), dtype = tf.int32)
    input__mask = tf.keras.Input(shape=(MAX_LEN, ), dtype = tf.int32)

    transformer = TFAutoModel.from_pretrained(model_name, trainable=True)
    x = transformer(input_ids = input__ids,
                    attention_mask = input__mask)
    # NOTE(review): the pooling layer is called without a mask, so padded
    # positions appear to be included in the average - confirm this is intended.
    x = tf.keras.layers.GlobalAveragePooling1D()(x.last_hidden_state)
    x = tf.keras.layers.Dropout(0.3)(x)

    # Single unit with no activation: an unbounded regression output.
    fnl = tf.keras.layers.Dense(1)(x)

    model = tf.keras.Model(inputs = [input__ids, input__mask],
                           outputs = fnl)
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate), loss = "mse")
    return model

# Creating the validation and training set for model training purpose

In [None]:
# Fold held out for validation; every other fold is used for training.
fld = 0

# Filter the frame once per split instead of once per column.
fold_train_df = train[train.kfold != fld]
fold_val_df = train[train.kfold == fld]

# Columns in the positional order expected by create_data.
feature_columns = ['id', 'anchor', 'target', 'code', 'title', 'score']

train_data, train_labels = create_data(
    *(fold_train_df[column].tolist() for column in feature_columns),
    tokenizer, max_len, train_status=True,
)

val_data, val_labels = create_data(
    *(fold_val_df[column].tolist() for column in feature_columns),
    tokenizer, max_len, train_status=True,
)


# Creating a Pearson correlation coefficient callback to track validation performance

In [None]:
from scipy.stats import pearsonr

class PearsonCallback(tf.keras.callbacks.Callback):
    """Compute the validation Pearson correlation at the end of every epoch.

    The value is printed and written into the epoch ``logs`` under the key
    ``"val_pearsonr"`` so that callbacks monitoring that key can read it.
    Such callbacks must be listed *after* this one in ``callbacks=[...]``,
    because the key only exists once this callback has run.

    Parameters
    ----------
    val_data : tuple
        ``(X_val, Y_val)``: the inputs passed to ``model.predict`` and the
        matching 1-D array of true scores.
    """

    def __init__(self, val_data):
        # Let the Keras base class initialise its own state first.
        super().__init__()
        self.X_val, self.Y_val = val_data

    def on_epoch_end(self, epoch, logs=None):
        val_preds = self.model.predict(self.X_val)
        # Predictions are flattened to 1-D to line up with the label vector.
        pearson_corr = pearsonr(val_preds.ravel(), self.Y_val)
        print("pearsonr_val (from log) =", pearson_corr[0])
        # Keras may invoke the hook without a logs dict; only publish when one exists.
        if logs is not None:
            logs["val_pearsonr"] = pearson_corr[0]

In [None]:
import matplotlib.pyplot as plt

def scheduler(epoch):
    """Return the learning rate to use for ``epoch``.

    Epoch 0 is a warm-up at 5% of the base rate ``LR``; every later epoch
    uses ``LR * 0.8**epoch``.
    """
    base_rate = LR
    factor = 0.05 if epoch == 0 else (0.8**epoch)
    return base_rate * factor
    
# Wrap the schedule in a Keras callback so it can be passed to model.fit.
callback_lr = tf.keras.callbacks.LearningRateScheduler(scheduler)

# Plot the learning rate the schedule yields for each of the NUM_EPOCHS epochs.
plt.plot([scheduler(e) for e in range(NUM_EPOCHS)])

In [None]:
# Save the weights whenever the validation Pearson correlation improves.
callback_save = tf.keras.callbacks.ModelCheckpoint(
    'deberta_large_patent.h5',
    monitor='val_pearsonr',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
    mode='max',
    save_freq='epoch',
)

# Stop after 2 epochs without improvement and roll back to the best weights.
callback_es = tf.keras.callbacks.EarlyStopping(
    monitor='val_pearsonr',
    patience=2,
    mode='max',
    verbose=1,
    restore_best_weights=True,
)

# Plateau-based alternative to the fixed schedule: scale the LR by sqrt(0.1)
# after 2 epochs without improvement.
lr_reducer = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_pearsonr',
    factor=np.sqrt(0.1),
    cooldown=0,
    patience=2,
    min_lr=0.5e-9,
    mode='max',
)

In [None]:
# Build the compiled model from the configured checkpoint path and sequence length.
model = build_model(model_name, max_len)

# Fitting the model to the training dataset

In [None]:
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

In [None]:
# Validation set as ((input_ids, attention_mask), labels); shared by Keras'
# built-in validation and by the Pearson callback.
val_data_ = (
    (
        np.asarray(val_data['input_ids']),
        np.asarray(val_data['attention_mask']),
    ),
    np.asarray(val_labels).ravel(),
)

# Training inputs in the same (input_ids, attention_mask) order as the model inputs.
train_inputs = (
    np.asarray(train_data['input_ids']),
    np.asarray(train_data['attention_mask']),
)
train_targets = np.asarray(train_labels).ravel()

# PearsonCallback is listed before the checkpoint and early-stopping callbacks
# because both of them monitor the `val_pearsonr` value it adds to the logs.
model.fit(
    train_inputs,
    train_targets,
    epochs=NUM_EPOCHS,
    shuffle=True,
    callbacks=[callback_lr, PearsonCallback(val_data_), callback_save, callback_es],
    batch_size=batch_size,
    validation_data=val_data_,
)

# model.fit((np.asarray(train_data['input_ids']),
#            np.asarray(train_data['attention_mask']),
#           ),
#           np.asarray(train_labels).ravel(), 
#         epochs = NUM_EPOCHS,
#         shuffle=True,
#         callbacks = [lr_reducer, callback_save, PearsonCallback(val_data_)],
#         batch_size = batch_size,
#         validation_data= val_data_
#        )

# Predicting the test set results

In [None]:
# Tokenize the test pairs. No scores exist for the test set, so None is passed
# and train_status=False makes create_data return an empty label list.
test_data, test_labels = create_data(
    test['id'].tolist(),
    test['anchor'].tolist(),
    test['target'].tolist(),
    test['code'].tolist(),
    test['title'].tolist(),
    None,
    tokenizer,
    max_len,
    train_status=False,
)

In [None]:
# Predict on the test set, passing inputs in the (input_ids, attention_mask)
# order the model was built with.
test_inputs = (
    np.asarray(test_data['input_ids']),
    np.asarray(test_data['attention_mask']),
)
test_preds = model.predict(test_inputs)

In [None]:
# Display the raw (unclipped) test predictions.
test_preds

# Creating the submission file

In [None]:
# Fill the sample submission with the model predictions.
# NOTE(review): this assumes the sample submission rows are in the same order
# as the rows of `test` - confirm.
submission = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/sample_submission.csv")
# The model ends in Dense(1), so predictions come back with a trailing axis of
# size 1; flatten them explicitly rather than relying on implicit column
# assignment. Scores are clamped to [0, 1] in one vectorised step instead of
# two element-wise Python passes.
submission['score'] = np.ravel(test_preds).clip(0, 1)
submission.to_csv('submission.csv',index=False)
submission