# Read Data

In [None]:
import numpy as np
import pandas as pd
import re

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import transformers
from transformers import RobertaTokenizer, TFRobertaModel

transformers.logging.set_verbosity_error() # raise the transformers log level to "error" so its warnings are not printed
pd.set_option("display.max_columns", None) # None lifts the column cap, so wide DataFrames are displayed with every column

In [None]:
# Load the competition's train and test tables into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv",
        "/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv",
    )
)
train_df

In [None]:
# Distinct values of the "context" column, in order of first appearance.
pd.unique(train_df["context"])

In [None]:
# Lookup table of CPC codes; its "code" and "title" columns are used further down.
cpc_codes = pd.read_csv(filepath_or_buffer="/kaggle/input/cpc-codes/titles.csv")
cpc_codes

In [None]:
# Rename the lookup key so it matches the "context" column of the patent frames.
cpc_codes = cpc_codes.rename(columns = {"code" : "context"})

# Keep a single title per context code: a duplicated code in the lookup would
# otherwise duplicate rows in the left joins below and break the one-row-per-id
# layout that the prediction step relies on.
cpc_titles = cpc_codes[["context","title"]].drop_duplicates(subset="context")

# Drop any "title" column left over from a previous run of this cell so that
# re-running it does not produce title_x / title_y columns.
train_df = pd.merge(train_df.drop(columns=["title"], errors="ignore"),
                    cpc_titles, on="context", how="left")
test_df = pd.merge(test_df.drop(columns=["title"], errors="ignore"),
                   cpc_titles, on="context", how="left")
train_df

# Create Data

In [None]:
max_len = 128  # sequence length the tokenizer pads/truncates every pair to

# Remove ';' and ',' from the CPC titles in BOTH frames: the model is trained on
# cleaned titles, so the test inputs must be cleaned the same way. Non-string
# values (e.g. NaN for a context with no matching title) are passed through
# untouched instead of making re.sub raise a TypeError.
for frame in (train_df, test_df):
    frame["title"] = frame["title"].apply(
        lambda x : re.sub('[;,]', '', x) if isinstance(x, str) else x
    )
print(train_df.loc[:500, "title"].unique())

In [None]:
# First text segment fed to the tokenizer: the anchor phrase followed by the
# CPC title, separated by a single space. Built identically for both frames.
for frame in (train_df, test_df):
    anchor_text = frame["anchor"].astype(str)
    title_text = frame["title"].astype(str)
    frame["anchor_title"] = anchor_text + " " + title_text

In [None]:
# Load the RoBERTa tokenizer from the local "../input/roberta-base/" directory
# (the same directory the model weights are loaded from in build_model).
tokenizer = RobertaTokenizer.from_pretrained("../input/roberta-base/") # using roberta tokenizer and pretrained model

In [None]:
def create_data(id_, anchor_title, target, score, train=True) :
    """Tokenize (anchor_title, target) pairs into fixed-length model inputs.

    Uses the module-level ``tokenizer`` and ``max_len``.

    Parameters
    ----------
    id_ : iterable
        Row identifiers; returned unchanged under the ``"ids"`` key.
    anchor_title : iterable of str
        First segment of every pair.
    target : iterable of str
        Second segment of every pair.
    score : iterable or None
        Labels. Only read when ``train`` is true.
    train : bool, default True
        When false, ``score`` is ignored and the returned label list is empty.

    Returns
    -------
    tuple(dict, list)
        ``{"input_ids": ..., "attention_mask": ..., "ids": ...}`` and the list
        of labels, all in the order of the inputs.

    Raises
    ------
    ValueError
        If the inputs (or the labels, when ``train`` is true) differ in length.
    """
    # Materialise everything positionally. Indexing a pandas Series with
    # ``series[i]`` is a *label* lookup, which fails or silently misaligns rows
    # as soon as the index is not 0..n-1 (e.g. after filtering or shuffling).
    ids = list(id_)
    first_segments = list(anchor_title)
    second_segments = list(target)
    if not (len(ids) == len(first_segments) == len(second_segments)):
        raise ValueError("id_, anchor_title and target must have the same length")

    labels = list(score) if train else []
    if train and len(labels) != len(ids):
        raise ValueError("score must have the same length as the other inputs")

    # tokenize and prepare for the model a list of pairs of sequences
    tok_txt = tokenizer.batch_encode_plus(
        list(zip(first_segments, second_segments)),
        max_length = max_len,
        padding='max_length', # pad to a maximum length specified with the argument max_length
        truncation=True # can not output batch with sequence lengths greater than the model maximum admissible input size
    )

    return {
        "input_ids":list(tok_txt['input_ids']),
        "attention_mask":list(tok_txt['attention_mask']),
        "ids":ids,
    }, labels

In [None]:
# create train data
# Encode the training rows; labels come from the "score" column.
train_data, train_labels = create_data(
    id_=train_df['id'],
    anchor_title=train_df['anchor_title'],
    target=train_df['target'],
    score=train_df['score'],
    train=True,
)

In [None]:
# create test data
# Encode the test rows; there are no scores, so test_labels comes back empty.
test_data, test_labels = create_data(
    id_=test_df['id'],
    anchor_title=test_df['anchor_title'],
    target=test_df['target'],
    score=None,
    train=False,
)

# Build Model

In [None]:
def build_model():
    """Build and compile the RoBERTa-based regression model.

    Two int32 inputs of length ``max_len`` (token ids and attention mask) are
    fed to a ``TFRobertaModel`` loaded from ``../input/roberta-base/``. The last
    hidden states are mean-pooled over the real (non-padding) tokens and mapped
    to a single value by a ``Dense(1)`` layer.

    Returns
    -------
    tf.keras.Model
        Model compiled with Adam, MSE loss and an ``mse`` metric.
    """
    model_ids = Input(shape=(max_len,), dtype = tf.int32)
    model_mask = Input(shape=(max_len,), dtype = tf.int32)
    roberta_model = TFRobertaModel.from_pretrained("../input/roberta-base/") # initializing roberta model

    x = roberta_model(input_ids = model_ids,
                      attention_mask = model_mask)
    # Collapse the sequence of last-layer hidden states into one vector per
    # example. The attention mask is passed as the pooling mask so that padding
    # positions are excluded from the average: every input is padded to
    # max_len, and an unmasked mean would be dominated by padding vectors.
    x = tf.keras.layers.GlobalAveragePooling1D()(x.last_hidden_state, mask=model_mask)

    outputs = Dense(1)(x) # 1, because we want to generate only one value with the Dense layer
    model = tf.keras.Model(inputs=[model_ids, model_mask], outputs=outputs) # initializing our model
    model.compile(
        optimizer = tf.keras.optimizers.Adam(),
        loss = "mse",
        # reported as "mse" / "val_mse", the names the training callbacks monitor
        metrics=["mse"])
    return model

In [None]:
# defining scheduler and dynamic lr
def scheduler(epoch):
    """Return the learning rate to use for the given (0-based) epoch.

    Epoch 0 runs at 5% of the 2e-5 base rate; every later epoch uses the base
    rate multiplied by ``0.9 ** epoch``.
    """
    base_lr = 2e-5
    factor = 0.05 if epoch == 0 else (0.9 ** epoch)
    return base_lr * factor
    
# Keras callback that asks scheduler() for the learning rate of each epoch.
callback_lr = tf.keras.callbacks.LearningRateScheduler(schedule=scheduler)

In [None]:
model = build_model()
checkpoint_path = 'roberta_uspppm.h5'  # best-val_mse weights are written here
history = model.fit(
    (
        np.array(train_data['input_ids']),
        np.array(train_data['attention_mask'])
    ),
    np.array(train_labels).ravel(),
    epochs = 5, # number of epochs
    shuffle = True, # reshuffle the training portion before every epoch
    callbacks = [
        # stop training when a monitored metric has stopped improving
        EarlyStopping(monitor='val_mse', patience=3, restore_best_weights=True),
        # keep only the weights of the epoch with the best val_mse
        ModelCheckpoint(checkpoint_path, monitor='val_mse', save_best_only=True, save_weights_only=True),
        # updates learning rate every epoch
        callback_lr
    ],
    batch_size = 64, # size of one batch
    validation_split = 0.25 # fraction of the training arrays held out for validation
)

# In Keras releases where EarlyStopping restores the best weights only when it
# actually interrupts training, a run that reaches the last epoch keeps the
# last-epoch weights even if an earlier epoch scored better. Reloading the
# checkpoint guarantees predictions are made with the best-val_mse weights
# (and is a no-op when they were already restored).
model.load_weights(checkpoint_path)

# Prediction

In [None]:
# Run the trained model on the encoded test pairs (token ids + attention mask).
test_inputs = (
    np.array(test_data['input_ids']),
    np.array(test_data['attention_mask']),
)
test_preds = model.predict(test_inputs)

In [None]:
submission = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv")
# The model emits one value per row with a trailing axis of size 1; flatten it to
# a 1-D vector before assigning it as a column, then clamp the unbounded
# regression output to the valid [0, 1] range in a single vectorized step
# (replaces two element-wise apply() passes).
flat_preds = np.asarray(test_preds, dtype=np.float64).ravel()
submission['score'] = np.clip(flat_preds, 0.0, 1.0)
submission.to_csv('submission.csv',index=False)
submission