In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test splits (read in that order).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv",
        "/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv",
    )
)

In [None]:
# Preview the first five training rows to see which columns are available.
train.head(5)

In [None]:
# Peek at one raw value: the `target` entry stored under row label 0.
train.loc[0, "target"]

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Add another helpful dataset: the CPC titles table, later joined to `train`
# on its `code` column.
# The path is absolute, like the train/test paths, so the read does not depend
# on the kernel's current working directory.
cpc_add = pd.read_csv("/kaggle/input/cpc-codes/titles.csv")

In [None]:
# Preview the first two rows of the CPC titles table.
cpc_add.head(2)

In [None]:
# (rows, columns) of the CPC titles table.
cpc_add.shape

In [None]:
# Summary statistics of the `score` column, which is used as the training label
# further down.
train["score"].describe()

In [None]:
# Share of rows per distinct `score` value (normalize=True yields fractions
# rather than raw counts).
train["score"].value_counts(normalize = True)

In [None]:
# Show the column names of both frames together to pick the join keys.
train.columns, cpc_add.columns

In [None]:
# Attach the CPC columns to every training row by matching `context` to `code`.
# The left join keeps all training rows. `validate` makes pandas raise a
# MergeError if a code occurs more than once in `cpc_add`, which would otherwise
# silently duplicate training rows.
train = train.merge(
    cpc_add,
    left_on="context",
    right_on="code",
    how="left",
    validate="many_to_one",
)


In [None]:
# Check the merge result: the first two rows now carry the columns from cpc_add.
train.head(2)

In [None]:
class Config:
    """Hyperparameters and model choice shared by the training cells."""

    # Name of the pretrained checkpoint handed to `from_pretrained`.
    base_model = "AI-Growth-Lab/PatentSBERTa"

    # Tokenisation and batching.
    max_length = 192
    batch_size = 64

    # Optimisation settings.
    learning_rate = 2e-5
    weight_decay = 0.01
    epochs = 10

    # Cross-validation settings.
    num_folds = 5
    seed = 42

In [None]:
from scipy import stats
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras import backend as K
import tensorflow_addons as tfa

import transformers

import warnings
warnings.filterwarnings("ignore") 

In [None]:
# Tokenizer.
# Loaded for the checkpoint named in `Config.base_model`. It is a module-level
# object: later cells extend it with additional special tokens and use it to
# encode the training text.
tokenizer = transformers.AutoTokenizer.from_pretrained(Config.base_model)

In [None]:


# Context tokens.
# Every context code is wrapped in brackets and registered with the tokenizer
# as an additional special token.
train['context_token'] = '[' + train.context + ']'
context_tokens = list(train.context_token.unique())
tokenizer.add_special_tokens({'additional_special_tokens': context_tokens})

# Preparing input text for the model.
# We are adding context_token before the context title
# to let model learn the context of anchor and target.
#
# The classifier/separator markers come from the tokenizer instead of being
# hard-coded: encode_text() runs with add_special_tokens=False, so the strings
# written here are the only special tokens the model sees, and they must be the
# ones this tokenizer recognises. The old literals are kept as a fallback
# for tokenizers that do not define them.
cls_token = tokenizer.cls_token or '[CLS]'
sep_token = tokenizer.sep_token or '[SEP]'

# A context code with no match in the titles table gets a NaN title from the
# left merge; NaN would turn the whole concatenated text into NaN, so use an
# empty title instead.
context_title = train['title'].fillna('')

train['text'] = (
    cls_token
    + train['context_token'] + context_title
    + sep_token + train['anchor']
    + sep_token + train['target']
    + sep_token
)


In [None]:
# Inspect the first two rows, now including the `context_token` and `text` columns.
train.head(2)

In [None]:
def encode_text(text, tokenizer, max_length):
    """Tokenize a batch of strings into fixed-length int32 feature arrays.

    Args:
        text: Sequence of input strings, passed as-is to the tokenizer.
        tokenizer: Object exposing `batch_encode_plus`.
        max_length: Length requested from the tokenizer for padding and
            truncation.

    Returns:
        Dict with the keys "input_ids", "attention_masks" and
        "token_type_ids", each holding an int32 numpy array built from the
        tokenizer output.
    """
    # No special tokens are added here: the caller's text is encoded verbatim.
    tokenizer_options = dict(
        add_special_tokens=False,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors="tf",
    )
    batch = tokenizer.batch_encode_plus(text, **tokenizer_options)

    # (output key, tokenizer key): only the attention mask is renamed.
    feature_sources = (
        ("input_ids", "input_ids"),
        ("attention_masks", "attention_mask"),
        ("token_type_ids", "token_type_ids"),
    )

    # Convert each encoded feature to an int32 numpy array.
    return {
        feature: np.array(batch[source], dtype="int32")
        for feature, source in feature_sources
    }

In [None]:
# Pick the distribution strategy: use a TPU when one can be reached, otherwise
# fall back to TensorFlow's current default strategy.
try:
    # TPU config
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
    print(f'TPU: {tpu.master()}')
except Exception as tpu_error:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate, and report why the TPU
    # path was skipped instead of hiding it.
    print(f'TPU not available ({type(tpu_error).__name__}); using the default strategy.')
    strategy = tf.distribute.get_strategy()

# Shared by both branches.
auto = tf.data.AUTOTUNE
replicas = strategy.num_replicas_in_sync

# XLA acceleration
tf.config.optimizer.set_jit(True)
print(f'Replicas: {replicas}')

In [None]:
def build_model(config, num_train_steps):
    """Build and compile the scoring model on top of the pretrained encoder.

    The model takes three integer inputs of length `config.max_length`
    ("input_ids", "attention_masks", "token_type_ids"), averages the encoder's
    last hidden state over the sequence axis, applies dropout and ends in a
    single sigmoid unit. It is compiled with Adam and binary cross-entropy.

    Args:
        config: Object providing `max_length`, `base_model` and
            `learning_rate`.
        num_train_steps: Accepted for the callers' sake; not used by the
            current optimizer setup.

    Returns:
        The compiled `tf.keras` model, created under the module-level
        `strategy` scope.
    """
    # Create the model under a distribution strategy scope.
    with strategy.scope():
        # Encoded token ids from the tokenizer.
        input_ids = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="input_ids"
        )
        # Attention masks indicates to the model which tokens should be attended to.
        attention_masks = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="attention_masks"
        )
        # Token type ids are part of the model's input signature so the encoded
        # feature dict can be fed unchanged; they are not forwarded to the encoder.
        token_type_ids = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="token_type_ids"
        )
        # Loading pretrained model (PyTorch weights converted to TensorFlow).
        base_model = transformers.TFAutoModel.from_pretrained(config.base_model, from_pt=True)

        # The module-level tokenizer has been extended with additional special
        # tokens, whose ids lie beyond the checkpoint's original vocabulary.
        # Grow the embedding matrix so those ids have an embedding row; never
        # shrink it, since some checkpoints pad their vocabulary on purpose.
        tokenizer_vocab_size = len(tokenizer)
        if tokenizer_vocab_size > base_model.config.vocab_size:
            base_model.resize_token_embeddings(tokenizer_vocab_size)

        base_model_output = base_model(
            input_ids, attention_mask=attention_masks)

        last_hidden_state = base_model_output.last_hidden_state
        # NOTE(review): no mask is passed to the pooling layer, so padded
        # positions appear to be included in the average. Confirm whether
        # masked mean pooling is wanted.
        avg_pool = tf.keras.layers.GlobalAveragePooling1D()(last_hidden_state)
        dropout = tf.keras.layers.Dropout(0.3)(avg_pool)

        # Sigmoid keeps the prediction in [0, 1].
        output = tf.keras.layers.Dense(1, activation="sigmoid")(dropout)

        model = tf.keras.models.Model(
            inputs=[input_ids, attention_masks, token_type_ids], outputs=output
        )

        model.compile(
            optimizer = tf.keras.optimizers.Adam(learning_rate=config.learning_rate),
            loss=tf.keras.losses.BinaryCrossentropy()
        )

    return model

In [None]:
def train_folds(train, config):
    """Train one model per stratified fold and return out-of-fold predictions.

    Args:
        train: DataFrame with at least the `text` and `score` columns. A
            `score_map` column is added to it in place.
        config: Object providing `num_folds`, `seed`, `max_length`,
            `batch_size` and `epochs`, plus whatever `build_model` reads.

    Returns:
        1-D numpy array with one prediction per row of `train`, in row order.
        Each entry is predicted by the fold model that had that row in its
        validation split.

    Side effects:
        Saves the best weights of every fold to `model-<fold>.h5` and prints
        progress together with per-fold and overall Pearson correlations.
    """
    oof = np.zeros(len(train))

    # Integer class per score level; used only to stratify the folds.
    train['score_map'] = train['score'].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})

    skf = StratifiedKFold(n_splits=config.num_folds,
                          shuffle=True,
                          random_state=config.seed)

    for fold, (train_idx, val_idx) in enumerate(skf.split(train, train['score_map'])):
        print("*" * 25)
        print(f"Training fold: {fold+1}")

        # split() yields positional indices, so select rows by position
        # (.iloc); label-based .loc is only equivalent on a default RangeIndex.
        train_df = train.iloc[train_idx].reset_index(drop=True)
        val_df = train.iloc[val_idx].reset_index(drop=True)

        # Clear keras session.
        K.clear_session()

        train_encoded = encode_text(train_df["text"].tolist(),
                                    tokenizer=tokenizer,
                                    max_length=config.max_length)

        val_encoded = encode_text(val_df["text"].tolist(),
                                  tokenizer=tokenizer,
                                  max_length=config.max_length)
        # Dataloader.
        train_data = tf.data.Dataset.from_tensor_slices((train_encoded, train_df['score'].tolist()))
        val_data = tf.data.Dataset.from_tensor_slices((val_encoded, val_df['score'].tolist()))

        # The fold indices come back in ascending order, so the rows keep their
        # original file order. A buffer covering the whole fold shuffles them
        # globally; a small buffer would only mix neighbouring rows.
        train_data = (
            train_data
            .shuffle(len(train_df))
            .batch(config.batch_size)
            .prefetch(tf.data.AUTOTUNE)
        )

        # Validation data is not shuffled, so predictions line up with val_df rows.
        val_data = (
            val_data
            .batch(config.batch_size)
            .prefetch(tf.data.AUTOTUNE)
        )

        # Callbacks.
        checkpoint = tf.keras.callbacks.ModelCheckpoint(f'model-{fold+1}.h5',
                                                        monitor='val_loss',
                                                        mode='min',
                                                        save_best_only=True,
                                                        save_weights_only=True,
                                                        save_freq='epoch',
                                                        verbose=1)

        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                          mode='min',
                                                          patience=3,
                                                          verbose=1)

        pearsonr_callback = Pearsonr(val_data, val_df['score'].values)
        num_train_steps = int(len(train_df) / config.batch_size * config.epochs)

        # Build and Train model.
        model = build_model(config, num_train_steps)
        model.fit(
            train_data,
            validation_data=val_data,
            epochs=config.epochs,
            callbacks=[checkpoint,
                       early_stopping,
                       pearsonr_callback],
            verbose=1
        )

        # Training may end on a worse epoch than the checkpointed one.
        print('\nLoading best model weights...')
        model.load_weights(f'model-{fold+1}.h5')

        print('Predicting OOF...')
        # val_data is already batched, so no batch_size is passed to predict().
        oof[val_idx] = model.predict(val_data, verbose=0).reshape(-1)

        score = stats.pearsonr(val_df['score'].values, oof[val_idx])[0]
        print(f'\nFold {fold + 1}: OOF pearson_r: {score:.4f}')
        print("*" * 25)

    score = stats.pearsonr(train['score'].values, oof)[0]
    print(f'\nOverall OOF pearson_r: {score:.4f}')
    return oof


In [None]:
class Pearsonr(tf.keras.callbacks.Callback):
    """Keras callback that reports the validation Pearson correlation per epoch.

    Args:
        val_data: Validation input passed to `model.predict` at the end of
            every epoch.
        y_val: True validation scores, in the same order as the predictions
            produced from `val_data`.
    """

    def __init__(self, val_data, y_val):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.val_data = val_data
        self.y_val = y_val

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation data, print the Pearson r and log it.

        The value is stored under "val_pearsonr" in `logs` when Keras supplies
        a logs dict.
        """
        val_preds = self.model.predict(self.val_data, verbose=0)

        val_pearsonr = stats.pearsonr(self.y_val, val_preds.ravel())[0]

        print(f"val_pearsonr: {val_pearsonr:.4f}\n")
        # `logs` is optional in the Keras callback signature.
        if logs is not None:
            logs["val_pearsonr"] = val_pearsonr

In [None]:
# Run the cross-validated training and keep the out-of-fold predictions
# returned by train_folds (one value per row of `train`).
config = Config()
oof_preds = train_folds(train, config)

In [None]:
# Persist the out-of-fold predictions to oof.npy in the current working directory.
np.save('oof.npy', oof_preds)