## Updates

* Added cpc_codes context text to the input.
* Changed optimizer back to Adam.
* Used dropout.
* Added Context tokens.
* Used BCE loss function.


**Mostly adapted from my original work on keras.io:** [Semantic Similarity with BERT](https://keras.io/examples/nlp/semantic_similarity_with_bert/)

**Inference notebook can be found here:** [US Phrase Matching: TF-Keras Inference](https://www.kaggle.com/code/mohamadmerchant/us-phrase-matching-tf-keras-inference)

**The main purpose of this notebook is to enable faster, more efficient experiments with large language models.**

**Of course, better results can be achieved by further tuning and training.**

In [None]:
# Upgrade scikit-learn before it is imported: the notebook uses
# StratifiedGroupKFold from sklearn.model_selection.
# NOTE(review): presumably the preinstalled version is too old to provide it - confirm.
!pip install -U scikit-learn

In [None]:
import os
import random
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import StratifiedGroupKFold

import tensorflow as tf
from tensorflow.keras import backend as K
import tensorflow_addons as tfa

import transformers

import warnings
warnings.filterwarnings("ignore") 

## Dataset Overview

In [None]:
# Root folder of the competition data.
DATASET_PATH = "../input/us-patent-phrase-to-phrase-matching/"

# Read the training pairs, the test pairs and the sample submission, in that order.
train, test, sub = (
    pd.read_csv(f"{DATASET_PATH}{csv_name}")
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Attach the CPC title columns to each training row by matching the row's
# `context` against the `code` column of the titles table. A left join keeps
# every training row, whether or not a matching code exists.
cpc_codes = pd.read_csv("../input/cpc-codes/titles.csv")
train = pd.merge(train, cpc_codes, how='left', left_on='context', right_on='code')

In [None]:
# Row counts of the data. The test count is read from the sample-submission frame.
print(f"Total train samples : {len(train)}")
print(f"Total test samples: {len(sub)}")

In [None]:
# Relative frequency of each distinct score value in the training set.
# (Plain string: the original f-string had no placeholders.)
print("Train Score Distribution")
train['score'].value_counts(normalize=True)

## Setup TPU

In [None]:
# tf.data autotuning constant; it is the same whichever strategy is selected.
auto = tf.data.AUTOTUNE

try:
    # TPU config: resolve the cluster, connect to it and initialise the TPU system.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
    print(f'TPU: {tpu.master()}')
except Exception as err:
    # Best-effort fallback when no TPU can be set up. `Exception` (instead of
    # a bare `except`) lets KeyboardInterrupt / SystemExit propagate, and the
    # reason for the fallback is reported rather than silently dropped.
    print(f'TPU not available ({err!r}); using the default strategy.')
    strategy = tf.distribute.get_strategy()

# Number of replicas the selected strategy keeps in sync.
replicas = strategy.num_replicas_in_sync

# XLA acceleration
tf.config.optimizer.set_jit(True)
print(f'Replicas: {replicas}')

## Config

In [None]:
class Config:
    """Training hyper-parameters.

    The class attributes are the defaults. Keyword arguments passed to the
    constructor are set as attributes on that instance only, so they override
    (or extend) the defaults without touching the class itself.
    """

    # Reproducibility and cross-validation.
    seed = 42
    num_folds = 4

    # Optimisation.
    epochs = 4
    batch_size = 32
    learning_rate = 2e-5
    weight_decay = 0.01

    # Model and tokenization.
    base_model = "google/electra-large-discriminator"
    max_length = 156

    def __init__(self, **kwargs):
        for name in kwargs:
            setattr(self, name, kwargs[name])
            
def seed_everything(seed=42):
    """Seed the Python, NumPy and TensorFlow random generators.

    Also stores ``seed`` (as a string) in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)


# Same value as the default `Config.seed`; keep the two in sync.
seed_everything(seed=42)

## Text Encoder

In [None]:
# Lower-case the text columns. The CPC titles were attached with a left merge,
# so a context without a matching code leaves `title` as NaN; replace it with
# an empty string so the concatenation below never produces a NaN `text`.
train['title'] = train['title'].fillna('').str.lower()
train['anchor'] = train['anchor'].str.lower()
train['target'] = train['target'].str.lower()

# Tokenizer.
tokenizer = transformers.AutoTokenizer.from_pretrained(Config.base_model)

# Context tokens: wrap every context code in brackets and register the
# distinct results as additional special tokens of the tokenizer.
# NOTE(review): these tokens receive ids beyond the pretrained vocabulary, so
# the model's embedding matrix has to cover len(tokenizer) ids.
train['context_token'] = '[' + train['context'] + ']'
train['sep_token'] = '[SEP]'
context_tokens = list(train['context_token'].unique())
tokenizer.add_special_tokens({'additional_special_tokens': context_tokens})

# Preparing input text for the model.
# We are adding context_token before the context title
# to let model learn the context of anchor and target.
train['text'] = train['anchor'] + train['sep_token'] + train['target'] + \
                train['context_token'] + train['title']

In [None]:
# Preview the first rows, including the `context_token`, `sep_token` and
# `text` columns built in the previous cell.
train.head()

In [None]:
def encode_text(text, 
                tokenizer,
                max_length):
    """Tokenize a batch of strings into fixed-length int32 NumPy arrays.

    Args:
        text: batch of input strings handed to ``tokenizer.batch_encode_plus``.
        tokenizer: object exposing ``batch_encode_plus``.
        max_length: every sequence is padded and truncated to this length.

    Returns:
        Dict with ``"input_ids"`` and ``"attention_masks"``, both int32 arrays.
    """
    # Special tokens are added, sequences are padded/truncated to `max_length`,
    # and only the ids and the attention mask are requested.
    tokenizer_options = {
        "add_special_tokens": True,
        "max_length": max_length,
        "padding": 'max_length',
        "truncation": True,
        "return_attention_mask": True,
        "return_token_type_ids": False,
        "return_offsets_mapping": False,
        "return_tensors": "tf",
    }
    batch = tokenizer.batch_encode_plus(text, **tokenizer_options)

    # (output key, tokenizer key): note the plural "attention_masks" on output.
    key_map = (
        ("input_ids", "input_ids"),
        ("attention_masks", "attention_mask"),
    )
    return {
        out_key: np.array(batch[src_key], dtype="int32")
        for out_key, src_key in key_map
    }

## Competition Metrics

In [None]:
class Pearsonr(tf.keras.callbacks.Callback):
    """Keras callback reporting the Pearson correlation on validation data.

    At the end of every epoch the model predicts on ``val_data``; the
    flattened predictions are clipped to [0, 1] and correlated with ``y_val``
    using ``scipy.stats.pearsonr``. The coefficient is printed and, when a
    logs dict is supplied, stored in it under ``"val_pearsonr"``.

    Args:
        val_data: validation input passed to ``model.predict``.
        y_val: ground-truth scores, in the same order as the predictions.
    """

    def __init__(self, val_data, y_val):
        # Let the Keras base class set up its own state before adding ours.
        super().__init__()
        self.val_data = val_data
        self.y_val = y_val

    def on_epoch_end(self, epoch, logs=None):
        val_preds = self.model.predict(self.val_data, verbose=0)

        val_pearsonr = stats.pearsonr(self.y_val, np.clip(val_preds.ravel(), 0, 1))[0]

        print(f"val_pearsonr: {val_pearsonr:.4f}\n")
        # `logs` defaults to None in the base-class signature; only record the
        # metric when a dict was actually supplied.
        if logs is not None:
            logs["val_pearsonr"] = val_pearsonr

## Build the model

In [None]:
def build_model(config, num_train_steps):
    """Build and compile the regression model under the distribution strategy.

    The pretrained transformer named by ``config.base_model`` encodes the
    input, its last hidden state is average-pooled over the sequence axis, and
    a single-unit dense layer produces the score. The model is compiled with
    Adam, a polynomial-decay learning-rate schedule and MSE loss.

    Args:
        config: object providing ``max_length``, ``base_model`` and
            ``learning_rate``.
        num_train_steps: number of optimizer steps over which the learning
            rate decays from ``config.learning_rate`` to ``2e-6``.

    Returns:
        The compiled ``tf.keras`` model taking ``input_ids`` and
        ``attention_masks`` inputs.
    """
    # Create the model under a distribution strategy scope.
    with strategy.scope():
        # Encoded token ids from the tokenizer.
        input_ids = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="input_ids"
        )
        # Attention masks indicates to the model which tokens should be attended to.
        attention_masks = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="attention_masks"
        )

        # Loading the pretrained base model.
        base_model = transformers.TFAutoModel.from_pretrained(config.base_model)

        # The module-level tokenizer was extended with context special tokens,
        # whose ids lie beyond the pretrained vocabulary. Grow the embedding
        # matrix so those ids have their own trainable embeddings.
        # NOTE(review): weights saved from this model carry the resized
        # embedding; any notebook loading them must resize the same way.
        tokenizer_size = len(tokenizer)
        if tokenizer_size > base_model.config.vocab_size:
            base_model.resize_token_embeddings(tokenizer_size)

        base_model_output = base_model(
            input_ids, attention_mask=attention_masks
        )

        last_hidden_state = base_model_output.last_hidden_state
        # NOTE(review): no mask is passed to the pooling layer, so padded
        # positions appear to be included in the average - confirm intended.
        avg_pool = tf.keras.layers.GlobalAveragePooling1D()(last_hidden_state)
        output = tf.keras.layers.Dense(1)(avg_pool)

        model = tf.keras.models.Model(
            inputs=[input_ids, attention_masks], outputs=output
        )

        # Linear scheduler.
        lr_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=config.learning_rate,
            end_learning_rate=2e-6,
            decay_steps=num_train_steps)

        model.compile(
            optimizer=tf.keras.optimizers.Adam(lr_scheduler),
            loss='mse'
        )

    return model

## Train Folds

In [None]:
def train_folds(train, config):
    """Train one model per cross-validation fold and collect OOF predictions.

    Rows are split with ``StratifiedGroupKFold``: stratified on the score
    (binned into 5 intervals) and grouped by ``anchor``, so rows sharing an
    anchor stay on the same side of a fold. For each fold the weights with the
    lowest ``val_loss`` are saved to ``model-<fold>.h5``, reloaded, and used
    to predict that fold's validation rows.

    Args:
        train: DataFrame with ``text``, ``score`` and ``anchor`` columns. A
            ``score_bins`` column is added to it in place.
        config: object providing ``num_folds``, ``seed``, ``max_length``,
            ``batch_size`` and ``epochs`` (plus whatever ``build_model`` reads).

    Returns:
        1-D NumPy array of out-of-fold predictions, one per row of ``train``
        in row order.
    """
    oof = np.zeros(len(train))

    # Discretise the continuous score so it can be used for stratification.
    train["score_bins"] = pd.cut(train["score"], bins=5, labels=False)

    skf = StratifiedGroupKFold(n_splits=config.num_folds,
                               shuffle=True,
                               random_state=config.seed)

    splits = skf.split(train,
                       train['score_bins'].values,
                       train['anchor'].values)

    for fold, (train_idx, val_idx) in enumerate(splits):
        print("*" * 25)
        print(f"Training fold: {fold+1}")

        weights_path = f'model-{fold+1}.h5'

        # The splitter yields positional indices, so select with `.iloc`;
        # `.loc` is label-based and only matches on a default RangeIndex.
        train_df = train.iloc[train_idx].reset_index(drop=True)
        val_df = train.iloc[val_idx].reset_index(drop=True)

        # Clear keras session.
        K.clear_session()

        train_encoded = encode_text(train_df["text"].tolist(),
                                    tokenizer=tokenizer,
                                    max_length=config.max_length)

        val_encoded = encode_text(val_df["text"].tolist(),
                                  tokenizer=tokenizer,
                                  max_length=config.max_length)

        # Dataloader.
        train_data = tf.data.Dataset.from_tensor_slices((train_encoded, train_df['score'].tolist()))
        val_data = tf.data.Dataset.from_tensor_slices((val_encoded, val_df['score'].tolist()))

        train_data = (
            train_data
            .shuffle(1024)
            .batch(config.batch_size)
            .prefetch(tf.data.AUTOTUNE)
        )

        # Validation data is not shuffled, so predictions line up with val_df.
        val_data = (
            val_data
            .batch(config.batch_size)
            .prefetch(tf.data.AUTOTUNE)
        )

        # Callbacks.
        checkpoint = tf.keras.callbacks.ModelCheckpoint(weights_path,
                                                        monitor='val_loss',
                                                        mode='min',
                                                        save_best_only=True,
                                                        save_weights_only=True,
                                                        save_freq='epoch',
                                                        verbose=1)

        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                          mode='min',
                                                          patience=3,
                                                          verbose=1)

        pearsonr_callback = Pearsonr(val_data, val_df['score'].values)

        # The last partial batch is kept, so an epoch takes
        # ceil(rows / batch_size) optimizer steps; size the LR decay to that.
        steps_per_epoch = int(np.ceil(len(train_df) / config.batch_size))
        num_train_steps = steps_per_epoch * config.epochs

        # Build and Train model.
        model = build_model(config, num_train_steps)
        model.fit(
            train_data,
            validation_data=val_data,
            epochs=config.epochs,
            callbacks=[checkpoint,
                       early_stopping,
                       pearsonr_callback],
            verbose=1
        )

        print('\nLoading best model weights...')
        model.load_weights(weights_path)

        print('Predicting OOF...')
        # `val_data` is already batched, so no `batch_size` is passed here.
        oof[val_idx] = model.predict(val_data, verbose=0).reshape(-1)

        score = stats.pearsonr(val_df['score'].values, oof[val_idx])[0]
        print(f'\nFold {fold + 1}: OOF pearson_r: {score:.4f}')
        print("*" * 25)

    score = stats.pearsonr(train['score'].values, oof)[0]
    print(f'\nOverall OOF pearson_r: {score:.4f}')
    return oof

In [None]:
# Train every fold with the default hyper-parameters and keep the
# out-of-fold predictions.
config = Config()
oof_preds = train_folds(train=train, config=config)

In [None]:
# Persist the out-of-fold predictions in NumPy's .npy format.
np.save(file='oof.npy', arr=oof_preds)

In [None]:
!mkdir ./five-folds/
!cp -r ../us-patent-phrase-matching-models/* ./five-folds/*