**Training notebook can be found here:** [US Phrase Matching: TF-Keras Train [TPU]](https://www.kaggle.com/mohamadmerchant/us-phrase-matching-tf-keras-train-tpu)

In [None]:
import os
import random
import numpy as np
import pandas as pd
from scipy import stats
import warnings
warnings.filterwarnings("ignore") 

from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras import backend as K
import tensorflow_addons as tfa

import transformers
from transformers import BertTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# Directory holding the competition CSV files.
DATASET_PATH = "../input/us-patent-phrase-to-phrase-matching/"

# `test` holds the rows to score; `sub` is the submission template whose
# `score` column is overwritten with the predictions further down.
test, sub = (
    pd.read_csv(f"{DATASET_PATH}{csv_name}")
    for csv_name in ("test.csv", "sample_submission.csv")
)

## Config

In [None]:
class Config():
    """Settings shared by the inference code in this notebook.

    Every value is a class attribute, so it can be read either from the class
    or from an instance. Keyword arguments given to the constructor override
    the matching attribute on that instance only.
    """

    # NOTE(review): seed, epochs, learning_rate and weight_decay are not read
    # by the inference code visible here; presumably they mirror the training
    # notebook's configuration.
    seed = 42
    epochs = 10
    # Number of per-fold weight files that are loaded and averaged.
    num_folds = 5
    # Fixed token length: used for tokenizer padding and the model input shape.
    max_length = 128
    batch_size = 64
    learning_rate = 2e-5
    weight_decay = 0.01
    # Checkpoint directory of the encoder that build_model() instantiates.
    base_model = "../input/bert-for-patents/bert-for-patents/"
    # Alternative checkpoint directory; not used by the model built here.
    bb_model = "../input/bigbirdpegasus/"
    # The tokenizer has to come from the same checkpoint as the encoder:
    # token ids produced with another vocabulary would index the wrong (or
    # non-existent) rows of the encoder's embedding table.
    tokenizer = transformers.AutoTokenizer.from_pretrained(base_model)

    def __init__(self, **kwargs):
        """Override any class-level default with the given keyword arguments."""
        for k, v in kwargs.items():
            setattr(self, k, v)
            
def seed_everything(seed=42):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Also exports PYTHONHASHSEED (set to the seed) and TF_CUDNN_DETERMINISTIC
    (set to '1') into the process environment.
    """
    os.environ.update(
        {
            "PYTHONHASHSEED": str(seed),
            "TF_CUDNN_DETERMINISTIC": "1",
        }
    )
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)


seed_everything(seed=42)

In [None]:
def encode_text(text_pairs,
                tokenizer,
                max_length):
    """Tokenize a batch of text pairs into fixed-length model inputs.

    Args:
        text_pairs: list of two-element sequences (first text, second text).
        tokenizer: object exposing a Hugging Face style ``batch_encode_plus``.
        max_length: number of tokens every encoded pair is padded or
            truncated to.

    Returns:
        dict with keys ``"input_ids"``, ``"attention_masks"`` and
        ``"token_type_ids"``, each an int32 NumPy array (expected shape:
        ``(len(text_pairs), max_length)``).
    """
    # batch_encode_plus encodes both texts of each pair together, joined by
    # the tokenizer's special tokens.
    encoded = tokenizer.batch_encode_plus(
        text_pairs,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        # Without explicit truncation a pair longer than max_length would
        # yield a row wider than the model's fixed input shape.
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors="tf",
    )

    # Convert batch of encoded features to numpy arrays.
    input_ids = np.array(encoded["input_ids"], dtype="int32")
    attention_masks = np.array(encoded["attention_mask"], dtype="int32")
    token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

    # Note the plural "attention_masks": it matches the name of the Keras
    # input layer rather than the tokenizer's "attention_mask" key.
    return {
        "input_ids": input_ids,
        "attention_masks": attention_masks,
        "token_type_ids": token_type_ids
    }

## Build the model

In [None]:
def build_model(config):
    """Assemble the regression model: pretrained encoder + mean pooling + Dense(1).

    Reads ``config.max_length`` (length of every input sequence) and
    ``config.base_model`` (checkpoint passed to ``TFAutoModel.from_pretrained``
    with ``from_pt=True``).

    Returns an uncompiled ``tf.keras`` Model taking the inputs ``input_ids``,
    ``attention_masks`` and ``token_type_ids`` and producing one linear output
    per example.
    """

    def _int_sequence_input(layer_name):
        # All three inputs share the same shape and dtype; only the name differs.
        return tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name=layer_name
        )

    # Everything that creates variables is built inside the strategy scope.
    with tf.distribute.MirroredStrategy().scope():
        # Token ids produced by the tokenizer.
        input_ids = _int_sequence_input("input_ids")
        # Marks which positions the encoder should attend to.
        attention_masks = _int_sequence_input("attention_masks")
        # Distinguishes the two texts of each encoded pair.
        token_type_ids = _int_sequence_input("token_type_ids")

        # Pretrained encoder loaded from the checkpoint directory.
        encoder = transformers.TFAutoModel.from_pretrained(
            config.base_model, from_pt=True
        )
        hidden_states = encoder(
            input_ids,
            attention_mask=attention_masks,
            token_type_ids=token_type_ids,
        ).last_hidden_state

        # Average the token representations over the sequence axis, then
        # project the pooled vector to a single unbounded score.
        pooled = tf.keras.layers.GlobalAveragePooling1D()(hidden_states)
        score = tf.keras.layers.Dense(1, activation="linear")(pooled)

        model = tf.keras.models.Model(
            inputs=[input_ids, attention_masks, token_type_ids],
            outputs=score,
        )

    return model

## Predict Folds

In [None]:
def predict_folds(test, config):
    """Predict with every fold's model and average the results.

    Args:
        test: DataFrame with ``anchor`` and ``target`` text columns.
        config: object providing ``tokenizer``, ``max_length``, ``batch_size``
            and ``num_folds`` (plus whatever ``build_model`` reads).

    Returns:
        1-D NumPy array with one value per row of ``test``: the mean of the
        per-fold predictions.
    """
    # Tokenization is identical for every fold, so do it once up front
    # instead of repeating it inside the loop.
    test_encoded = encode_text(test[["anchor", "target"]].values.tolist(),
                               tokenizer=config.tokenizer,
                               max_length=config.max_length)

    # Disable AutoShard.
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

    preds = []

    for fold in range(config.num_folds):
        print("*" * 25)
        print(f"Predicting fold: {fold+1}")

        # Clear keras session so state from the previous fold's model is dropped.
        K.clear_session()

        # Dataloader, rebuilt after the session reset from the cached arrays.
        test_data = (
            tf.data.Dataset.from_tensor_slices(test_encoded)
            .with_options(options)
            .batch(config.batch_size)
            .prefetch(tf.data.AUTOTUNE)
        )

        # Build and Load the model.
        model = build_model(config)
        print('Loading best model weights...')
        model.load_weights(f'../input/us-patent-matching-models/model-{fold+1}.h5')

        # The dataset is already batched, so `batch_size` is not passed to
        # predict(); Keras documents that it must not be given for datasets.
        preds.append(model.predict(test_data, verbose=1).reshape(-1))
        print("*" * 25)

    # Element-wise mean across folds.
    return np.mean(preds, axis=0)

In [None]:
# Run fold-averaged inference with the default settings and store the
# predictions in the submission frame's "score" column.
config = Config()
preds = predict_folds(test=test, config=config)
sub["score"] = preds
# Last expression of the cell: shows the first rows of the submission.
sub.head()

In [None]:
from scipy import stats

def get_score(y_true, y_pred):
    """Print and return the Pearson correlation between targets and predictions."""
    # pearsonr yields (correlation, p-value); only the correlation is used.
    correlation, _p_value = stats.pearsonr(y_true, y_pred)
    print(correlation)
    return correlation

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv(path_or_buf="submission.csv", index=False)