# U.S. Patent Phrase to Phrase - Sentence-T5-Base Baseline

This notebook uses the pretrained `st5-base` (Sentence-T5 base) model to encode the phrases (anchors and targets) and computes the cosine similarity between each anchor and its target as the predicted score.

* The [st5-large baseline](https://www.kaggle.com/code/ceshine/sentence-t5-large-baseline-no-training).
* The [st5-3b baseline](https://www.kaggle.com/code/ceshine/sentence-t5-3b-baseline-no-training/notebook).

Adapted from this notebook: [U.S. Patent Phrase to Phrase Matching with TFHub](https://www.kaggle.com/code/lonnieqin/u-s-patent-phrase-to-phrase-matching-with-tfhub).

In [None]:
# Install the tensorflow_text 2.6.0 wheel (CPython 3.7 build) from a local input
# directory; --no-deps keeps pip from trying to resolve any dependencies.
# tensorflow_text is imported below only to register its ops.
!pip install --no-deps ../input/tensorflow-text-260/tensorflow_text-2.6.0-cp37-cp37m-manylinux1_x86_64.whl

## Import Packages

In [None]:
import os

import numpy as np
import pandas as pd
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as tf_hub
import tensorflow_text as text  # Registers the ops.

from scipy import stats
from tensorflow.python.ops import math_ops
from tensorflow.python.keras import backend as K

## Import datasets

In [None]:
# Competition files plus the CPC code -> title lookup table.
train = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv")
test = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv")
submission = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv")
codes = pd.read_csv("/kaggle/input/cpc-codes/titles.csv")
# Rename the lookup key so it matches the "context" column of train/test.
codes = codes.rename(columns={"code": "context"})

# Attach the CPC title to every row. A left join keeps all train/test rows;
# a context that is missing from the lookup table gets a NaN title.
train = pd.merge(train, codes[["context", "title"]], on="context", how="left")
test = pd.merge(test, codes[["context", "title"]], on="context", how="left")

# Lower-case the titles with the vectorised string accessor: unlike
# `.apply(lambda s: s.lower())`, it passes NaN through instead of raising
# AttributeError on rows the left join could not match.
train["title"] = train["title"].str.lower()
test["title"] = test["title"].str.lower()

In [None]:
# Preview the first rows of the training frame after the CPC titles were merged in.
train.head()

## Distribution of score

In [None]:
# Bar chart of how many training rows carry each score value, ordered by score.
(
    train["score"]
    .value_counts()
    .sort_index()
    .plot.bar()
)

In [None]:
# Row count of the training frame.
num_samples = train.shape[0]
print(f"Number of Samples: {num_samples}")

In [None]:
# Show the first rows of `train` once more before modeling
# (the frame has not been modified since the merge cell).
train.head()

## Modeling

In [None]:
# Load the encoder from the local `st5-base` input directory as a Keras layer.
# `encoder` is used as a module-level global by `make_inference`.
hub_url = "../input/st5-base/"
encoder = tf_hub.KerasLayer(hub_url)

In [None]:
# Number of rows sent to the encoder per call inside `make_inference`.
batch_size=128

def _get_norm(x):
    return (x * x).sum(axis=1) ** 0.5

def _encode_batch(phrases):
    """Encode one batch of phrases with the global ``encoder``.

    ``phrases`` is a pandas Series slice; it is converted to a list of strings,
    wrapped in a ``tf.constant`` and passed to the encoder. The first element
    of the encoder output is returned as a NumPy array.
    NOTE(review): presumably that element holds the sentence embeddings with
    shape (batch, dim) -- confirm against the st5-base model signature.
    """
    return encoder(tf.constant(phrases.tolist()))[0].numpy()


def make_inference(df):
    """Predict a similarity score for each row of ``df``.

    Every ``anchor`` / ``target`` pair is encoded in chunks of ``batch_size``
    rows (module-level global), and the score of a row is the cosine
    similarity between its anchor vector and its target vector.

    Args:
        df: DataFrame with ``anchor`` and ``target`` columns.

    Returns:
        1-D NumPy array with one cosine similarity per row of ``df``, in row
        order. An empty frame yields an empty array.
    """
    num_rows = df.shape[0]
    if num_rows == 0:
        # Nothing to encode. np.concatenate would raise ValueError on the
        # empty batch lists, so return an empty score vector instead.
        return np.empty(0, dtype=np.float32)

    cache_a, cache_b = [], []
    for start in tqdm(range(0, num_rows, batch_size)):
        stop = start + batch_size
        cache_a.append(_encode_batch(df.anchor.iloc[start:stop]))
        cache_b.append(_encode_batch(df.target.iloc[start:stop]))

    vectors_a = np.concatenate(cache_a, axis=0)
    vectors_b = np.concatenate(cache_b, axis=0)
    # Row-wise dot product divided by both row norms = cosine similarity.
    scores = (vectors_a * vectors_b).sum(axis=1) / _get_norm(vectors_a) / _get_norm(vectors_b)
    return scores

In [None]:
# Cosine-similarity prediction for every training row (encodes all anchors and targets).
train_scores = make_inference(train)

In [None]:
def _snap_to_levels(values, levels):
    """Map each entry of ``values`` to the closest entry of ``levels``.

    ``levels`` must be a sorted 1-D array. When a value lies exactly halfway
    between two levels, the lower level is chosen.
    """
    if levels.size == 1:
        return np.full(values.shape, levels[0])
    # Position of the right-hand neighbour of each value, clipped to
    # [1, len - 1] so that both a left and a right neighbour always exist.
    right_idx = np.clip(np.searchsorted(levels, values), 1, levels.size - 1)
    lower = levels[right_idx - 1]
    upper = levels[right_idx]
    return np.where(values - lower <= upper - values, lower, upper)


def evaluate_model(y_true, y_pred):
    """Compute the Pearson correlation and accuracy of predictions against labels.

    Accuracy is measured after snapping every prediction to the nearest label
    value that occurs in ``y_true``. Comparing raw continuous predictions to
    discrete labels with ``==`` would almost never match and would report an
    accuracy of ~0 regardless of prediction quality. Predictions that already
    equal one of the label values are unaffected by the snapping.

    Args:
        y_true: 1-D array-like of ground-truth scores.
        y_pred: 1-D array-like of predicted scores, same length as ``y_true``.

    Returns:
        Dict with keys ``"pearson"`` (Pearson r between ``y_true`` and the raw
        ``y_pred``) and ``"accuracy"`` (fraction of snapped predictions equal
        to the label).
    """
    # Work on plain arrays so the comparison is positional: lists compare
    # element-wise and pandas Series with different indexes do not misalign.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    pearson_score = stats.pearsonr(y_true, y_pred)[0]

    snapped_pred = _snap_to_levels(y_pred, np.unique(y_true))
    accuracy = np.mean(snapped_pred == y_true)
    return {
        "pearson": pearson_score,
        "accuracy": accuracy
    }

def visualize_metrics(metrics):
    """Show two bar charts built from ``metrics``.

    ``metrics`` is passed straight to ``pd.DataFrame``. The first chart plots
    every row of that frame; the second plots the mean of each column.
    """
    metric_df = pd.DataFrame(metrics)

    # Chart 1: every row of the frame, one bar group per row.
    per_fold_ax = metric_df.plot(kind="bar")
    per_fold_ax.set_title("Pearson Correlation and Accuracy in different folds")
    plt.show()

    # Chart 2: column means.
    mean_ax = metric_df.mean().plot(kind="bar")
    mean_ax.set_title("Mean Pearson Correlation and Accuracy")
    plt.show()

In [None]:
# Compare the cosine-similarity predictions with the labelled training scores.
evaluate_model(train.score, train_scores)

## Submission

In [None]:
# Cosine-similarity prediction for every test row.
test_scores = make_inference(test)

In [None]:
# Display the predicted test scores as a quick sanity check.
test_scores

In [None]:
# Put the predictions into the sample-submission frame and write the CSV.
# NOTE(review): assumes `test_scores` is in the same row order as
# sample_submission.csv -- confirm.
submission = submission.assign(score=test_scores)
submission.to_csv("submission.csv", index=False)
submission.head()