# U.S. Patent Phrase to Phrase Matching with TFHub
In this competition, I will build a U.S. Patent Phrase to Phrase Matching model using TFHub. I will treat this problem as a multi-class classification problem.


## Import Packages

In [None]:
import numpy as np
import pandas as pd
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as tf_hub

from scipy import stats
from tensorflow.python.ops import math_ops
from tensorflow.python.keras import backend as K

## Import datasets

In [None]:
class Config:
    """Notebook-wide switches read by the cells below."""

    # When False, the notebook loads saved fold models instead of fitting new ones.
    is_training: bool = False
    # Prefix prepended to the saved model file names when loading.
    model_path: str = "../input/uspppm-tfhub/"


config = Config()

In [None]:
# Competition files plus the external CPC code -> title lookup table.
train = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv")
test = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv")
submission = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv")
codes = pd.read_csv("/kaggle/input/cpc-codes/titles.csv")
# Rename so the lookup table shares its join key name with train/test.
codes = codes.rename(columns={"code": "context"})

# Attach the human-readable title of each row's context code.
context_titles = codes[["context", "title"]]
train = pd.merge(train, context_titles, on="context", how="left")
test = pd.merge(test, context_titles, on="context", how="left")

# A left join leaves NaN for any context missing from the lookup table, and
# calling .lower() on NaN raises AttributeError. Fill with an empty string
# first so every title is a real (lower-cased) string.
train["title"] = train["title"].fillna("").str.lower()
test["title"] = test["title"].fillna("").str.lower()

In [None]:
# Display the first rows of `train` as a quick sanity check.
train.head()

## Distribution of score

In [None]:
# Bar chart of how many rows carry each distinct score value.
train["score"].value_counts().plot(kind="bar")

In [None]:
# Number of rows in the training frame.
num_samples = train.shape[0]
print(f"Number of Samples: {num_samples}")

In [None]:
# Display the first rows of `train` again.
train.head()

## Modeling

In [None]:
# The shared TF-Hub text embedding is only built when training; get_model()
# reads `base_model` as a module-level name, so it can only be called in
# training mode. Uses the `tf_hub` alias already imported at the top of the
# notebook instead of importing the same package again under another name.
if config.is_training:
    base_model = keras.Sequential([
        tf_hub.KerasLayer("https://tfhub.dev/google/nnlm-en-dim128/2"),
    ])

In [None]:
def make_dataset(df, mode="train", batch_size=256):
    """Build a batched ``tf.data`` pipeline of ((anchor, target, title), score).

    Args:
        df: Frame providing the "anchor", "target", "title" and "score" columns.
        mode: "train" enables shuffling; any other value keeps row order.
        batch_size: Number of examples per batch.

    Returns:
        A prefetched ``tf.data.Dataset`` of batches.
    """
    features = (df["anchor"], df["target"], df["title"])
    ds = tf.data.Dataset.from_tensor_slices((features, df["score"]))
    # Cache *before* shuffling: a cache placed after shuffle/batch replays the
    # first epoch's order on every later epoch, defeating the shuffle.
    ds = ds.cache()
    if mode == "train":
        # Buffer spans the whole frame so examples mix across the full data set,
        # not just within a batch-sized window (buffer size must be positive).
        ds = ds.shuffle(max(len(df), 1))
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)


def correlation(x, y, axis=-2):
    """Return the Pearson correlation coefficient of two tensors along ``axis``.

    Args:
        x: First tensor (or tensor-like); its dtype is used for the computation.
        y: Second tensor, cast to ``x``'s dtype.
        axis: Axis over which samples are reduced. Defaults to -2.

    Returns:
        A tensor with ``axis`` reduced away. Where either input has zero
        variance along ``axis`` the coefficient is undefined; 0 is returned
        there instead of NaN so an aggregate over batches is not poisoned.
    """
    x = tf.convert_to_tensor(x)
    y = tf.cast(y, x.dtype)
    # keepdims=True lets the means broadcast back against the inputs for any
    # choice of `axis`, not only the default.
    x_centered = x - tf.reduce_mean(x, axis=axis, keepdims=True)
    y_centered = y - tf.reduce_mean(y, axis=axis, keepdims=True)
    xvar = tf.reduce_sum(tf.square(x_centered), axis=axis)
    yvar = tf.reduce_sum(tf.square(y_centered), axis=axis)
    cov = tf.reduce_sum(x_centered * y_centered, axis=axis)
    return tf.math.divide_no_nan(cov, tf.sqrt(xvar * yvar))

    
def get_model():
    """Assemble and compile the three-input phrase-matching network.

    Each of the anchor, target and title strings is embedded by the
    module-level ``base_model`` (which must already exist). The embeddings
    are concatenated and passed through three Dense(32, swish) + Dropout(0.3)
    stages, then a single sigmoid unit.

    Returns:
        A compiled ``keras.Model`` using binary cross-entropy, Adam, and the
        "accuracy" and ``correlation`` metrics.
    """
    anchor_inputs = keras.Input((), dtype=tf.string)
    target_inputs = keras.Input((), dtype=tf.string)
    title_inputs = keras.Input((), dtype=tf.string)
    text_inputs = [anchor_inputs, target_inputs, title_inputs]

    # The same embedding model is applied to all three text fields.
    embeddings = [base_model(text) for text in text_inputs]
    hidden = keras.layers.Concatenate()(embeddings)
    hidden = keras.layers.Flatten()(hidden)

    for _ in range(3):
        hidden = keras.layers.Dense(32, activation="swish")(hidden)
        hidden = keras.layers.Dropout(0.3)(hidden)

    output = keras.layers.Dense(1, activation="sigmoid")(hidden)
    model = keras.Model(inputs=text_inputs, outputs=[output])
    model.compile(
        loss=keras.losses.BinaryCrossentropy(),
        optimizer="adam",
        metrics=["accuracy", correlation],
    )
    return model

In [None]:
# Build a fresh network when training; otherwise restore the first saved fold
# model (the custom metric has to be supplied for deserialization).
if config.is_training:
    model = get_model()
else:
    model = keras.models.load_model(
        f"{config.model_path}model_0.tf",
        custom_objects={"correlation": correlation},
    )
model.summary()
# Kept as the cell's last expression so the notebook can render whatever
# plot_model returns; inside the if/else its return value was discarded.
keras.utils.plot_model(model, show_shapes=True, to_file="model.png")

In [None]:
def evaluate_model(y_true, y_pred):
    """Score predictions against targets.

    Args:
        y_true: Target values (list, array or Series).
        y_pred: Predicted values with the same number of elements.

    Returns:
        Dict with "pearson" (Pearson correlation coefficient) and "accuracy"
        (fraction of positions where prediction and target are exactly equal).
    """
    # Coerce to flat arrays so the comparison is positional: plain lists would
    # compare as whole objects, differently-indexed Series would raise, and an
    # (n, 1) prediction array would broadcast against (n,) targets.
    true_values = np.asarray(y_true).ravel()
    pred_values = np.asarray(y_pred).ravel()
    pearson_score = stats.pearsonr(true_values, pred_values)[0]
    accuracy = np.mean(true_values == pred_values)
    return {
        "pearson": pearson_score,
        "accuracy": accuracy
    }

def visualize_metrics(metrics):
    """Draw per-fold and fold-averaged bar charts of the collected metrics.

    Args:
        metrics: Data accepted by ``pd.DataFrame``; the notebook passes a list
            holding one metric dict per fold.
    """
    per_fold = pd.DataFrame(metrics)

    fold_axes = per_fold.plot(kind="bar")
    fold_axes.set_title("Pearson Correlation and Accuracy in different folds")
    plt.show()

    fold_means = per_fold.mean()
    plt.title("Mean Pearson Correlation and Accuracy")
    fold_means.plot(kind="bar")
    plt.show()

In [None]:
%%time
from sklearn.model_selection import KFold
kfold = KFold(5, shuffle=True, random_state=42)
train["group"] = train["score"] * 4
train["group"] = train["group"].astype(np.uint8)
models = []
metrics = []
for i, (train_indices, valid_indices) in enumerate(kfold.split(train, train["group"])):
    train_df = train.iloc[train_indices]
    valid_df = train.iloc[valid_indices]
    valid_ds = make_dataset(valid_df, mode="valid")
    
    if config.is_training:
        model = get_model()
        checkpoint = keras.callbacks.ModelCheckpoint(f"model_{i}.tf", monitor="val_accuracy", mode="max", save_best_only=True)
        early_stop = keras.callbacks.EarlyStopping(patience=10)
        train_ds = make_dataset(train_df)
        history = model.fit(train_ds, epochs=30, validation_data=valid_ds, callbacks=[checkpoint, early_stop])
        pd.DataFrame(history.history).plot()
        plt.show()
    else:
        model = keras.models.load_model(f"{config.model_path}model_{i}.tf", custom_objects={"correlation": correlation})
    y_pred = np.argmax(model.predict(valid_ds), axis=-1).reshape(-1)
    y_true = valid_df["score"]
    metric = evaluate_model(y_true, y_pred)
    models.append(model)
    metrics.append(metric)

## Visualize metrics

In [None]:
visualize_metrics(metrics)

## Submission

In [None]:
def preprocess_test(a, b, c):
    """Pack three inputs into a features tuple paired with a constant label 0."""
    features = (a, b, c)
    placeholder_label = 0
    return features, placeholder_label
def make_test_dataset(df, batch_size=256):
    """Build the batched inference pipeline from the anchor/target/title columns.

    Args:
        df: Frame providing the "anchor", "target" and "title" columns.
        batch_size: Number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` whose elements are mapped through
        ``preprocess_test``, then batched, cached and prefetched.
    """
    text_columns = (df["anchor"], df["target"], df["title"])
    ds = tf.data.Dataset.from_tensor_slices(text_columns)
    ds = ds.map(preprocess_test)
    ds = ds.batch(batch_size)
    ds = ds.cache()
    return ds.prefetch(tf.data.AUTOTUNE)

In [None]:
# Inference pipeline over the test frame.
test_ds = make_test_dataset(test)

In [None]:
# Ensemble: average the test predictions of every model collected in `models`.
fold_predictions = [fold_model.predict(test_ds) for fold_model in models]
y_pred = np.mean(fold_predictions, axis=0)

In [None]:
# Fill the sample submission's score column with the averaged predictions,
# write it out without the index column, and preview the result.
submission["score"] = y_pred
submission.to_csv("submission.csv", index=False)
submission.head()