In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the datasets

In [None]:
# Read the competition's training pairs, report the frame's shape, then preview it.
train_csv_path = "/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv"
train = pd.read_csv(train_csv_path)
print(train.shape)
train.head()

In [None]:
# Summary statistics restricted to the object-dtype columns of train.
train.describe(include='object')

The purpose of the competition "U.S. Patent Phrase to Phrase Matching" is to predict the similarity score between pairs of phrases. [Data Description](https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching/data?select=train.csv)

The scores are in the 0-1 range with increments of 0.25 with the following meanings:

    1.0 - Very close match. This is typically an exact match except possibly for differences in conjugation, quantity (e.g. singular vs. plural), and addition or removal of stopwords (e.g. “the”, “and”, “or”).
    0.75 - Close synonym, e.g. “mobile phone” vs. “cellphone”. This also includes abbreviations, e.g. "TCP" -> "transmission control protocol".
    0.5 - Synonyms which don’t have the same meaning (same function, same properties). This includes broad-narrow (hyponym) and narrow-broad (hypernym) matches.
    0.25 - Somewhat related, e.g. the two phrases are in the same high level domain but are not synonyms. This also includes antonyms.
    0.0 - Unrelated.


I create a small subset (the first five rows of the training data) to use later for visualisation.

In [None]:
# Keep only the first five training rows as a small sample, and display it.
train_small = train.head(5)
train_small

# Test Corpus

A small test corpus is provided so that a submission file can be produced.

In [None]:
# Read the competition's test pairs.
test_csv_path = "/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv"
test = pd.read_csv(test_csv_path)

In [None]:
# Dimensions of the test frame.
test.shape

In [None]:
# Preview the first rows of the test frame.
test.head()

## Imports

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def cosine(u, v):
    """
    Cosine similarity between two 1-D vectors.

    Parameters
    ----------
    u, v : array-like
        Vectors of equal length; anything ``np.dot`` and ``np.linalg.norm``
        accept.

    Returns
    -------
    scalar
        ``dot(u, v) / (norm(u) * norm(v))``. If either vector has zero norm
        the ratio is undefined, so 0.0 is returned instead of NaN (and
        instead of triggering a divide-by-zero warning); this keeps the
        value usable as a score.
    """
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    # Guard the undefined case: a zero vector has no direction.
    if denom == 0:
        return 0.0
    return np.dot(u, v) / denom

# USE

The Universal Sentence Encoder (USE) is a simple way to get first results.

In [None]:
# Online version:
#   module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# Offline version that raised an error:
#   module_url = "../input/universalsentenceencoder4"
# Offline version used here:
module_url = "../input/universalsentenceencodertfv4"
model = hub.load(module_url)

# Anchor and target phrases of the training set as plain Python lists.
sentences, queries = (train[column].tolist() for column in ("anchor", "target"))

In [None]:
# Encode the first anchor phrase as a one-element batch and display the model output.
model([sentences[0]])

## Cosine similarity

In [None]:
# Cosine similarity for the first (anchor, target) pair of the training set.
first_anchor_vec = model(sentences[:1])[0]
first_target_vec = model(queries[:1])[0]
cosine(first_anchor_vec, first_target_vec)

In [None]:
# Embed the phrases in batches instead of invoking the encoder twice per row
# with a one-element batch: the per-phrase calls dominate the runtime on the
# full training set. Values may differ from the per-phrase version only by
# floating-point rounding.
embed_batch_size = 1024
anchor_vecs = [
    vec
    for start in range(0, len(sentences), embed_batch_size)
    for vec in model(sentences[start:start + embed_batch_size]).numpy()
]
target_vecs = [
    vec
    for start in range(0, len(queries), embed_batch_size)
    for vec in model(queries[start:start + embed_batch_size]).numpy()
]

# One cosine score per (anchor, target) row, stored alongside the labels.
train["score_f"] = [cosine(a, t) for a, t in zip(anchor_vecs, target_vecs)]

# Correlation matrix between the USE-based score and the annotated score.
np.corrcoef(train["score_f"], train["score"])

In [None]:
# Phrases to visualise: every target of the small sample, followed by the
# anchor found at index label 0.
exampq = [*train_small["target"], train_small["anchor"][0]]
exampq

In [None]:
def plot_similarity(labels, features, rotation):
    """
    Draw a heat map of the pairwise inner products of `features`.

    labels   -- tick labels used on both axes (presumably one per row of
                `features`)
    features -- embeddings whose inner products are plotted
    rotation -- rotation passed to the x tick labels
    """
    similarity = np.inner(features, features)
    sns.set(font_scale=1.2)
    # Colour scale is pinned to [0, 1].
    heatmap_options = dict(
        xticklabels=labels,
        yticklabels=labels,
        vmin=0,
        vmax=1,
        cmap="YlOrRd",
    )
    ax = sns.heatmap(similarity, **heatmap_options)
    ax.set_xticklabels(labels, rotation=rotation)
    ax.set_title("Semantic Textual Similarity")


In [None]:
# Heat map of the example phrases, with x tick labels rotated by 90.
plot_similarity(labels=exampq, features=model(exampq), rotation=90)

We see here that even though these phrases belong to the same anchor, they are not all equally correlated with one another. The USE similarity manages to capture the general tendencies.

In [None]:
# Preview train again; it now also holds the score_f column computed earlier.
train.head()

# Submission file creation

In [None]:
# Copy of the test ids, independent of the test frame.
submission = test['id'].copy()

In [None]:
# Name the Series 'id'.
# NOTE(review): a Series selected as test['id'] should already carry this name,
# so this assignment looks redundant; confirm before removing.
submission.name = 'id'

In [None]:
# Display the id Series.
submission

In [None]:
# Start from an empty frame; its columns are attached in the following cells.
df_submission = pd.DataFrame()

In [None]:
# First column of the submission: the test ids.
df_submission['id'] = submission

In [None]:
# Display the submission frame built so far.
df_submission

In [None]:
# Anchor and target phrases of the test set as plain Python lists.
sentences_t, queries_t = (test[column].tolist() for column in ("anchor", "target"))

In [None]:
# Embed the test phrases in batches instead of invoking the encoder twice per
# row with a one-element batch; per-phrase calls dominate the runtime once the
# test set is large. Values may differ from the per-phrase version only by
# floating-point rounding.
embed_batch_size_t = 1024
anchor_vecs_t = [
    vec
    for start in range(0, len(sentences_t), embed_batch_size_t)
    for vec in model(sentences_t[start:start + embed_batch_size_t]).numpy()
]
target_vecs_t = [
    vec
    for start in range(0, len(queries_t), embed_batch_size_t)
    for vec in model(queries_t[start:start + embed_batch_size_t]).numpy()
]

# Predicted score for each test row: cosine similarity of its two embeddings.
df_submission['score'] = [cosine(a, t) for a, t in zip(anchor_vecs_t, target_vecs_t)]

In [None]:
# Check the type of the submission object.
type(df_submission)

In [None]:
# Display the submission frame, now with the id and score columns.
df_submission

In [None]:
# Write the submission to submission.csv (relative path), without the index column.
df_submission.to_csv('submission.csv', index=False)