# Introduction

US Patent Phrase to Phrase Matching Competition challenges the participants to use innovative semantic similarity techniques to identify similar claims in the set of data.

# Analysis Preparation

Let's first import some Python packages and read the data.

In [None]:
import pandas as pd
import numpy as np
import os
import operator
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from wordcloud import WordCloud, STOPWORDS

warnings.filterwarnings('ignore')

In [None]:
train_df = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv")
test_df = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv")
sample_submission_df = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv")

# Preliminary Data Exploration

## Glimpse the data

In [None]:
train_df.head(2)

In [None]:
test_df.head(2)

In [None]:
sample_submission_df.head(2)

## Data shape

In [None]:
print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Sample submission data shape: {sample_submission_df.shape}")

## Missing data

In [None]:
# Report the number of missing values for every text column of both splits.
for split_name, split_df in (("train", train_df), ("test", test_df)):
    for column in ("anchor", "target", "context"):
        missing_count = int(split_df[column].isna().sum())
        print(f"Missing data ({split_name}/{column}): {missing_count}")

## Unique data

In [None]:
# Report, for every text column of both splits, the number of distinct values
# and their share of all rows in that split. The percentage is rounded to the
# same precision (3 decimals) for every line so the figures are comparable.
for split_name, split_df in (("train", train_df), ("test", test_df)):
    for column in ("anchor", "target", "context"):
        unique_count = split_df[column].nunique()
        unique_pct = round(unique_count / split_df.shape[0] * 100, 3)
        print(f"Unique data ({split_name}/{column}): {unique_count} ({unique_pct}%)")

# Advanced Data Exploration


We will try to answer a few questions about the data distribution, frequent words, frequent n-grams, possible duplicates, and word and phrase lengths.

## Which anchor phrases have the most target candidates?

In [None]:
g_train_df = train_df.groupby(["anchor"])["target"].count().reset_index().sort_values(["target"], ascending=False)
g_train_df.head(10)

Let's also plot the distribution of the number of target candidates per anchor.

In [None]:
plt.figure(figsize=(12,6))
plt.title("Distribution of anchors with a certain number of targets in the train set")
sns.distplot(g_train_df['target'],kde=False,hist=True, bins=120, label='anchors')
plt.ylabel("Anchors / number of targets")
plt.legend(); plt.show()

## How many anchor phrases have only one target candidate?

In [None]:
g_train_df.loc[g_train_df.target==1].head(5)

It appears that there are only two anchor phrases that have only one target candidate.

## Is there any target that appears as well as an anchor?

In [None]:
# Rows whose target phrase also occurs somewhere in the anchor column.
anchor_target_df = train_df[train_df.target.isin(train_df.anchor.unique().tolist())]
# Distinct anchors / targets of those rows only (not of the whole train set).
anchors = anchor_target_df.anchor.unique()
targets = anchor_target_df.target.unique()
# Phrases that are both an anchor and a target within the filtered rows.
anchors_targets = set(anchors).intersection(set(targets))
# Set differences, again relative to the filtered rows.
anchors_not_targets = set(anchors).difference(set(targets))
targets_not_anchors = set(targets).difference(set(anchors))
print("Anchors: ", len(anchors), " Targets: ", len(targets), " Anchors & Targets: ", len(anchors_targets))
print("Anchors that are not targets: ",  anchors_not_targets)
print("Targets that are not anchors: ", targets_not_anchors)

Let's take a look at the anchors that also appear as targets.

In [None]:
stopwords = set(STOPWORDS)

def show_wordcloud(data, title = None):
    """Render a word cloud for ``data`` with matplotlib.

    data: either a single string, or an iterable of phrases (list, set,
        pandas Series, ...). Every item of an iterable is converted with
        ``str`` and the items are joined with spaces before being handed
        to ``WordCloud.generate``.
    title: optional figure title; a falsy value (None, '') draws no title.
    """
    # str() of a pandas Series is its *truncated* repr (only the first/last rows
    # plus "Name: ..., Length: ..., dtype: ..."), and str() of a set carries
    # braces and quotes. Joining the items makes every phrase contribute.
    if isinstance(data, str):
        text = data
    else:
        text = " ".join(str(item) for item in data)

    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=400,
        max_font_size=40, 
        scale=12,
        random_state=1  # fixed seed so the layout is reproducible
    ).generate(text)

    fig = plt.figure(1, figsize=(16,10))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

In [None]:
show_wordcloud(anchors_targets, title = '')

Let's also see in how many rows these targets appear.

In [None]:
print(f"Train data with target that appears in anchors: {train_df.loc[train_df.target.isin(anchors_targets)].shape[0]}")

But are these all cases with identity score 1.0?

In [None]:
# Rows whose target is also an anchor AND whose score is exactly 1.
# The comparison must be parenthesized: `&` binds tighter than `==`, so
# `a & b == 1` is evaluated as `(a & b) == 1` and never tests `score == 1`.
df = train_df.loc[train_df.target.isin(anchors_targets) & (train_df.score == 1)]
print(df.shape[0])

Let's compare the number of these rows that have score 1.0 with the total number of rows with score 1.0.

In [None]:
print("Data with score = 1.0 and for which target is one of the anchors: ", df.loc[df.score==1].shape[0])
print("All data with score == 1.0: ", train_df.loc[train_df.score==1].shape[0])

## What is the distribution of word count and character count?

In [None]:
train_df["anchor_char_count"] = train_df["anchor"].apply(lambda x: len(x))
train_df["target_char_count"] = train_df["target"].apply(lambda x: len(x))
train_df["anchor_word_count"] = train_df["anchor"].apply(lambda x: len(x.split(" ")))
train_df["target_word_count"] = train_df["target"].apply(lambda x: len(x.split(" ")))

In [None]:
plt.figure(figsize=(12,6))
plt.title("Distribution of anchors and target char count")
sns.distplot(train_df['anchor_char_count'],kde=True,hist=False, bins=120, label='anchor char count')
sns.distplot(train_df['target_char_count'],kde=True,hist=False, bins=120, label='target char count')
plt.legend(); plt.show()

In [None]:
plt.figure(figsize=(12,6))
plt.title("Distribution of anchors and target word count")
sns.distplot(train_df['anchor_word_count'],kde=True,hist=False, bins=120, label='anchor word count')
sns.distplot(train_df['target_word_count'],kde=True,hist=False, bins=120, label='target word count')
plt.legend(); plt.show()

## How many targets have zero similarity score with anchors?

In [None]:
print("Data with zero similarity: ", train_df.loc[train_df.score==0].shape[0])

Let's look at some examples of this data:

In [None]:
train_df.loc[train_df.score==0].sample(10).head(10)

It's interesting to notice that phrases with a zero matching score quite frequently share common words. Most probably, if we use some sort of word similarity without semantic context, the performance of the model will not be extraordinary. We will explore this further in the next sections.

## How many contexts and how many anchor / context ?

In [None]:
print(f"Unique contexts: {train_df.context.nunique()}")

In [None]:
anchor_train_df = train_df[["context", "anchor"]].drop_duplicates()
print(anchor_train_df.shape[0])
grouped_df = anchor_train_df.groupby(["context"])["anchor"].count().reset_index().sort_values(["anchor"], ascending=False)
grouped_df.head(5)

In [None]:
grouped_df.tail(5)

In [None]:
plt.figure(figsize=(12,6))
plt.title("Distribution of anchor count/context")
sns.distplot(grouped_df["anchor"],kde=False,hist=True, bins=80, label=None, color="red")
plt.legend(); plt.show()

## How many targets / context?

In [None]:
grouped_df = train_df.groupby(["context"])["target"].count().reset_index().sort_values(["target"], ascending=False)
grouped_df.head(5)

In [None]:
grouped_df.tail(5)

Let's also show the distribution of number of targets / context.

In [None]:
plt.figure(figsize=(12,6))
plt.title("Distribution of target count/context")
sns.distplot(grouped_df["target"],kde=False,hist=True, bins=80, label=None, color='green')
plt.legend(); plt.show()

## How many context areas and how many anchor/context area?

In [None]:
train_df["context_area"] = train_df["context"].apply(lambda x: str(x)[0])
print(f"Context areas count: {train_df.context_area.nunique()}")
print(f"Context areas: {train_df.context_area.unique()}")

In [None]:
anchor_train_df = train_df[["context_area", "anchor"]].drop_duplicates()
print(f"Context area + anchor: {anchor_train_df.shape[0]}")
grouped_df = anchor_train_df.groupby(["context_area"])["anchor"].count().reset_index().sort_values(["anchor"], ascending=False)
grouped_df.head(10)

In [None]:
plt.figure(figsize=(12,6))
plt.title("Distribution of anchor count/context area")
sns.distplot(grouped_df["anchor"],kde=False,hist=True, bins=80, label=None, color='magenta')
plt.legend(); plt.show()

## What do the A-Y codes mean?

We can provide the specification of the high level **A** to **Y** codes.

In [None]:
table = [
["A", "Human Necessities"],
["B", "Operations and Transport"],
["C", "Chemistry and Metallurgy"],
["D", "Textiles"],
["E", "Fixed Constructions"],
["F", "Mechanical Engineering"],
["G", "Physics"],
["H", "Electricity"],
["Y", "Emerging Cross-Sectional Technologies"]]
table

In [None]:
table_context_df = pd.DataFrame(table)
table_context_df

## Deep dive on the meaning of indexes

Let's deep dive on the meaning of indexes. We will load an additional dataset with the information.

In [None]:
cpc_codes_df = pd.read_csv("/kaggle/input/cpc-codes/titles.csv")

In [None]:
cpc_codes_df.shape

In [None]:
cpc_codes_df["section"].value_counts()

In [None]:
cpc_codes_df.head()

In [None]:
cpc_main_codes_df = cpc_codes_df.loc[cpc_codes_df["class"].isna()]
cpc_main_codes_df

Let's show some wordclouds with the description of the groups (title field). We will represent overall, H, A, B, C.

In [None]:
show_wordcloud(cpc_codes_df["title"], title = '')

In [None]:
show_wordcloud(cpc_codes_df.loc[cpc_codes_df["section"]=="H", "title"], title = '')

In [None]:
show_wordcloud(cpc_codes_df.loc[cpc_codes_df["section"]=="A", "title"], title = '')

In [None]:
show_wordcloud(cpc_codes_df.loc[cpc_codes_df["section"]=="B", "title"], title = '')

In [None]:
show_wordcloud(cpc_codes_df.loc[cpc_codes_df["section"]=="C", "title"], title = '')

## How many targets/context area?

In [None]:
grouped_area_df = train_df.groupby(["context_area"])["target"].count().reset_index().sort_values(["target"], ascending=False)
grouped_area_df

In [None]:
plt.figure(figsize=(12,6))
plt.title("Distribution of target count/context area")
sns.distplot(grouped_area_df["target"],kde=False,hist=True, bins=10, label=None, color="orange")
plt.legend(); plt.show()

## How many context / context area?

In [None]:
grouped_area_ct_df = train_df.groupby(["context_area"])["context"].count().reset_index().sort_values(["context"], ascending=False)
grouped_area_ct_df

In [None]:
plt.figure(figsize=(12,6))
plt.title("Distribution of context count/context area")
sns.distplot(grouped_area_ct_df["context"],kde=False,hist=True, bins=10, label=None, color="darkgreen")
plt.legend(); plt.show()

# Is This a Model?

Let's explore some ways to calculate a similarity score between anchor and target. We will start with very simple ones, based on Levenshtein distance and on the cosine similarity of word embeddings.


## Levenshtein distance 

In [None]:
from fuzzywuzzy import fuzz
def fuzzy_similarity(r):
    """Return ``fuzz.ratio`` of a row's ``anchor`` and ``target``, divided by 100.

    r: object exposing ``anchor`` and ``target`` attributes (used here with
       ``DataFrame.apply(..., axis=1)``, i.e. one row at a time).
    """
    ratio_percent = fuzz.ratio(r.anchor, r.target)
    return ratio_percent / 100.
train_df["fuzzy_simmilarity_score"] = train_df.apply(fuzzy_similarity, axis=1)
train_df.head(2)

Let's also evaluate error of the estimate using this method.

In [None]:
def compute_metrics(predictions, reference):
    """Return the Pearson correlation coefficient between two 1-D sequences.

    predictions: predicted scores.
    reference: ground-truth scores, same length as ``predictions``.
    """
    # np.corrcoef returns the 2x2 correlation matrix; the off-diagonal entry
    # is the correlation between the two inputs.
    correlation_matrix = np.corrcoef(predictions, reference)
    return correlation_matrix[0, 1]

In [None]:
fuzzy_sim_score = compute_metrics(train_df["fuzzy_simmilarity_score"].values, train_df["score"].values)
print(fuzzy_sim_score)

Let's see the validation score if we round the score by fuzzy similarity to values of 0, 0.25, 0.5, 0.75, 1.0.

In [None]:
train_df["fuzzy_simmilarity_score_round"] = round(train_df["fuzzy_simmilarity_score"] * 4.0, 0) / 4.0

In [None]:
fuzzy_sim_score_round = compute_metrics(train_df["fuzzy_simmilarity_score_round"].values, train_df["score"].values)
print(fuzzy_sim_score_round)

The first approximation is better.

## Word embeddings similarity

### Create vocabulary 

In [None]:
def build_vocabulary(texts):
    """
    credits to: https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings 
    credits to: https://www.kaggle.com/anebzt/quora-preprocessing-model

    Count how often every whitespace-separated token occurs.

    input: pandas Series of strings (each entry is split with ``str.split()``)
    output: dictionary mapping each token to its number of occurrences
    """
    tokenized = texts.apply(lambda phrase: phrase.split()).values
    counts = {}
    for tokens in tqdm(tokenized):
        for token in tokens:
            # first occurrence starts from 0, later ones increment the count
            counts[token] = counts.get(token, 0) + 1
    return counts

In [None]:
# populate the vocabulary
df = pd.concat([train_df ,test_df], sort=False)
vocabulary_anchor = build_vocabulary(df['anchor'])
vocabulary_target = build_vocabulary(df['target'])

In [None]:
# display the first 10 elements in anchor vocabulary and their count
print({k: vocabulary_anchor[k] for k in list(vocabulary_anchor)[:10]})

In [None]:
# display the first 10 elements in target vocabulary and their count
print({k: vocabulary_target[k] for k in list(vocabulary_target)[:10]})

### Load embeddings

In [None]:
def load_embeddings(file):
    """
    credits to: https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings 
    credits to: https://www.kaggle.com/anebzt/quora-preprocessing-model

    Read a text embedding file fully into memory.

    input: embeddings file (path); every line is expected to look like
           "<word> <v1> <v2> ..." with single spaces as separators
    output: embedding index (dict mapping each word to a float32 numpy vector)
    """
    def get_coefs(word,*arr): 
        # first field is the word, the remaining fields are the vector values
        return word, np.asarray(arr, dtype='float32')
    # The context manager closes the file handle as soon as the index is built;
    # iterating a bare open(...) inside the generator left it open.
    # NOTE(review): the file is decoded as latin-1, so non-ASCII words will not
    # match UTF-8 vocabulary entries -- confirm this is intended.
    with open(file, encoding='latin') as embeddings_file:
        embeddings_index = dict(get_coefs(*line.split(" ")) for line in embeddings_file)
    return embeddings_index

In [None]:
%%time
GLOVE_PATH = '/kaggle/input/glove840b300dtxt/'
print("Extracting GloVe embedding started")
embed_glove = load_embeddings(os.path.join(GLOVE_PATH,'glove.840B.300d.txt'))
print("Embedding completed")

In [None]:
len(embed_glove)

### Embeddings coverage

Let's check embeddings coverage

In [None]:
def check_coverage(vocab, embeddings_index):
    '''
    credits to: https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings 
    credits to: https://www.kaggle.com/anebzt/quora-preprocessing-model

    Measure how much of a vocabulary is covered by an embedding index.

    input: vocabulary (dict word -> occurrence count), embedding index (mapping word -> vector)
    output: list of (word, count) tuples for the words without an embedding, sorted by
            descending count; also prints the share of distinct vocabulary words and the
            share of all word occurrences that have an embedding
    '''
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word, count in tqdm(vocab.items()):
        # explicit membership test instead of a bare `except:` so that unrelated
        # errors (and KeyboardInterrupt) are no longer swallowed
        if word in embeddings_index:
            known_words[word] = embeddings_index[word]
            nb_known_words += count
        else:
            unknown_words[word] = count
            nb_unknown_words += count
    # guard the ratios so an empty vocabulary reports 0% instead of dividing by zero
    total_words = nb_known_words + nb_unknown_words
    vocab_coverage = len(known_words) / len(vocab) if vocab else 0.0
    text_coverage = nb_known_words / total_words if total_words else 0.0
    print('Found embeddings for {:.3%} of vocabulary'.format(vocab_coverage))
    print('Found embeddings for {:.3%} of all text'.format(text_coverage))
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]
    return unknown_words

In [None]:
print("Verify the intial anchor vocabulary coverage")
oov_glove_anchor = check_coverage(vocabulary_anchor, embed_glove)
print("Verify the intial target vocabulary coverage")
oov_glove_target = check_coverage(vocabulary_target, embed_glove)

In [None]:
print("Anchor (most frequent) words not in embeddings:\n", oov_glove_anchor[:10])
print("Target (most frequent) words not in embeddings:\n", oov_glove_target[:10])

In [None]:
print("Anchor total words not in embeddings: ", len(oov_glove_anchor))
print("Target total words not in embeddings: ", len(oov_glove_target))

### Calculate similarity

Let's now apply the cosine distance to our anchor and target phrases.

In [None]:
EMBED_SIZE = 300

def embeddings(words, embeddings_index=embed_glove):
    """Stack the embedding vectors of ``words`` into a matrix.

    words: sequence of tokens.
    embeddings_index: mapping token -> vector of length EMBED_SIZE
        (defaults to the GloVe index bound when this function is defined).

    Returns an array of shape (len(words) + 1, EMBED_SIZE). Row i holds the
    vector of words[i]; rows of tokens missing from the index, and the extra
    trailing row, stay all-zero.
    """
    matrix = np.zeros((len(words) + 1, EMBED_SIZE))
    for row, token in enumerate(words):
        vector = embeddings_index.get(token)
        if vector is None:
            # unknown token: leave its row as zeros
            continue
        matrix[row] = vector
    return matrix

In [None]:
import gensim
def preprocess(text):
    """Tokenize ``text`` with ``gensim.utils.simple_preprocess`` and filter the tokens.

    Tokens are dropped when they are in gensim's STOPWORDS set or are at most
    two characters long. Returns the remaining tokens as a list, in order.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 2 and token not in stop_words
    ]

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.spatial.distance` is loaded on every SciPy version.
import scipy.spatial.distance
def cosine_distance_wordembedding_method(r):
    """Score the similarity of a row's ``anchor`` and ``target`` phrases.

    Each phrase is tokenized with ``preprocess``, its token vectors are looked
    up with ``embeddings`` and averaged, and the score is
    ``1 - cosine distance`` of the two mean vectors, rounded to 3 decimals.

    Falls back to ``fuzz.ratio(anchor, target) / 100`` when the embedding
    score cannot be computed: a phrase has no token with a known embedding
    (its mean vector is all zeros, so the cosine is undefined), the result is
    not finite, or any step raises.

    r: object exposing ``anchor`` and ``target`` attributes (one DataFrame row).
    """
    try:
        words_1 = preprocess(r.anchor)
        words_2 = preprocess(r.target)
        vector_1 = np.mean(embeddings(words_1), axis = 0)
        vector_2 = np.mean(embeddings(words_2), axis = 0)
        # An all-zero mean vector would give a NaN cosine, which would then
        # poison the correlation metric and the submission; skip it.
        if np.any(vector_1) and np.any(vector_2):
            cosine = scipy.spatial.distance.cosine(vector_1, vector_2)
            similarity = round((1-cosine),3)
            if np.isfinite(similarity):
                return similarity
    except Exception as ex:
        # best-effort: report the problem and use the string-based score instead
        print(ex)
    # default on fuzzy wuzzy ratio (similarity score)
    return fuzz.ratio(r.anchor, r.target) / 100.

In [None]:
train_df["word_embeddings_simmilarity_score"] = train_df.apply(cosine_distance_wordembedding_method, axis=1)

In [None]:
train_df[["anchor", "target", "score", "fuzzy_simmilarity_score", "word_embeddings_simmilarity_score"]].head()

Let's compute the metrics for this score.

In [None]:
word_embeddings_simmilarity_score = compute_metrics(train_df["word_embeddings_simmilarity_score"].values, train_df["score"].values)
print(word_embeddings_simmilarity_score)

And also let's compute the metrics for this score (with the round).

In [None]:
train_df["word_embeddings_simmilarity_score_round"] = round(train_df["word_embeddings_simmilarity_score"] * 4.0, 0) / 4.0
word_embeddings_simmilarity_score_round = compute_metrics(train_df["word_embeddings_simmilarity_score_round"].values, train_df["score"].values)
print(word_embeddings_simmilarity_score_round)

We will use as a baseline submission the result for `word_embeddings_simmilarity_score`.

# Baseline Submission

Let's prepare a submission where we set the score based on word embeddings cosine similarity.

In [None]:
test_df["score"] = test_df.apply(cosine_distance_wordembedding_method, axis=1)

In [None]:
test_df.head()

In [None]:
sample_submission_df.head(2)

In [None]:
test_df[["id", "score"]].to_csv("submission.csv", index=False)