# US Patent Phrase to Phrase Matching - EDA with Plotly

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud,STOPWORDS
from termcolor import colored

# Shared palette of yellow/olive hex colors; the charts in this notebook pick entries by index.
colors = ["#A2A21C", "#CBCB1A", "#E1E10B", "#F6F605", "#838305"]

## Data Description
In this dataset, you are presented pairs of phrases (an anchor and a target phrase) and asked to rate how similar they are on a scale from 0 (not at all similar) to 1 (identical in meaning). This challenge differs from a standard semantic similarity task in that similarity has been scored here within a patent's context, specifically its CPC classification (version 2021.05), which indicates the subject to which the patent relates. For example, while the phrases "bird" and "Cape Cod" may have low semantic similarity in normal language, the likeness of their meaning is much closer if considered in the context of "house".

This is a code competition, in which you will submit code that will be run against an unseen test set. The unseen test set contains approximately 12k pairs of phrases. A small public test set has been provided for testing purposes, but is not used in scoring.

Information on the meaning of CPC codes may be found on the USPTO website. The CPC version 2021.05 can be found on the CPC archive website.

### Score meanings
The scores are in the 0-1 range with increments of 0.25 with the following meanings:

* <b>1.0 - </b>Very close match. This is typically an exact match except possibly for differences in conjugation, quantity (e.g. singular vs. plural), and addition or removal of stopwords (e.g. “the”, “and”, “or”).
* <b>0.75 - </b>Close synonym, e.g. “mobile phone” vs. “cellphone”. This also includes abbreviations, e.g. "TCP" -> "transmission control protocol".
* <b>0.5 - </b>Synonyms which don’t have the same meaning (same function, same properties). This includes broad-narrow (hyponym) and narrow-broad (hypernym) matches.
* <b>0.25 - </b>Somewhat related, e.g. the two phrases are in the same high level domain but are not synonyms. This also includes antonyms.
* <b>0.0 - </b>Unrelated.

### Files
* <b>train.csv - </b>the training set, containing phrases, contexts, and their similarity scores
* <b>test.csv - </b>the test set, identical in structure to the training set but without the score
* <b>sample_submission.csv - </b>a sample submission file in the correct format

### Columns
* <b>id - </b>a unique identifier for a pair of phrases
* **anchor -** the first phrase
* **target -** the second phrase
* **context -** the CPC classification (version 2021.05), which indicates the subject within which the similarity is to be scored
* **score -** the similarity. This is sourced from a combination of one or more manual expert ratings.

## Load Data

In [None]:
# Read the three competition CSV files (train, test, sample submission),
# in that order, into DataFrames.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/us-patent-phrase-to-phrase-matching/train.csv",
        "../input/us-patent-phrase-to-phrase-matching/test.csv",
        "../input/us-patent-phrase-to-phrase-matching/sample_submission.csv",
    )
)

In [None]:
# Preview the first ten rows of the training data.
train.head(n=10)

In [None]:
# Print the DataFrame summary for the training data.
train.info()

In [None]:
# Count missing values per column (isna is the same check as isnull).
train.isna().sum()

#### No null values present in training data

In [None]:
# Number of training pairs per score value; scores are converted to strings
# before counting so the index holds string labels.
score_counts = train["score"].astype(str).value_counts()

# One bar per score label, annotated with its count.
score_bar = go.Bar(
    x=score_counts.index,
    y=score_counts.values,
    text=score_counts.values,
    marker_color=colors[0],
)

fig = go.Figure(data=[score_bar])
fig.update_layout(
    barmode='stack',
    title_text='Scores Distribution',
    xaxis_title='Scores',
    yaxis_title='Counts',
)
fig.show()

#### So pairs with score = 1.0 (very close matches) are relatively rare, while score = 0.5 (synonyms which don't have the same meaning) accounts for more than 12k pairs

In [None]:
# Take the 15 most frequent anchors, then reverse their order
# (presumably so the most frequent anchor is drawn at the top of the
# horizontal bar chart — confirm against the rendered figure).
anchor_counts = train["anchor"].value_counts().head(15).iloc[::-1]

# Horizontal bars: counts on x, anchor phrases on y, count shown as bar text.
anchor_bar = go.Bar(
    x=anchor_counts.values,
    y=anchor_counts.index,
    text=anchor_counts.values,
    marker_color=colors[1],
    orientation='h',
)

fig = go.Figure(data=[anchor_bar])
fig.update_layout(
    barmode='stack',
    title_text='Top 15 anchors count',
    xaxis_title='Counts',
    yaxis_title='Anchor',
    height=600,
)
# Hide the x-axis tick labels; each bar already carries its count as text.
fig.update_xaxes(showticklabels=False)
fig.show()

## Word length and character length of target

In [None]:
# Length features for the target phrase, computed with the vectorized
# string accessor instead of a per-row Python lambda. Unlike the lambda
# version, a missing target yields NaN here rather than raising.
# target_wordlen: number of whitespace-separated tokens in the target.
train['target_wordlen'] = train['target'].str.split().str.len()
# target_charlen: number of characters in the target.
train['target_charlen'] = train['target'].str.len()
train.head()

In [None]:
# Histogram of the number of words per target phrase.
fig = px.histogram(
    data_frame=train,
    x='target_wordlen',
    title='Target word length distribution',
    color_discrete_sequence=[colors[2]],
)
fig.show()

In [None]:
# Keep only the rows whose target has ten or more words, filtering once.
long_rows = train.loc[train['target_wordlen'] >= 10, ['anchor', 'target']]
anchors = long_rows['anchor'].values
max_targets = long_rows['target'].values

# Print every anchor next to its long target; the target is colored yellow.
for anchor_phrase, target_phrase in zip(anchors, max_targets):
    print(f"{anchor_phrase} : {colored(target_phrase, 'yellow')}")

#### Most of the targets with the highest word counts are chemical formulas

In [None]:
# Histogram of the number of characters per target phrase.
fig = px.histogram(
    data_frame=train,
    x='target_charlen',
    title='Target character length distribution',
    color_discrete_sequence=[colors[2]],
)
fig.show()

In [None]:
# Number of training pairs per context (CPC classification code).
context_counts = train['context'].value_counts()

# One bar per context code.
context_bar = go.Bar(
    x=context_counts.index,
    y=context_counts.values,
    marker_color=colors[0],
)

fig = go.Figure(data=[context_bar])
fig.update_layout(
    barmode='stack',
    title_text='Context Distribution',
    xaxis_title='Context',
    yaxis_title='Counts',
)
fig.show()

In [None]:
# Preview the top of the training frame again before building the word clouds.
train.head()

## Word Cloud

In [None]:
# Concatenate every anchor phrase into one space-separated string.
anchor_text = ' '.join(train['anchor'])

# Word cloud settings.
# NOTE(review): per the wordcloud docs the contour_* options appear to apply
# only when a mask is supplied — confirm whether they are needed here.
anchor_cloud_options = dict(
    stopwords=STOPWORDS,
    background_color="black",
    contour_width=2,
    contour_color='yellow',
    width=1500,
    height=750,
    max_font_size=256,
    max_words=150,
    random_state=1,
)
wc = WordCloud(**anchor_cloud_options)
wc.generate(anchor_text)

# Draw the cloud on a 16x16 inch figure with the axes hidden.
anchor_fig, anchor_ax = plt.subplots(figsize=(16, 16))
anchor_ax.imshow(wc, interpolation="bilinear")
anchor_ax.axis('off')
plt.show()

In [None]:
# Concatenate every target phrase into one space-separated string.
target_text = ' '.join(train['target'])

# Word cloud settings.
# NOTE(review): per the wordcloud docs the contour_* options appear to apply
# only when a mask is supplied — confirm whether they are needed here.
target_cloud_options = dict(
    stopwords=STOPWORDS,
    background_color="black",
    contour_width=2,
    contour_color='yellow',
    width=1500,
    height=750,
    max_font_size=256,
    max_words=150,
    random_state=1,
)
wc = WordCloud(**target_cloud_options)
wc.generate(target_text)

# Draw the cloud on a 16x16 inch figure with the axes hidden.
target_fig, target_ax = plt.subplots(figsize=(16, 16))
target_ax.imshow(wc, interpolation="bilinear")
target_ax.axis('off')
plt.show()

### Thank you for visiting
### Please upvote this notebook if you find it useful.