In [None]:
# Notebook-wide CSS: defines the "top_section" class used by the section banner headings.
# The display helpers are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated and fails on recent IPython releases.
from IPython.display import display, HTML, Javascript
html_contents ="""
<!DOCTYPE html>
<html lang="en">
    <head>
    <style>
    
    .top_section{
        background-color: #66F3ED;
        color: white;
        font-family: Copperplate, Papyrus, fantasy;
        font-weight: 800;
        font-size: 35px;
        padding: 20px 14px;
        margin-bottom: 20px;
    }
    
 
    </style>
    </head>
    
</html>
"""

# Last expression of the cell, so the HTML object (and its <style> block) is rendered as the cell output.
HTML(html_contents)

<center><strong><h1> <div class="top_section">Competition Description</div></h1></strong></center>


Can you extract meaning from a large, text-based dataset derived from inventions? Here's your chance to do so.

The U.S. Patent and Trademark Office (USPTO) offers one of the largest repositories of scientific, technical, and commercial information in the world through its Open Data Portal. Patents are a form of intellectual property granted in exchange for the public disclosure of new and useful inventions. Because patents undergo an intensive vetting process prior to grant, and because the history of U.S. innovation spans over two centuries and 11 million patents, the U.S. patent archives stand as a rare combination of data volume, quality, and diversity.


In this competition, you will train your models on a novel semantic similarity dataset to extract relevant information by matching key phrases in patent documents. Determining the semantic similarity between phrases is critically important during the patent search and examination process to determine if an invention has been described before. For example, if one invention claims "television set" and a prior publication describes "TV set", a model would ideally recognize these are the same and assist a patent attorney or examiner in retrieving relevant documents. This extends beyond paraphrase identification; if one invention claims a "strong material" and another uses "steel", that may also be a match. What counts as a "strong material" varies per domain (it may be steel in one domain and ripstop fabric in another, but you wouldn't want your parachute made of steel). We have included the Cooperative Patent Classification as the technical domain context as an additional feature to help you disambiguate these situations.




<center><strong><h1> <div class="top_section">Data Description</div></h1></strong></center>


In this dataset, you are presented with pairs of phrases (an anchor and a target phrase) and asked to rate how similar they are on a scale from 0 (not at all similar) to 1 (identical in meaning). This challenge differs from a standard semantic similarity task in that similarity has been scored here within a patent's context, specifically its CPC classification (version 2021.05), which indicates the subject to which the patent relates. For example, while the phrases "bird" and "Cape Cod" may have low semantic similarity in normal language, the likeness of their meaning is much closer if considered in the context of "house".

This is a code competition, in which you will submit code that will be run against an unseen test set. The unseen test set contains approximately 12k pairs of phrases. A small public test set has been provided for testing purposes, but is not used in scoring.

Information on the meaning of CPC codes may be found on the USPTO website. The CPC version 2021.05 can be found on the CPC archive website.

## Score meanings
The scores are in the 0-1 range with increments of 0.25 with the following meanings:

* **1.0** - Very close match. This is typically an exact match except possibly for differences in conjugation, quantity (e.g. singular vs. plural), and addition or removal of stopwords (e.g. “the”, “and”, “or”).
* **0.75** - Close synonym, e.g. “mobile phone” vs. “cellphone”. This also includes abbreviations, e.g. "TCP" -> "transmission control protocol".
* **0.5** - Synonyms which don’t have the same meaning (same function, same properties). This includes broad-narrow (hyponym) and narrow-broad (hypernym) matches.
* **0.25** - Somewhat related, e.g. the two phrases are in the same high level domain but are not synonyms. This also includes antonyms.
* **0.0** - Unrelated.

## Files
* **train.csv** - the training set, containing phrases, contexts, and their similarity scores
* **test.csv** - the test set, identical in structure to the training set but without the score
* **sample_submission.csv** - a sample submission file in the correct format


## Columns
* `id` - a unique identifier for a pair of phrases
* `anchor` - the first phrase
* `target` - the second phrase
* `context` - the CPC classification (version 2021.05), which indicates the subject within which the similarity is to be scored
* `score` - the similarity. This is sourced from a combination of one or more manual expert ratings.


<center><strong><h1> <div class="top_section">IMPORTS</div></h1></strong></center>


In [None]:
import pandas as pd

import re
from typing import Union, List
import string
from wordcloud import WordCloud


In [None]:
import pandas as pd
from wordcloud import WordCloud
import seaborn as sns
import re
import string
from collections import Counter, defaultdict

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import plot

import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import warnings
# Silence every warning for the rest of the notebook (this also hides deprecation notices from libraries).
warnings.filterwarnings('ignore')
import nltk
# Fetch the NLTK 'stopwords' corpus so the lookup below can succeed.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stopwords kept as a set for fast membership tests when filtering frequent words.
stopWords_nltk = set(stopwords.words('english'))


<center><strong><h1> <div class="top_section">Helper Functions</div></h1></strong></center>


In [None]:
class CleanText():
    """Replace every character outside an allowed set with a single space.

    By default the allowed set is ASCII and Turkish letters, digits and the
    punctuation marks ``. , ( ) ' "``; anything else (including whitespace)
    becomes ``" "``. Runs of disallowed characters are not collapsed.

    Parameters
    ----------
    clean_pattern : str
        Regular expression matching the characters to blank out.
    """

    def __init__(self, clean_pattern = r"[^A-ZĞÜŞİÖÇIa-zğüı'şöç0-9.\"',()]"):
        self.clean_pattern = clean_pattern

    def __call__(self, text: Union[str, list]) -> List[List[str]]:
        """Clean a string or a list of documents.

        Parameters
        ----------
        text : str or list
            A single string, or a list whose items are either documents
            (lists of sentence strings) or bare sentence strings.

        Returns
        -------
        List[List[str]]
            One inner list per document, holding its cleaned sentences.
            A single string is returned as ``[[cleaned]]``.

        Raises
        ------
        TypeError
            If ``text`` is neither a ``str`` nor a ``list``.
        """
        if isinstance(text, str):
            docs = [[text]]
        elif isinstance(text, list):
            # A bare string inside the list is wrapped as a one-sentence document;
            # iterating it directly would clean it character by character.
            docs = [[doc] if isinstance(doc, str) else doc for doc in text]
        else:
            # Previously this fell through and failed on an unbound local variable.
            raise TypeError(
                f"CleanText expects a str or a list, got {type(text).__name__}"
            )

        return [[re.sub(self.clean_pattern, " ", sent) for sent in sents] for sents in docs]
    

def tokenize(text):
    """Split text into letter runs, digit runs and single non-word characters.

    The input is converted with ``str``, repeated spaces are squeezed to one,
    and the resulting tokens are returned joined by single spaces. Empty
    pieces and lone spaces are dropped.
    """
    squeezed = re.sub(r" +", " ", str(text))
    pieces = re.split(r"(\d+|[a-zA-ZğüşıöçĞÜŞİÖÇ]+|\W)", squeezed)
    kept = [piece for piece in pieces if piece not in ('', ' ')]
    return ' '.join(kept)

# Character class that matches any single character from string.punctuation.
_punctuation_class = '[%s]' % re.escape(string.punctuation)
regex = re.compile(_punctuation_class)

# Cleaner instance built with the default pattern.
clean = CleanText()

In [None]:
# Load the competition's training and test tables; the paths are relative to the notebook's working directory.
train_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv")
test_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")

In [None]:
# (rows, columns) of each table.
print("TRAIN Shape: ", train_df.shape)
print("TEST Shape: ", test_df.shape)

In [None]:
# Preview the first 10 training rows.
train_df.head(10)

In [None]:
# Column dtypes and non-null counts for the training table.
train_df.info()

In [None]:
# Number of missing values per column in the training table.
train_df.isnull().sum(axis = 0)

In [None]:
# Preview the first 10 test rows.
test_df.head(10)

In [None]:
# Column dtypes and non-null counts for the test table.
test_df.info()

In [None]:
# Number of missing values per column in the test table.
test_df.isnull().sum(axis = 0)

In [None]:
# How many training rows share each anchor phrase, most frequent first.
train_df.anchor.value_counts()

<center><strong><h1> <div class="top_section">EDA</div></h1></strong></center>


In [None]:
# Number of training rows per anchor phrase; each anchor gets its own colour.
import plotly.express as px

anchor_hist_options = dict(
    x='anchor',
    color='anchor',
    color_discrete_sequence=px.colors.sequential.Blues_r,
    template='ggplot2',
    opacity=0.8,
    width=835,
    height=525,
)
fig = px.histogram(train_df, **anchor_hist_options)

fig.update_yaxes(title='Count')
fig.show()

In [None]:
def show_wordcloud(data, title = None):
    """Draw a word cloud built from an iterable of text snippets.

    Parameters
    ----------
    data : iterable
        Text values, e.g. the ``.values`` of a DataFrame column. Missing
        entries (None/NaN) are skipped and every other entry is converted
        with ``str`` before being joined into one space-separated text.
    title : str, optional
        Figure title; nothing is drawn when it is falsy.
    """
    # A plain " ".join(data) raises TypeError as soon as one entry is not a
    # string (for instance a NaN in a text column), so filter and coerce first.
    joined_text = " ".join(str(item) for item in data if pd.notna(item))

    wordcloud = WordCloud(
        background_color='black',
        max_words=200,
        max_font_size=40,
        scale=1,
        random_state=1
    ).generate(joined_text)

    fig = plt.figure(1, figsize=(15, 15))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

## **Word Cloud for Anchor**

In [None]:
# Word cloud over all anchor phrases in the training table (no title passed).
show_wordcloud(train_df["anchor"].values)


## **Word Cloud for Target**

In [None]:
# Word cloud over all target phrases in the training table (no title passed).
show_wordcloud(train_df["target"].values)


### **Token Counts with simple tokenizer**

Finding out the number of tokens available for each sample will give us information about the length of our data. The classification algorithm we will use for a long text will not be the same as the algorithm used for a short text.

In [None]:
# Tokenize both phrase columns with the simple regex tokenizer.
train_df["tokenized_anchor"] = train_df["anchor"].map(tokenize)
train_df["tokenized_target"] = train_df["target"].map(tokenize)


def _token_count(tokenized):
    """Number of whitespace-separated tokens in an already tokenized string."""
    return len(tokenized.split())


# Token count of every tokenized phrase.
train_df["anchor_token_length"] = train_df["tokenized_anchor"].map(_token_count)
train_df["target_token_length"] = train_df["tokenized_target"].map(_token_count)

## **Token Length for Anchor**

In [None]:
# Share of anchor phrases (in percent) per token count.
fig = px.histogram(
    train_df,
    x="anchor_token_length",
    histnorm="percent",
    nbins=20,
    barmode='group',
    color_discrete_sequence=px.colors.cmocean.algae,
)
fig.show()

## **Token Length for Target**

In [None]:
# Share of target phrases (in percent) per token count.
fig = px.histogram(
    train_df,
    x="target_token_length",
    histnorm="percent",
    nbins=20,
    barmode='group',
    color_discrete_sequence=px.colors.cmocean.algae,
)
fig.show()

In [None]:
# Calculate the character count of every target and anchor phrase.
def _char_count(value):
    """Length of the value's string representation."""
    return len(str(value))


train_df['target_char_count'] = train_df['target'].map(_char_count)
train_df['anchor_char_count'] = train_df['anchor'].map(_char_count)


def plot_dist3(df, feature, title):
    """Plot a histogram with a KDE curve for one column of a DataFrame.

    Parameters
    ----------
    df : DataFrame
        Table that holds the column to plot.
    feature : str
        Label of the column whose distribution is drawn.
    title : str
        Text used as the figure's super-title.
    """
    fig = plt.figure(constrained_layout=True, figsize=(18, 8))
    grid = gridspec.GridSpec(ncols=3, nrows=3, figure=fig)
    # Only the first row, first two columns of the 3x3 grid are used.
    ax1 = fig.add_subplot(grid[0, :2])
    ax1.set_title('Histogram')
    # sns.distplot is deprecated; histplot with kde=True on a density scale
    # (and the KDE extended past the data range via cut=3) is its replacement.
    sns.histplot(df.loc[:, feature],
                 kde=True,
                 stat="density",
                 kde_kws={"cut": 3},
                 ax=ax1,
                 )
    ax1.set(ylabel='Frequency')
    # Cap the number of x-axis tick intervals at 20.
    ax1.xaxis.set_major_locator(MaxNLocator(nbins=20))


    plt.suptitle(f'{title}', fontsize=24)

In [None]:
# Distribution of the per-phrase character counts of the target column.
plot_dist3(train_df, 'target_char_count',
           'Characters Count in target')

In [None]:
# Distribution of the per-phrase character counts of the anchor column.
plot_dist3(train_df, 'anchor_char_count',
           'Characters Count in anchor')

### **Most Common Words**

In [None]:
# Count every token across all tokenized anchor phrases.
texts = train_df.tokenized_anchor
new = texts.str.split().values.tolist()
corpus = [word for tokens in new for word in tokens]
counter = Counter(corpus)
most = counter.most_common()

# From the 30 most frequent tokens, keep those that are not English stopwords.
top_words = [(word, count) for word, count in most[:30] if word not in stopWords_nltk]
x = [word for word, _ in top_words]
y = [count for _, count in top_words]

bar_marker = dict(
    color='rgba(50, 171, 96, 0.6)',
    line=dict(color='rgba(50, 171, 96, 1.0)', width=1),
)
# Horizontal bars: counts run along the x axis, words along the y axis.
fig = go.Figure(go.Bar(
    x=y,
    y=x,
    orientation='h',
    marker=bar_marker,
    name='Most common Word',
))

title_layout = {
    'text': "Most Common Words in Anchor",
    'y': 0.9,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
}
label_font = dict(family="Courier New, monospace", size=18, color="RebeccaPurple")
fig.update_layout(title=title_layout, font=label_font)

fig.show()

In [None]:
# Count every token across all tokenized target phrases.
texts = train_df.tokenized_target
new = texts.str.split().values.tolist()
corpus = [word for tokens in new for word in tokens]
counter = Counter(corpus)
most = counter.most_common()

# From the 30 most frequent tokens, keep those that are not English stopwords.
top_words = [(word, count) for word, count in most[:30] if word not in stopWords_nltk]
x = [word for word, _ in top_words]
y = [count for _, count in top_words]

bar_marker = dict(
    color='rgba(50, 171, 96, 0.6)',
    line=dict(color='rgba(50, 171, 96, 1.0)', width=1),
)
# Horizontal bars: counts run along the x axis, words along the y axis.
fig = go.Figure(go.Bar(
    x=y,
    y=x,
    orientation='h',
    marker=bar_marker,
    name='Most common Word',
))

title_layout = {
    'text': "Most Common Words in Target",
    'y': 0.9,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
}
label_font = dict(family="Courier New, monospace", size=18, color="RebeccaPurple")
fig.update_layout(title=title_layout, font=label_font)

fig.show()

In [None]:
# Maps a one-letter section code to a short human-readable section title.
context_dict = dict([
    ('A', 'Human Necessities'),
    ('B', 'Operations and Transport'),
    ('C', 'Chemistry and Metallurgy'),
    ('D', 'Textiles'),
    ('E', 'Fixed Constructions'),
    ('F', 'Mechanical Engineering'),
    ('G', 'Physics'),
    ('H', 'Electricity'),
    ('Y', 'Emerging Cross-Sectional Technologies'),
])


In [None]:
# Length of the longest string in the context column.
train_df['context'].str.len().max()

In [None]:
# Show the training table, now including the token and character count columns added above.
train_df

In [None]:
# Number of training rows for each distinct score value.
score_counts = train_df.score.value_counts()
fig = px.bar(
    train_df,
    x=score_counts.index,
    y=score_counts.values,
)

fig.update_yaxes(title='Count')
fig.show()