## References:
- The data visualization part is adapted from this amazing notebook: https://www.kaggle.com/code/aishwarya2210/prediction-of-tweets-using-bert-model/notebook
- Context Code Meaning: https://en.wikipedia.org/wiki/Cooperative_Patent_Classification

In [None]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

## Imports

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from wordcloud import STOPWORDS
from wordcloud import WordCloud

## Config Variables

In [None]:
# Dataset directory and the two CSV files that are read from it.
ROOT_PATH = "/kaggle/input/us-patent-phrase-to-phrase-matching"
TRAIN_PATH, TEST_PATH = (
    os.path.join(ROOT_PATH, csv_name) for csv_name in ("train.csv", "test.csv")
)

## Load Data

In [None]:
# Read both CSV files, then report the (rows, columns) shape of each frame.
df_train = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)
print(
    f"df_train.shape={df_train.shape}",
    f"df_test.shape={df_test.shape}",
    sep="\n",
)

In [None]:
# Preview the first five rows loaded from train.csv.
df_train.head(n=5)

In [None]:
# Preview the first five rows loaded from test.csv.
df_test.head(n=5)

## EDA on train data

In [None]:
# Number of train rows per context code, with x tick labels rotated 45 degrees.
fig, ax = plt.subplots(figsize=(30, 15))
sns.countplot(data=df_train, x='context', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)
# Remove the axes name from the notebook namespace; `fig` stays defined.
del ax

In [None]:
# Number of train rows per anchor phrase, with x tick labels rotated 45 degrees.
fig, ax = plt.subplots(figsize=(30, 15))
sns.countplot(data=df_train, x='anchor', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)
# Remove the axes name from the notebook namespace; `fig` stays defined.
del ax

## EDA on test data

In [None]:
# Number of test rows per context code, with x tick labels rotated 45 degrees.
fig, ax = plt.subplots(figsize=(30, 15))
sns.countplot(data=df_test, x='context', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)
# Remove the axes name from the notebook namespace; `fig` stays defined.
del ax

In [None]:
# Number of test rows per anchor phrase, with x tick labels rotated 45 degrees.
fig, ax = plt.subplots(figsize=(30, 15))
sns.countplot(data=df_test, x='anchor', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)
# Remove the axes name from the notebook namespace; `fig` stays defined.
del ax

## Context Code Broad Meaning (source link: https://en.wikipedia.org/wiki/Cooperative_Patent_Classification)

In [None]:
# Maps the leading letter of a context code (its CPC section) to the
# section's broad title.
CONTEXT_DICT = dict(
    A="Human Necessities",
    B="Operations and Transport",
    C="Chemistry and Metallurgy",
    D="Textiles",
    E="Fixed Constructions",
    F="Mechanical Engineering",
    G="Physics",
    H="Electricity",
    Y="Emerging Cross-Sectional Technologies",
)

## Score Bins details

The scores are in the 0-1 range with increments of 0.25 with the following meanings:
- 1.0 - Very close match. This is typically an exact match except possibly for differences in conjugation, quantity (e.g. singular vs. plural), and addition or removal of stopwords (e.g. “the”, “and”, “or”).
- 0.75 - Close synonym, e.g. “mobile phone” vs. “cellphone”. This also includes abbreviations, e.g. "TCP" -> "transmission control protocol".
- 0.5 - Synonyms which don’t have the same meaning (same function, same properties). This includes broad-narrow (hyponym) and narrow-broad (hypernym) matches.
- 0.25 - Somewhat related, e.g. the two phrases are in the same high level domain but are not synonyms. This also includes antonyms.
- 0.0 - Unrelated.

## Context details from train data

In [None]:
# Distinct context codes occurring in the train data, in order of first appearance.
train_context_arr = pd.unique(df_train['context'].values)
print()
print(f"train_context_arr={train_context_arr}\n")
print(f"len(train_context_arr)={len(train_context_arr)}")

## Context details from test data

In [None]:
# Distinct context codes occurring in the test data, in order of first appearance.
test_context_arr = pd.unique(df_test['context'].values)
print()
print(f"test_context_arr={test_context_arr}\n")
print(f"len(test_context_arr)={len(test_context_arr)}")

In [None]:
# Context codes of the test data that also occur in the train data.
common_ctx_arr = [ctx for ctx in test_context_arr if ctx in train_context_arr]

print()
print(f"common_ctx_arr={common_ctx_arr}\n")
print(f"len(common_ctx_arr)={len(common_ctx_arr)}")

## WordCloud/Context on train data

In [None]:
# WordCloud per context, built from train targets whose score is >= 0.5.

# Strip English stop words from every target phrase. This overwrites
# df_train['target'] in place, so any later cell sees the filtered phrases.
stop_words = set(stopwords.words('english'))
df_train['target'] = df_train['target'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_words])
)

for ctx in train_context_arr:
    # Threshold comparison matches the "Score >= 0.5" figure title and
    # replaces three separate float equality tests.
    patents_by_ctx = df_train[(df_train.context == ctx) & (df_train.score >= 0.5)]
    # Join the selected target phrases straight from the column; this also
    # yields an empty string when no row of this context passes the filter.
    patents_string = patents_by_ctx.target.str.cat(sep=' ')
    try:
        wordcloud = WordCloud(width=1600, height=800, max_font_size=100, background_color='white').generate(patents_string)
    except ValueError:
        # WordCloud raises ValueError when the text contains no drawable
        # words; skip this context instead of aborting the whole loop.
        print("Skipping context {}: no words to plot".format(ctx))
        continue
    fig = plt.figure(figsize=(50,20))
    # The first character of the context code selects the broad section title.
    ctx_score_title = "Context: " + ctx + "(" + CONTEXT_DICT[ctx[0]] + "), Score >= 0.5"
    plt.suptitle(ctx_score_title, fontsize=20)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    # Release the figure so one open figure per context does not pile up.
    plt.close(fig)
    print("============================================================================================================================================================")

## WordCloud/Context on test data

In [None]:
# WordCloud per context, built from all test targets.

# Strip English stop words from every target phrase. This overwrites
# df_test['target'] in place, so any later cell sees the filtered phrases.
stop_words = set(stopwords.words('english'))
df_test['target'] = df_test['target'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_words])
)

for ctx in test_context_arr:
    patents_by_ctx = df_test[df_test.context == ctx]
    # Join the target phrases of this context straight from the column.
    patents_string = patents_by_ctx.target.str.cat(sep=' ')
    try:
        wordcloud = WordCloud(width=1600, height=800, max_font_size=100, background_color='white').generate(patents_string)
    except ValueError:
        # WordCloud raises ValueError when the text contains no drawable
        # words; skip this context instead of aborting the whole loop.
        print("Skipping context {}: no words to plot".format(ctx))
        continue
    fig = plt.figure(figsize=(50,20))
    # The first character of the context code selects the broad section title.
    ctx_score_title = "Context: " + ctx + "(" + CONTEXT_DICT[ctx[0]] + ")"
    plt.suptitle(ctx_score_title, fontsize=20)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    # Release the figure so one open figure per context does not pile up.
    plt.close(fig)
    print("============================================================================================================================================================")

## Adding some meta-features

## Visualizing meta-features