# Data EDA

 - reference
  - https://www.kaggle.com/code/remekkinas/eda-and-feature-engineering
  - https://www.kaggle.com/code/utcarshagrawal/usppm-complete-eda-pytorch-baseline
  - https://www.kaggle.com/code/hasanbasriakcay/patent-phrase-matching-eda-fe-baseline
  - https://www.kaggle.com/code/jhoward/iterate-like-a-grandmaster
  - https://www.kaggle.com/code/don9wankim/us-patent-eda-for-kokr
  

## USPPM train, test data

In [None]:
import numpy as np
import pandas as pd

from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from termcolor import colored
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
import torch
import os
import re

warnings.filterwarnings("ignore")

In [None]:
# Load the train and test data from the competition input directory
train_data = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/train.csv')
test_data = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/test.csv')

In [None]:
# Check the train, test shape
print(f"TRAIN DATA : {train_data.shape}")
print(f"TEST DATA : {test_data.shape}")

**Columns**

* id - a unique identifier for a pair of phrases
* anchor - the first phrase
* target - the second phrase
* context - the CPC classification (version 2021.05), which indicates the subject within which the similarity is to be scored
* score - the similarity. This is sourced from a combination of one or more manual expert ratings.

In [None]:
train_data.sample(5)

In [None]:
test_data.sample(5)

**Score meanings**

* 1.0 - Very close match. This is typically an exact match except possibly for differences in conjugation, quantity (e.g. singular vs. plural), and addition or removal of stopwords (e.g. “the”, “and”, “or”).

* 0.75 - Close synonym, e.g. “mobile phone” vs. “cellphone”. This also includes abbreviations, e.g. "TCP" -> "transmission control protocol".

* 0.5 - Synonyms which don’t have the same meaning (same function, same properties). This includes broad-narrow (hyponym) and narrow-broad (hypernym) matches.

* 0.25 - Somewhat related, e.g. the two phrases are in the same high level domain but are not synonyms. This also includes antonyms.

* 0.0 - Unrelated.

In [None]:
# Plot a histogram of the training scores to check their distribution
train_data['score'].hist()

In [None]:
# Number of words in each anchor phrase

# Iterate over the column values directly instead of looking rows up by label
# with a positional counter, so the result does not depend on the index.
leng = [len(phrase.split()) for phrase in train_data['anchor']]

# (word count, number of anchors) pairs, most frequent first
counter = Counter(leng).most_common()

length = [key for key, _ in counter]
freq = [value for _, value in counter]

# One bar per distinct word count, in most_common() order
x = np.arange(len(length))

plt.bar(x, freq)
plt.xticks(x, length)

# Write each bar's frequency just above the bar
for bar_pos, count in zip(x, freq):
    plt.text(bar_pos, count, count,   # x = bar position, y = bar height, label = frequency
             fontsize=9,
             color='black',
             horizontalalignment='center',  # horizontalalignment (left, center, right)
             verticalalignment='bottom')

plt.show()

In [None]:
# Number of words in each target phrase

# Iterate over the column values directly instead of looking rows up by label
# with a positional counter, so the result does not depend on the index.
leng = [len(phrase.split()) for phrase in train_data['target']]

# (word count, number of targets) pairs, most frequent first
counter = Counter(leng).most_common()

length = [key for key, _ in counter]
freq = [value for _, value in counter]

# One bar per distinct word count, in most_common() order
x = np.arange(len(length))

plt.bar(x, freq)
plt.xticks(x, length)

# Write each bar's frequency just above the bar
for bar_pos, count in zip(x, freq):
    plt.text(bar_pos, count, count,   # x = bar position, y = bar height, label = frequency
             fontsize=9,
             color='black',
             horizontalalignment='center',  # horizontalalignment (left, center, right)
             verticalalignment='bottom')

plt.show()

# CPC Data

In [None]:
# Read CPC data

cpc_data = pd.read_csv('../input/cpc-codes/titles.csv')

In [None]:
# Short English name for each CPC section letter
# (the first character of a context code such as "A47").
context_dict = {
    'A': 'Human Necessities',
    'B': 'Operations and Transport',
    'C': 'Chemistry and Metallurgy',
    'D': 'Textiles',
    'E': 'Fixed Constructions',
    'F': 'Mechanical Engineering',
    'G': 'Physics',
    'H': 'Electricity',
    'Y': 'Emerging Cross-Sectional Technologies',
}

In [None]:
cpc_data.sample(5)

In [None]:
# Merge Data ( CPC + USPPM )

def get_cpc_texts():
    """Build a mapping from CPC context code to a descriptive title string.

    The context codes (one capital letter followed by digits, e.g. ``A47``)
    are collected from the file names in
    ``../input/cpc-data/CPCSchemeXML202105``. For every section letter the
    matching title list ``cpc-section-<letter>_20220201.txt`` is read and the
    title following ``<code>\\t\\t`` is looked up for the section itself and
    for each context code belonging to that section.

    Returns:
        dict: ``{context_code: "<section title>. <context title>"}``.

    Raises:
        IndexError: if a section or context code has no ``<code>\\t\\t<title>``
            entry in the section's title list.
    """
    code_pattern = re.compile(r'[A-Z]\d+')

    def first_title(code, text):
        # Capture the title with a group. The previous implementation used
        # ``match.lstrip(pattern)``, but str.lstrip removes a *set of
        # characters*, not a prefix, so titles beginning with a character of
        # the code were truncated ("CHEMISTRY" -> "HEMISTRY" for section C).
        return re.findall(rf'{code}\t\t(.+)', text)[0]

    # Unique context codes found in the scheme file names, in sorted order.
    contexts = set()
    for file_name in os.listdir('../input/cpc-data/CPCSchemeXML202105'):
        contexts.update(code_pattern.findall(file_name))
    contexts = sorted(contexts)

    results = {}
    for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        with open(f'../input/cpc-data/CPCTitleList202202/cpc-section-{cpc}_20220201.txt') as f:
            s = f.read()
        cpc_result = first_title(cpc, s)

        # Only the codes of the current section are described in this file.
        for context in contexts:
            if context[0] == cpc:
                results[context] = cpc_result + ". " + first_title(context, s)

    return results

In [None]:
# Build the context code -> title mapping and save it to ./cpc_texts.pth
cpc_texts = get_cpc_texts()
torch.save(cpc_texts, "./cpc_texts.pth")
# Look up the title text for every training row's context code
train_data['context_text'] = train_data['context'].map(cpc_texts)
# The same mapping for the test data is currently disabled
#test_data['context_text'] = test_data['context'].map(cpc_texts)

train_data.sample(5)

In [None]:
# Join anchor, target and context title into one string, separated by the literal '[SEP]'
train_data['text'] = train_data['anchor'] + '[SEP]' + train_data['target'] + '[SEP]'  + train_data['context_text']
# The same concatenation for the test data is currently disabled
#test_data['text'] = test_data['anchor'] + '[SEP]' + test_data['target'] + '[SEP]'  + test_data['context_text']

train_data.sample(5)

# Submission file

In [None]:
sub = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/sample_submission.csv")

sub.head()

# Submission file analyze

* #### Compare the three models with the highest submission scores

In [None]:
# Comparison of submission results by model

# 4ensemble : Deberta-v3-large + bert-for-patent-5fold + deberta-large-v1 + xml-roberta-large-5folds
# deberta_electra : Deberta-v3-large + Electra
# 2de+Ro : Deberta-v3-large * 2 + Roberta

# 1 --> 4ensemble score
# 2 --> deberta_ele score
# 3 --> 2de + 1Ro score

sub = pd.read_csv("../input/usppm-all-submission/all_submission.csv")

sub.head()

# if NaN --> Same Score

In [None]:
sub.isna().sum()  # --> It can be seen that 18 out of 36 points are certain

In [None]:
sub_remove = sub.dropna()

In [None]:
sub_remove

In most cases, it can be seen that the **value of 0** is not well distinguished.

Taking the 4ensemble as the reference because it performed best, the predictions of 2de+1Ro were the most similar to it, whereas the model combining DeBERTa and ELECTRA showed large differences in score.

## Future Work

Reinforcement learning and addition of insufficient data are required for areas with different scores

In particular, we believe it will be important to improve how well the models recognize a score of 0.