In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt

from nltk.corpus import stopwords

In [None]:
# Materialise NLTK's English stop-word list as a set for fast membership checks.
stop_words = {word for word in stopwords.words('english')}
print(len(stop_words))

In [None]:
# Load the CPC titles table from the Kaggle input directory and preview it.
title_df = pd.read_csv(filepath_or_buffer="../input/cpc-codes/titles.csv")
title_df.head(n=5)

In [None]:
# Read the training pairs and derive simple size features in one chained step:
#   num_anchor_words / num_target_words -> whitespace-separated word counts
#   context_length                      -> number of characters in the context code
train_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv").assign(
    num_anchor_words=lambda frame: [len(phrase.split()) for phrase in frame['anchor']],
    num_target_words=lambda frame: [len(phrase.split()) for phrase in frame['target']],
    context_length=lambda frame: frame['context'].map(len),
)

train_df.head()

In [None]:
# Headline cardinalities of the training set: row count plus distinct values per key column.
print("number of records:", train_df.shape[0])
print("number of contexts:", train_df['context'].nunique())
print("number of anchors:", train_df['anchor'].nunique())
print("number of targets:", train_df['target'].nunique())
# Visual separator followed by two blank lines.
print("==============================")
print()
print()

In [None]:
# Describe word counts over distinct anchors so repeated anchors are counted once.
(
    train_df
    .drop_duplicates(subset=['anchor', 'num_anchor_words'])
    .loc[:, 'num_anchor_words']
    .describe()
)

In [None]:
# Describe word counts over distinct targets so repeated targets are counted once.
(
    train_df
    .drop_duplicates(subset=['target', 'num_target_words'])
    .loc[:, 'num_target_words']
    .describe()
)

In [None]:
# Row-level frequency of each anchor word count (most common first).
train_df['num_anchor_words'].value_counts()

In [None]:
# Row-level frequency of each target word count (most common first).
train_df['num_target_words'].value_counts()

In [None]:
# Horizontal count plot of anchor word counts, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.countplot(data=train_df, y='num_anchor_words', ax=ax)
ax.set_title("Count of the Number Of Words per anchor")
plt.show()

# Let's look at the context length

In [None]:
# How many different context lengths exist, then the frequency of each length.
print("Number Of distinct Context lenghts", train_df['context_length'].nunique())
print()
train_df['context_length'].value_counts()

**The context length is fixed at 3 characters**

From the sources:

https://en.wikipedia.org/wiki/Cooperative_Patent_Classification

https://www.kaggle.com/code/remekkinas/eda-and-feature-engineering


According to these sources, the context only tells us the section and the class to which the patent belongs.

In [None]:
# Human-readable name for each CPC section letter (the first character of `context`).
# The 'Y' label was previously truncated ("Technologie"); it is spelled out in full so
# the `section` column and any plot tick labels read correctly.
section_map = {
    'A': 'Human Necessities',
    'B': 'Operations and Transport',
    'C': 'Chemistry and Metallurgy',
    'D': 'Textiles',
    'E': 'Fixed Constructions',
    'F': 'Mechanical Engineering',
    'G': 'Physics',
    'H': 'Electricity',
    'Y': 'Emerging Cross-Sectional Technologies',
}

# A context is a section letter followed by the class digits (e.g. 'A47').
# An unknown section letter raises KeyError here rather than being silently dropped.
train_df['section'] = train_df.context.apply(lambda x: section_map[x[0]])
train_df['classes'] = train_df.context.apply(lambda x: int(x[1:]))

train_df.head()

In [None]:
# Inspect every pair in context 'A47', ordered from lowest to highest score.
train_df.loc[train_df['context'].eq('A47')].sort_values(by='score')

In [None]:
# One row per distinct (context, section, classes) combination.
context_df = train_df.loc[:, ['context', 'section', 'classes']].drop_duplicates()
# NOTE: the first label says "patents", but the value is the number of distinct rows above.
print("number of patents:", context_df.shape[0])
print("number of sections:", context_df['section'].nunique())

In [None]:
# Number of distinct contexts that fall under each section.
context_df['section'].value_counts()

In [None]:
# Horizontal count plot of distinct contexts per section, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.countplot(data=context_df, y='section', ax=ax)
ax.set_title("distribution of the sections")
plt.show()

1. The train set covers 8 of the sections described above.
2. Electricity and Textiles have fewer records than the other sections.

# Let's check the anchor words per section

In [None]:
# For every anchor, collect the distinct sections it appears in and count them.
anchor_df = (
    train_df[['section', 'anchor']]
    .drop_duplicates()
    .groupby('anchor', as_index=False)[['section']]
    .agg(list)
    .assign(num_sections=lambda frame: frame['section'].map(len))
)

anchor_df.head()

In [None]:
# Summary statistics for the number of sections each anchor appears in.
anchor_df['num_sections'].describe()

In [None]:
# Count plot of how many sections each anchor appears in, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.countplot(data=anchor_df, x='num_sections', ax=ax)
ax.set_title("Distribution of Number of sections Anchor present")
plt.show()

In [None]:
# For every target, collect the distinct sections it appears in and count them.
target_df = (
    train_df[['section', 'target']]
    .drop_duplicates()
    .groupby('target', as_index=False)[['section']]
    .agg(list)
    .assign(num_sections=lambda frame: frame['section'].map(len))
)

target_df.head()

In [None]:
# Summary statistics for the number of sections each target appears in.
target_df['num_sections'].describe()

In [None]:
# Count plot of how many sections each target appears in, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.countplot(data=target_df, x='num_sections', ax=ax)
ax.set_title("Distribution of Number of sections Target present")
plt.show()

# Let's check each section

In [None]:
# For every section, collect its distinct anchors and count them.
# This rebinds `anchor_df`, replacing the per-anchor frame built earlier.
anchor_df = (
    train_df[['section', 'anchor']]
    .drop_duplicates()
    .groupby('section', as_index=False)[['anchor']]
    .agg(list)
    .assign(num_anchors=lambda frame: frame['anchor'].map(len))
)

anchor_df

In [None]:
# Horizontal bar chart of distinct anchors per section, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.barplot(data=anchor_df, y='section', x='num_anchors', ax=ax)
ax.set_title("Number Of Unique Anchors per section")
plt.show()

In [None]:
# For every section, collect its distinct targets and count them.
# This rebinds `target_df`, replacing the per-target frame built earlier.
target_df = (
    train_df[['section', 'target']]
    .drop_duplicates()
    .groupby('section', as_index=False)[['target']]
    .agg(list)
    .assign(num_targets=lambda frame: frame['target'].map(len))
)

target_df.head()

In [None]:
# Horizontal bar chart of distinct targets per section, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.barplot(data=target_df, y='section', x='num_targets', ax=ax)
ax.set_title("Number Of Unique Targets per section")
plt.show()

# Let's check the exact coverage of the target and anchor words in the context title

In [None]:
# Reload both tables from disk; columns added to `train_df` by earlier cells are discarded.
train_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv")
title_df = pd.read_csv("../input/cpc-codes/titles.csv")

# Attach the title of each context code. The inner join drops rows whose
# context has no matching `code` in the titles table.
train_df = pd.merge(
    train_df,
    title_df[['code', 'title']],
    how='inner',
    left_on='context',
    right_on='code',
)

train_df.head()

In [None]:
# Memo of title -> frozenset of its words; rows may repeat the same title string.
_TITLE_WORDS_CACHE = {}


def _split_words(text):
    """Lower-case `text` and split it into alphanumeric words, dropping punctuation."""
    normalised = ''.join(ch if ch.isalnum() else ' ' for ch in text.lower())
    return normalised.split()


def _title_words(title):
    """Return the set of words in `title`, tokenising each distinct title only once."""
    words = _TITLE_WORDS_CACHE.get(title)
    if words is None:
        words = frozenset(_split_words(title))
        _TITLE_WORDS_CACHE[title] = words
    return words


def check_target(row, ignore_words=None):
    """Fraction of the target's informative words that occur in the title.

    Matching is done on whole words, case-insensitively and with punctuation
    stripped. The previous substring test (`t in title`) counted partial hits
    such as "art" inside "articles", which inflated the coverage figure.

    Parameters
    ----------
    row : object with string attributes `title` and `target`
        Typically one row passed by `DataFrame.apply(..., axis=1)`.
    ignore_words : collection of str, optional
        Words excluded from the target before matching. Defaults to the
        notebook-level `stop_words` set.

    Returns
    -------
    float
        Matched words divided by the number of remaining target words, or
        0.0 when no target words remain after filtering.
    """
    if ignore_words is None:
        # Resolved at call time so the function can be defined before the global exists.
        ignore_words = stop_words

    target_words = [word for word in _split_words(row.target) if word not in ignore_words]
    if not target_words:
        return 0.0

    known_words = _title_words(row.title)
    hits = sum(1 for word in target_words if word in known_words)
    return hits / len(target_words)
    
    
# Score every row; axis='columns' hands check_target one row at a time.
train_df['is_target_exists'] = train_df.apply(func=check_target, axis='columns')
train_df.head(n=5)

In [None]:
# Share of rows where at least one target word was found in the title (score != 0).
print(
    "Coverage of the target samples appearing directly in the context text:",
    int(train_df['is_target_exists'].ne(0).sum()) / len(train_df),
)

# Let's group all the titles under each context and check the coverage again

In [None]:
# Reload both tables from disk so columns added by earlier cells do not carry over.
train_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv")
title_df = pd.read_csv("../input/cpc-codes/titles.csv")

# Key every CPC code by its first three characters, the same width as `context`.
title_df['section_class'] = title_df['code'].map(lambda code: code[:3])

# Gather all titles sharing a prefix, then flatten each list into one space-joined text.
title_group_df = title_df.groupby('section_class', as_index=False)[['title']].agg(list)
title_group_df['title'] = title_group_df['title'].map(' '.join)

# Attach the combined text to each training row and re-score the targets against it.
train_df = pd.merge(
    train_df,
    title_group_df,
    how='inner',
    left_on='context',
    right_on='section_class',
)
train_df['is_target_exists'] = train_df.apply(func=check_target, axis='columns')

print(
    "Coverage of the target samples appearing directly in the context text:",
    int(train_df['is_target_exists'].ne(0).sum()) / len(train_df),
)

In [None]:
# Whitespace-separated word count of each combined title text.
title_group_df['num_words'] = [len(text.split()) for text in title_group_df['title']]
title_group_df.head(n=5)

In [None]:
# Keep only the title rows whose code is exactly three characters long.
title_df.loc[title_df['code'].map(len) == 3]

In [None]:
# Preview the titles table, including any columns added by earlier cells.
title_df.head(n=5)