In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from termcolor import colored
import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud , STOPWORDS
import bq_helper
from bq_helper import BigQueryHelper
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the train and test splits of the phrase-matching dataset from ../input.
train_path = '../input/us-patent-phrase-to-phrase-matching/train.csv'
test_path = '../input/us-patent-phrase-to-phrase-matching/test.csv'
train_df = pd.read_csv(filepath_or_buffer=train_path)
test_df = pd.read_csv(filepath_or_buffer=test_path)


In [None]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

In [None]:
# Count distinct target phrases; dropna=False keeps a missing value (if any)
# in the tally, exactly as counting the entries of unique() does.
train_df['target'].nunique(dropna=False)

In [None]:
# Show 10 randomly chosen rows. The seed is fixed so that the displayed rows
# are the same after every "Restart & Run All".
train_df.sample(10, random_state=42)


In [None]:
# Per-column count of missing values.
train_df.isna().sum()

In [None]:
# Rows that repeat an earlier row on every column except `id`.
is_repeat = train_df.drop(columns='id').duplicated()
train_df.loc[is_repeat]

In [None]:
# Number of distinct anchors, followed by the five most frequent anchors.
# Both values are printed explicitly: a bare expression that is not the last
# line of a cell is evaluated but never displayed.
print(train_df['anchor'].nunique())
print(train_df['anchor'].value_counts().head())

In [None]:
# Rows of the 'component composite coating' anchor whose target mentions
# "base" (case-insensitive; missing targets count as no match).
pattern = 'base'
mask = train_df['target'].str.contains(pattern, case=False, na=False)

# Both conditions are evaluated on the full frame and combined, so the boolean
# key always has the same index as the frame it selects from. Applying the
# full-length mask to an already filtered frame only works through implicit
# reindexing and triggers a pandas UserWarning.
is_coating_anchor = train_df['anchor'] == 'component composite coating'
train_df[is_coating_anchor & mask]

In [None]:
# Gather every non-null anchor phrase; they are joined into one text for the cloud.
anchor_desc = train_df['anchor'].dropna().values
stopwords = set(STOPWORDS)
cloud_options = dict(
    width=800,
    height=800,
    background_color='white',
    min_font_size=10,
    stopwords=stopwords,
)
wordcloud = WordCloud(**cloud_options).generate(' '.join(anchor_desc))

# Draw the cloud on a square figure with the axes hidden and no padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Number of whitespace-separated words in each anchor phrase.
train_df['anchor_len'] = train_df['anchor'].str.split().str.len()

# NOTE(review): 5 is hard-coded as the longest anchor length; confirm with
# train_df['anchor_len'].max() if the data changes.
print(f"Anchors with maximum length of 5: \n{colored(train_df.query('anchor_len == 5')['anchor'].unique(), 'yellow')}")
# Length 4 is not the maximum, so the message no longer claims it is.
print(f"\nAnchors with length of 4: \n{colored(train_df.query('anchor_len == 4')['anchor'].unique(), 'green')}")

**GETTING VALUE COUNTS**

In [None]:
# Horizontal histogram of the per-anchor word counts.
train_df['anchor_len'].hist(color='#FFCF56', orientation='horizontal')


In [None]:
# Flag anchors containing at least one digit (missing anchors count as no
# match), store the flag as a column, and tally the flagged phrases.
pattern = '[0-9]'
mask = train_df['anchor'].str.contains(pattern, na=False)
train_df['num_anchor'] = mask
train_df.loc[mask, 'anchor'].value_counts()

**TARGET COLUMN**

In [None]:
# Number of distinct target phrases, printed in yellow.
n_unique_targets = train_df['target'].nunique()
print(colored(n_unique_targets, 'yellow'))

**Word cloud for target**

In [None]:
# Gather every non-null target phrase and merge them into a single text.
target_desc = train_df['target'].dropna().values
stopwords = set(STOPWORDS)
target_text = ' '.join(target_desc)

cloud_builder = WordCloud(
    width=800,
    height=800,
    background_color='white',
    min_font_size=10,
    stopwords=stopwords,
)
wordcloud = cloud_builder.generate(target_text)

# Draw the cloud on a square figure with the axes hidden and no padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Number of whitespace-separated words in each target, then how often each
# word count occurs.
target_words = train_df['target'].str.split()
train_df['target_len'] = target_words.str.len()
train_df['target_len'].value_counts()

In [None]:
# List the distinct targets for each of the four longest word counts.
# The misspelling "lenght" in the printed messages is corrected to "length".
# NOTE(review): 11 is hard-coded as the longest target length; confirm with
# train_df['target_len'].max() if the data changes.
print(f"Targets with maximum length of 11: \n{colored(train_df.query('target_len == 11')['target'].unique(), 'yellow')}")
print(f"\nTargets with length of 10: \n{colored(train_df.query('target_len == 10')['target'].unique(), 'green')}")
print(f"\nTargets with length of 9: \n{colored(train_df.query('target_len == 9')['target'].unique(), 'yellow')}")
print(f"\nTargets with length of 8: \n{colored(train_df.query('target_len == 8')['target'].unique(), 'green')}")

In [None]:
# Flag targets containing at least one digit (missing targets count as no
# match), store the flag as a column, and tally the flagged phrases.
pattern = '[0-9]'
mask = train_df['target'].str.contains(pattern, na=False)
train_df['num_target'] = mask
train_df.loc[mask, 'target'].value_counts()

**For Context column**

In [None]:
# Number of distinct context codes (message grammar fixed: "unique values").
print(f"Number of unique values in CONTEXT column: {colored(train_df.context.nunique(), 'yellow')}")


In [None]:
# Frequency of each context code, most common first.
train_df['context'].value_counts()

Source: https://en.wikipedia.org/wiki/Cooperative_Patent_Classification

The first letter is the "section symbol", a letter from "A" ("Human Necessities") to "H" ("Electricity"), or "Y" for emerging cross-sectional technologies. It is followed by a two-digit number that gives the "class symbol" (for example, "A01" represents "Agriculture; forestry; animal husbandry; trapping; fishing").

Sections:

- A: Human Necessities
- B: Operations and Transport
- C: Chemistry and Metallurgy
- D: Textiles
- E: Fixed Constructions
- F: Mechanical Engineering
- G: Physics
- H: Electricity
- Y: Emerging Cross-Sectional Technologies

Hierarchy:

- Section (one letter, A to H, or Y)
- Class (two digits)

***Separating section and class from context***

In [None]:
# Split each context code into its leading character (section) and the
# remaining characters (class), then preview the first ten rows.
context_codes = train_df['context'].astype(str)
train_df['section'] = context_codes.str[0]
train_df['classes'] = context_codes.str[1:]
train_df.head(n=10)

In [None]:
# Number of distinct sections and classes (message grammar fixed: "unique",
# and "CLASSES" to match the plural "SECTIONS").
print(f"Number of unique SECTIONS: {colored(train_df.section.nunique(), 'yellow')}")
print(f"Number of unique CLASSES: {colored(train_df.classes.nunique(), 'yellow')}")

In [None]:
# Display label for each section letter, used to relabel the `section` column
# when plotting. Every label has the form "<letter> - <title>"; the "F" entry
# previously lacked the space before the dash.
di = {
    "A": "A - Human Necessities",
    "B": "B - Operations and Transport",
    "C": "C - Chemistry and Metallurgy",
    "D": "D - Textiles",
    "E": "E - Fixed Constructions",
    "F": "F - Mechanical Engineering",
    "G": "G - Physics",
    "H": "H - Electricity",
    "Y": "Y - Emerging Cross-Sectional Technologies",
}

In [None]:
# Horizontal histogram of sections, with letters replaced by their labels from `di`.
section_labels = train_df['section'].replace(di)
section_labels.hist(color='#FFCF56', orientation='horizontal')

In [None]:
# The 15 most frequent class codes.
train_df['classes'].value_counts().iloc[:15]
