# Exploratory data analysis

In [None]:
import re
import os
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive so files stored on Drive can be read
# through the normal filesystem from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# IPython shell escape: list the contents of /content/drive.
!ls /content/drive/

In [None]:
# Root folder of this project inside the mounted Google Drive.
PROJECT_DIR = "/content/drive/MyDrive/patent"

In [None]:
# All datasets read by this notebook live in the "data" folder of the project.
DATA_DIR = f"{PROJECT_DIR}/data"

In [None]:
# Separator constants. Neither is used in the cells visible here.
# NOTE(review): FILE_SEP is presumably a field delimiter for files written or
# read elsewhere — confirm against the code that uses it.
FILE_SEP = "|"
# NOTE(review): looks like the BERT "[SEP]" token padded with spaces, presumably
# for joining text segments into one model input — confirm against its users.
BERT_SEP = " [SEP] "
# Alternative separator, currently disabled:
#BERT_SEP = " . "

# Patent Data

In [None]:
# Load the original patent training CSV into a DataFrame.
train_csv_path = DATA_DIR + "/patent_orig/train.csv"
patent_df = pd.read_csv(train_csv_path)

In [None]:
# Dimensions of the training frame as (rows, columns).
patent_df.shape

In [None]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns, which describe() would otherwise leave out.
patent_df.describe(include='all')

In [None]:
# Column labels of the training frame.
patent_df.columns

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
patent_df.info()

In [None]:
# Preview the first 10 rows.
patent_df.head(10)

In [None]:
# Number of whitespace-separated words in each anchor phrase:
# split every anchor into a list of words, then take the list length.
anchor_words = patent_df['anchor'].str.split()
patent_df['anchor_len'] = anchor_words.str.len()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Distribution of anchor length (words per anchor)

In [None]:
# Bar chart of how many rows have each anchor word count.
# The column is passed as `x=` on purpose: seaborn 0.11 warns when it is given
# positionally, and seaborn >= 0.12 treats the first positional argument as
# `data`, so the Series would no longer be read as the x variable.
ax = sns.countplot(x=patent_df['anchor_len'])

In [None]:
# Store the score column with pandas' categorical dtype instead of its
# original dtype.
score_as_category = patent_df["score"].astype("category")
patent_df["score"] = score_as_category

# Distribution of target variable

In [None]:
# Bar chart of the number of rows per score value.
# `x=` is required: seaborn >= 0.12 accepts only `data` positionally, so a
# positional Series is no longer interpreted as the x variable (0.11 warns).
ax = sns.countplot(x=patent_df['score'])


# Histogram of Context

In [None]:
# Bar chart of the number of rows per context code, most frequent first.
# A wide figure and vertical tick labels leave room for the x-axis labels.
plt.figure(figsize=(30, 5))
# value_counts() sorts by descending frequency, which fixes the bar order.
context_order = patent_df['context'].value_counts().index
# `x=` is required: seaborn >= 0.12 accepts only `data` positionally, so a
# positional Series is no longer interpreted as the x variable (0.11 warns).
ax = sns.countplot(x=patent_df['context'], order=context_order)
ax.tick_params(axis='x', rotation=90)

# CPC Data

In [None]:
# Accumulator for the CPC codes extracted from the scheme file names.
contexts = []
# One capital letter followed by one or more digits. Written as a raw string
# so that `\d` reaches the regex engine as-is; in a normal string literal it is
# an invalid escape sequence (DeprecationWarning, SyntaxWarning from 3.12).
pattern = r'[A-Z]\d+'
# Number of entries in the CPC scheme XML folder.
dir_list = os.listdir(os.path.join(DATA_DIR, 'cpc-data/CPCSchemeXML202205'))
print(len(dir_list))


In [None]:
# Collect the distinct codes of the form "capital letter + digits" that occur
# in the CPC scheme XML file names, sorted alphabetically.
#
# The regex is compiled here so the cell does not depend on the mutable global
# `pattern`, and `contexts` is rebuilt from scratch on every run. Previously a
# second run appended match lists onto the already-flattened list of strings
# and `sum(contexts, [])` raised TypeError; the set comprehension also avoids
# the quadratic list concatenation that `sum(..., [])` performs.
scheme_dir = os.path.join(DATA_DIR, 'cpc-data/CPCSchemeXML202205')
context_pattern = re.compile(r'[A-Z]\d+')
contexts = sorted({
  code
  for file_name in os.listdir(scheme_dir)
  for code in context_pattern.findall(file_name)
})

In [None]:
# Number of distinct codes collected from the scheme file names.
len(contexts)

In [None]:

def _cpc_title(symbol, text):
  """Return the title that follows `symbol` and two tab characters in `text`.

  The first occurrence of "<symbol><TAB><TAB><title>" is used, where <title>
  runs to the end of that line. The lookbehind rejects occurrences in which
  `symbol` is only the tail of a longer code (for example the final "B" of
  "B01B"), so the result does not depend on the order of the lines.

  Raises:
    LookupError: if `text` has no entry for `symbol`.
  """
  match = re.search(rf'(?<![A-Z0-9]){re.escape(symbol)}\t\t(.+)', text)
  if match is None:
    raise LookupError(f'no title entry found for CPC symbol {symbol!r}')
  return match.group(1)


# Map every code in `contexts` to "<section title>, <code title>", lower-cased.
# Titles come from one title-list text file per CPC section letter.
results = {}
sep = ", "

for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
  title_file = os.path.join(
      DATA_DIR, f'cpc-data/CPCTitleList202205/cpc-section-{cpc}_20220501.txt')
  with open(title_file) as f:
    titles_text = f.read()
  # The section title is shared by every code of this section: look it up once.
  section_title = _cpc_title(cpc, titles_text).lower()
  for context in contexts:
    # A code belongs to the section named by its first letter.
    if context[0] != cpc:
      continue
    results[context] = section_title + sep + _cpc_title(context, titles_text).lower()

In [None]:
# Number of codes for which a title string was built.
len(results)

# SNLI Dataset

In [None]:
# Load the NLI CSV (nli_for_simcse.csv) from the data folder into a DataFrame.
nli_csv_path = DATA_DIR + "/nli_for_simcse.csv"
snli_df = pd.read_csv(nli_csv_path)

In [None]:
# Dimensions of the NLI frame as (rows, columns).
snli_df.shape

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
snli_df.info()

In [None]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns, which describe() would otherwise leave out.
snli_df.describe(include='all')

In [None]:
# Preview the first rows (head() defaults to 5).
snli_df.head()