# Prepare Patent dataset to be used for Evaluation

In [None]:
import re
import os
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on the Drive can be reached through ordinary file-system paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# List the contents of the mount point (presumably a sanity check that the
# Drive mount above worked).
!ls /content/drive/

In [None]:
# Root folder of the project inside the mounted Google Drive.
PROJECT_DIR = "/content/drive/MyDrive/patent"

In [None]:
# All input and output data files of this notebook live under <project>/data.
DATA_DIR = f"{PROJECT_DIR}/data"

In [None]:
# Column delimiter used when the prepared CSV files are written.
FILE_SEP = "|"
# Text inserted between joined segments (section title / class title, and
# anchor / CPC description); presumably the separator token of a BERT-style
# tokenizer -- confirm against the model that consumes these files.
BERT_SEP = " [SEP] "
# Alternative plain-text separator, kept for reference:
#BERT_SEP = " . "

# CPC Data

In [None]:
def get_cpc_texts(data_dir=None, sep=None):
  """Build a lookup from CPC class codes to lower-cased descriptive text.

  Class codes (one capital letter followed by digits, e.g. ``A01``) are
  harvested from the file names found in ``cpc-data/CPCSchemeXML202205``.
  Titles are read from
  ``cpc-data/CPCTitleList202205/cpc-section-<X>_20220501.txt``, in which a
  title line has the form ``<code><TAB><TAB><title>``.

  Args:
    data_dir: Directory that contains ``cpc-data``. Defaults to the
      notebook-level ``DATA_DIR``.
    sep: Text placed between the section title and the class title.
      Defaults to the notebook-level ``BERT_SEP``.

  Returns:
    A dict mapping every harvested class code whose first letter is one of
    the sections A-H or Y to ``<section title> + sep + <class title>``,
    with both titles lower-cased.

  Raises:
    IndexError: If a section or class code has no title line in its file.
    OSError: If the scheme directory or a title file cannot be read.
  """
  if data_dir is None:
    data_dir = DATA_DIR
  if sep is None:
    sep = BERT_SEP

  def find_title(code, text):
    # First "<code>\t\t<title>" occurrence in the file; re.escape keeps the
    # code literal inside the regular expression.
    match = re.search(re.escape(code) + r'\t\t(.+)', text)
    if match is None:
      # Same exception type as the former bare ``result[0]`` lookup, but
      # with a message that names the offending code.
      raise IndexError(f'no title line found for CPC code {code!r}')
    return match.group(1)

  # Raw string: '\d' in a normal string literal is an invalid escape sequence.
  code_pattern = re.compile(r'[A-Z]\d+')
  scheme_dir = os.path.join(data_dir, 'cpc-data/CPCSchemeXML202205')
  # A set comprehension de-duplicates codes that occur in several file names.
  contexts = sorted({
      code
      for file_name in os.listdir(scheme_dir)
      for code in code_pattern.findall(file_name)
  })

  results = {}
  for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
    title_path = os.path.join(
        data_dir, f'cpc-data/CPCTitleList202205/cpc-section-{cpc}_20220501.txt')
    with open(title_path, encoding='utf-8') as f:
      s = f.read()
    # The section title is shared by every class code of this section.
    section_title = find_title(cpc, s).lower()
    for context in [c for c in contexts if c[0] == cpc]:
      results[context] = section_title + sep + find_title(context, s).lower()
  return results

In [None]:
# Build the CPC code -> description lookup once; it is used below to augment
# both the training frame and the test frame.
cpc_texts = get_cpc_texts()

# Patent Train Data

In [None]:
# Load the original training set from <DATA_DIR>/patent_orig.
df = pd.read_csv(f"{DATA_DIR}/patent_orig/train.csv")

In [None]:
# (rows, columns) of the raw training frame.
df.shape

In [None]:
# Column names of the raw training frame.
df.columns

In [None]:
# Per-column dtype and non-null counts of the raw training frame.
df.info()

In [None]:
# Preview the first 10 rows.
df.head(10)

# Augment with CPC data

In [None]:
# Look up each row's CPC context code in cpc_texts; codes that have no entry
# in the dict become NaN (Series.map semantics for dict arguments).
df['context_text'] = df['context'].map(cpc_texts)

In [None]:
# Append the CPC description to the anchor phrase, joined by BERT_SEP.
# The original 'anchor' column is overwritten in place.
df['anchor'] = df['anchor'] + BERT_SEP + df['context_text']

In [None]:
# Preview the augmented frame (anchor now carries the CPC description).
df.head()

In [None]:
# Shape after augmentation: one extra column ('context_text') was added.
df.shape

# Dataset for Evaluation

In [None]:
def train_test_split(df, train_frac=.7, data_dir=None, file_sep=None):
  """Split ``df`` by row order into train/validation parts and save both.

  The first ``int(len(df) * train_frac)`` rows become the training part and
  all remaining rows become the validation part; rows are NOT shuffled.
  The parts are written to ``patent_train.csv`` and
  ``patent_validation.csv`` inside ``data_dir`` without the index column.
  The shape of each part is printed.

  Args:
    df: DataFrame to split.
    train_frac: Fraction of rows (0..1) assigned to the training part.
    data_dir: Output directory. Defaults to the notebook-level ``DATA_DIR``.
    file_sep: CSV delimiter. Defaults to the notebook-level ``FILE_SEP``.

  Returns:
    Tuple ``(train_df, test_df)`` holding the two parts of ``df``.

  Raises:
    ValueError: If ``train_frac`` is outside the range [0, 1].
  """
  if not 0 <= train_frac <= 1:
    raise ValueError(f'train_frac must be within [0, 1], got {train_frac!r}')
  if data_dir is None:
    data_dir = DATA_DIR
  if file_sep is None:
    file_sep = FILE_SEP

  # Single split index so both parts are guaranteed to be complementary.
  split_at = int(df.shape[0] * train_frac)

  train_df = df.iloc[:split_at]
  print(train_df.shape)

  test_df = df.iloc[split_at:]
  print(test_df.shape)

  train_df.to_csv(os.path.join(data_dir, 'patent_train.csv'),
                  sep=file_sep, index=False)
  test_df.to_csv(os.path.join(data_dir, 'patent_validation.csv'),
                 sep=file_sep, index=False)

  return train_df, test_df

In [None]:
# Write the train/validation CSV files and keep both parts in memory.
# Note: test_df is reassigned further down when the test set is loaded.
train_df, test_df =  train_test_split(df)

# Test Data

In [None]:
# Load the original test set from <DATA_DIR>/patent_orig; this rebinds
# test_df, replacing the validation part returned by train_test_split.
test_df = pd.read_csv(f"{DATA_DIR}/patent_orig/test.csv")

In [None]:
# Same CPC lookup as for the training frame; codes missing from cpc_texts
# become NaN.
test_df['context_text'] = test_df['context'].map(cpc_texts)

In [None]:
# Append the CPC description to the test anchors, joined by BERT_SEP, so the
# test set gets the same anchor format as the training frame.
test_df['anchor'] = test_df['anchor'] + BERT_SEP + test_df['context_text']

In [None]:
# Persist the augmented test set, FILE_SEP-delimited and without the index.
test_df.to_csv(f"{DATA_DIR}/patent_test.csv", sep=FILE_SEP, index=False)