# Prepare Patent dataset to be used for PromCSE Training

In [None]:
import re
import os
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# List the contents of the mount point (IPython shell escape).
!ls /content/drive/

In [None]:
# Project root directory on the mounted drive.
PROJECT_DIR = "/content/drive/MyDrive/patent"

In [None]:
# Data directory: the CPC files and train.csv are read from here and the
# final CSV is written here.
DATA_DIR = PROJECT_DIR + '/data'

In [None]:
# Separator placed between the CPC section text and class text, and between
# the anchor and its context text.
BERT_SEP = " [SEP] "
#BERT_SEP = " . "
# Separator used when joining all targets of one (anchor, context) group.
TARGET_SEP = " ; "
# Field delimiter of the exported CSV file.
DF_SEP = "|"

# Join CPC text by their code

In [None]:
def _find_cpc_title(listing, code, source):
  """Return the text following ``code`` and two tab characters in ``listing``.

  Args:
    listing: Full text of a CPC title-list file.
    code: CPC code to look up (e.g. a section letter or a class code).
    source: Path of the file ``listing`` came from; only used in the error.

  Raises:
    IndexError: If ``listing`` contains no ``code<TAB><TAB>title`` entry.
  """
  # The first occurrence wins; the capture group is the title with the
  # "code<TAB><TAB>" prefix already removed.
  match = re.search(re.escape(code) + r'\t\t(.+)', listing)
  if match is None:
    raise IndexError(f'no title entry for CPC code {code!r} in {source}')
  return match.group(1)


def get_cpc_texts(data_dir=None, sep=None):
  """Build a lookup from CPC class code to lower-cased descriptive text.

  The class codes are collected from the file names found in
  ``cpc-data/CPCSchemeXML202205`` (every ``[A-Z]\\d+`` match in a name).
  For each section letter the matching title list
  ``cpc-data/CPCTitleList202205/cpc-section-<letter>_20220501.txt`` is read,
  and each class code of that section is mapped to
  ``<section title> + sep + <class title>``, both lower-cased.

  Args:
    data_dir: Directory containing ``cpc-data``. Defaults to the
      module-level ``DATA_DIR``.
    sep: Separator between section title and class title. Defaults to the
      module-level ``BERT_SEP``.

  Returns:
    dict mapping class code (str) to the combined lower-cased text.

  Raises:
    IndexError: If a section letter or class code has no entry in the title
      list of its section.
    OSError: If the scheme directory or a title-list file cannot be read.
  """
  if data_dir is None:
    data_dir = DATA_DIR
  if sep is None:
    sep = BERT_SEP

  # Raw string: '\d' in a plain literal is an invalid escape sequence.
  code_pattern = re.compile(r'[A-Z]\d+')
  scheme_dir = os.path.join(data_dir, 'cpc-data/CPCSchemeXML202205')
  found_codes = set()
  for file_name in os.listdir(scheme_dir):
    found_codes.update(code_pattern.findall(file_name))
  contexts = sorted(found_codes)

  results = {}
  for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
    title_path = os.path.join(
        data_dir, f'cpc-data/CPCTitleList202205/cpc-section-{cpc}_20220501.txt')
    with open(title_path) as f:
      listing = f.read()
    # The section title is looked up even when the section has no class codes,
    # so a malformed title list is still reported.
    section_text = _find_cpc_title(listing, cpc, title_path).lower()
    for context in contexts:
      if context[0] != cpc:
        continue
      class_text = _find_cpc_title(listing, context, title_path).lower()
      results[context] = section_text + sep + class_text
  return results

In [None]:
# Build the CPC code -> text lookup once; it is used further down to fill the
# context_text column.
cpc_texts = get_cpc_texts()

# Patent Data

In [None]:
# Load the original training data.
df = pd.read_csv(DATA_DIR + "/patent_orig/train.csv")

In [None]:
# Quick inspection: dimensions, column names, dtypes / non-null counts and
# the first rows.
df.shape

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df.head(10)

# Positive targets have a score of 1, 0.75 or 0.5; negative targets have a score of 0 or 0.25

In [None]:
# Rows scoring 0.5 or higher are treated as positive pairs.
df_pos = df[df.score >= 0.5 ]
df_pos.shape

In [None]:
# Rows scoring below 0.5 are treated as negative pairs.
df_neg = df[df.score < 0.5 ]
df_neg.shape

# Join positive targets. Create a new row holding the joined positive targets

In [None]:
# Peek at the positive rows before aggregation.
df_pos.head(20)

In [None]:
# Collect all positive targets of each (anchor, context) pair into one list.
agg_pos = df_pos.groupby(['anchor', 'context'])['target'].apply(list).reset_index(name = 'target')


In [None]:
# Flatten each list into a single TARGET_SEP-delimited string.
agg_pos['target'] =  agg_pos['target'].apply(lambda l: TARGET_SEP.join(l))

In [None]:
# Placeholder score for the aggregated rows (the score column is dropped
# further down).
agg_pos['score']=0.50

In [None]:
agg_pos.head(10)

In [None]:
# Append the aggregated rows to the individual positive rows.
df_pos = pd.concat([df_pos, agg_pos])

In [None]:
df_pos.shape

In [None]:
# Spot-check a single anchor/context pair.
df_pos.loc[(df_pos['context']=='A61') & (df_pos['anchor']=='abatement')].head(20)

In [None]:
# Copy the positive target into its final output column. The name follows
# the sibling 'sent0' and 'hard_neg' columns; the previous spelling
# 'sen t1' contained a stray space that would end up in the CSV header.
df_pos['sent1'] = df_pos['target']

In [None]:
# Remove the columns that are not needed for the training pairs.
df_pos.drop(['id', 'target', 'score'], axis=1, inplace=True)

In [None]:
# Remove rows that became identical after the columns above were dropped.
df_pos.drop_duplicates(inplace=True)

# Join negative targets. Add them as hard negatives for that context and anchor.

In [None]:
# Peek at the negative rows before aggregation.
df_neg.head()

In [None]:
# One row per (anchor, context) pair, holding the list of all its negative
# targets.
agg_neg = (
    df_neg
    .groupby(['anchor', 'context'])['target']
    .apply(list)
    .reset_index(name='target')
)
# Flatten every list into a single TARGET_SEP-delimited string.
agg_neg['target'] = agg_neg['target'].map(lambda targets: TARGET_SEP.join(targets))
# Placeholder score for the aggregated rows.
agg_neg['score'] = 0.00

In [None]:
agg_neg.head(10)

In [None]:
# Append the aggregated rows to the individual negative rows.
df_neg = pd.concat([df_neg, agg_neg])

In [None]:
df_neg.shape

In [None]:
# Spot-check a single anchor/context pair.
df_neg.loc[(df_neg['context']=='A61') & (df_neg['anchor']=='abatement')].head(20)

In [None]:
# Copy the negative target into its final output column.
df_neg['hard_neg'] = df_neg['target']

In [None]:
# Remove the columns that are not needed for the training triplets.
df_neg.drop(['id', 'target', 'score'], axis=1, inplace=True)

In [None]:
# Remove rows that became identical after the columns above were dropped.
df_neg.drop_duplicates(inplace=True)

In [None]:
# Inspect both frames before they are combined.
df_pos.head()

In [None]:
df_pos.shape

In [None]:
df_neg.head()

In [None]:
df_neg.shape

# Negative samples are negative only for their own context; they may be positive for other contexts

In [None]:
# Constant helper column, used as an additional join key in the merge below
# (together with anchor and context).
df_pos['key'] = 1
df_neg['key'] = 1

In [None]:
# Pair every positive row with every hard-negative row that shares the same
# anchor and context. pd.merge defaults to an inner join, so pairs that have
# only positives or only negatives are left out.
# The helper column is removed via `columns=`: passing the axis positionally
# (`drop("key", 1)`) is rejected by pandas 2.0 and newer.
df_all = pd.merge(df_pos, df_neg, on=['key', 'anchor', 'context']).drop(columns="key")

In [None]:
# Inspect the combined positive / hard-negative frame.
df_all.shape

In [None]:
df_all.head(20)

# Augment with CPC data

In [None]:
# Look up the CPC text of every context code; codes missing from cpc_texts
# end up as NaN.
df_all['context_text'] = df_all['context'].map(cpc_texts)

# SimCSE Format

In [None]:
# sent0 is the anchor followed by the CPC text of its context.
df_all['sent0'] = df_all['anchor'] + BERT_SEP + df_all['context_text']

In [None]:
df_all.head()

In [None]:
# Keep only the text columns that make up a training example.
df_all.drop(['anchor', 'context', 'context_text' ], axis=1, inplace=True)

In [None]:
df_all.shape

In [None]:
# Write the result with DF_SEP as delimiter and without the index column.
# NOTE(review): the columns are written in the order positive target,
# hard_neg, sent0 -- confirm the training script selects columns by name
# rather than by position.
df_all.to_csv(DATA_DIR + '/patent_train_simcse.csv', sep=DF_SEP, index=False)