<a href="https://colab.research.google.com/github/sjpark0605/NLP-FYP/blob/main/Entity_Marker_Flow_Recipe_Data_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install datasets

In [None]:
# Imports for Data Processing
import glob
import csv
import numpy as np
import pandas as pd
import pickle

from datasets import Dataset, ClassLabel, Sequence, DatasetDict

In [None]:
# Base folder that holds the corpus folders and receives the generated outputs
# (relation-set pickle and saved datasets).
PROJECT_DIR = '/content/drive/MyDrive/COMP0029/'
# Seed for the seeded random steps of this notebook (e.g. dataset.shuffle).
SEED = 2023

In [None]:
# CHANGE THIS CONSTANT TO EITHER r-100, r-200, or r-300
# ('r-300' loads the files of both the r-100 and r-200 folders).
TARGET_CORPUS = 'r-300'
# Fraction of the 'non-edge' rows that undersample() removes.
UNDERSAMPLE_FACTOR = 0.9

In [None]:
# Running totals: candidate pairs kept / rejected by generate_pairs(), and flow
# lines read by construct_label_dict().
GLOBAL_INCLUDED_COUNT, GLOBAL_REJECTED_COUNT, GLOBAL_EDGE_COUNT = 0, 0, 0

In [None]:
# Collect the NER annotation files (*.list) and flow annotation files (*.flow)
# of the selected corpus. 'r-300' is the union of the r-100 and r-200 folders.
NER_FILES, FLOW_FILES = [], []

if TARGET_CORPUS in ('r-100', 'r-200'):
  corpus_dirs = [TARGET_CORPUS]
elif TARGET_CORPUS == 'r-300':
  corpus_dirs = ['r-100', 'r-200']
else:
  raise Exception("Could not recognize target corpus")

for corpus in corpus_dirs:
  NER_FILES += glob.glob(PROJECT_DIR + corpus + '/*.list')
  FLOW_FILES += glob.glob(PROJECT_DIR + corpus + '/*.flow')

# Both lists are sorted by path so that the i-th .list file lines up with the
# i-th .flow file when they are zipped together further down.
NER_FILES.sort()
FLOW_FILES.sort()

# Fail early: zip() would silently truncate (and mis-pair) unequal lists, and an
# empty glob (e.g. Drive not mounted) would only surface as an obscure error later.
if not NER_FILES:
  raise Exception("No .list files found for " + TARGET_CORPUS + " under " + PROJECT_DIR)

if len(NER_FILES) != len(FLOW_FILES):
  raise Exception(
      "Mismatched corpus files: " + str(len(NER_FILES)) + " .list vs "
      + str(len(FLOW_FILES)) + " .flow"
  )

In [None]:
def encode_key(decoded_key):
  """Join a three-part position into a single 'a;b;c' string key.

  Raises:
    Exception: if `decoded_key` does not have exactly three elements.
  """
  if len(decoded_key) != 3:
    raise Exception("Malformed Key During Encoding: Length != 3")

  # Each component is passed through str(), so ints and numeric strings yield the same key.
  return ';'.join(str(decoded_key[i]) for i in range(3))

def decode_key(encoded_key):
  """Split an 'a;b;c' string key back into a tuple of three ints.

  Raises:
    Exception: if the key does not consist of exactly three ';'-separated parts.
    ValueError: if a part is not an integer literal (raised by int()).
  """
  parts = encoded_key.split(';')

  if len(parts) != 3:
    raise Exception("Malformed Key During Decoding: Length != 3")

  return tuple(int(part) for part in parts)

In [None]:
def construct_recipe_dict(ner_lines, remove_iob = False):
  """Index the tokens of one NER file by their encoded position key.

  Each line is split on single spaces: fields 0-2 form the position key,
  field 3 is the word and field 5 is the NER tag.

  Args:
    ner_lines: lines read from a .list file.
    remove_iob: when True, strip the "-B" / "-I" markers from every tag.

  Returns:
    (word_dict, ner_dict): two dicts sharing the same keys, mapping the
    encoded position to the word and to the NER tag respectively.
  """
  word_dict = {}
  ner_dict = {}

  for line in ner_lines:
    items = line.strip().split(" ")

    key, word, ner_tag = encode_key(items[:3]), items[3], items[5]

    if remove_iob:
      # Both markers must go; the previous code replaced "-B" twice and left
      # every "-I" tag untouched.
      ner_tag = ner_tag.replace("-B", "").replace("-I", "")

    word_dict[key] = word
    ner_dict[key] = ner_tag

  return word_dict, ner_dict

In [None]:
def construct_label_dict(flow_lines):
  """Map ordered node pairs to their direction-tagged flow label.

  Each line is split on single spaces: fields 0-2 are the source position,
  field 3 the label and fields 4-6 the destination position. The labels "v"
  and "s" are rewritten to "v-tm" and "d". The dict key always lists the
  smaller position first; the label gets ":LR" when the source is the smaller
  one and ":RL" otherwise.

  Side effect: GLOBAL_EDGE_COUNT is incremented once per line.
  """
  global GLOBAL_EDGE_COUNT

  label_aliases = {"v": "v-tm", "s": "d"}
  edges = {}

  for raw_line in flow_lines:
    GLOBAL_EDGE_COUNT += 1
    fields = raw_line.strip().split(" ")

    src = tuple(int(fields[i]) for i in (0, 1, 2))
    dst = tuple(int(fields[i]) for i in (4, 5, 6))

    label = label_aliases.get(fields[3], fields[3])

    # Tuples compare lexicographically, so this orders the two positions.
    if src < dst:
      edges[(src, dst)] = label + ":LR"
    else:
      edges[(dst, src)] = label + ":RL"

  return edges

In [None]:
# Every "<source NER tag>-><destination NER tag>" combination that appears on an
# annotated flow line; generate_pairs() uses it to filter candidate pairs.
RELATION_SET = set()

for ner_file, flow_file in zip(NER_FILES, FLOW_FILES):
  # Context managers close both files even when a later step raises.
  with open(ner_file, "r", encoding="utf-8") as ner_data, \
       open(flow_file, "r", encoding="utf-8") as flow_data:
    ner_lines, flow_lines = ner_data.readlines(), flow_data.readlines()

  _, ner_dict = construct_recipe_dict(ner_lines)

  for line in flow_lines:
    items = line.strip().split(" ")
    # Fields 0-2 are the source position, fields 4-6 the destination position.
    source_key, dest_key = encode_key(items[:3]), encode_key(items[4:])

    relation = ner_dict[source_key] + "->" + ner_dict[dest_key]

    RELATION_SET.add(relation)

In [None]:
# Persist the relation set next to the corpora (presumably for reuse by other
# notebooks — confirm against its consumers).
with open(PROJECT_DIR + TARGET_CORPUS + "-relation_set.pickle", "wb") as relation_set_file:
    pickle.dump(RELATION_SET, relation_set_file)

In [None]:
def generate_pairs(ner_lines, ner_dict):
  """List the candidate entity pairs of one recipe.

  A token is a candidate when its tag (field 5) is neither "O" nor contains
  "-I". Every pair of candidates, in file order, is kept when the combination
  of their NER tags (in either direction) is present in RELATION_SET.

  Side effects: GLOBAL_INCLUDED_COUNT / GLOBAL_REJECTED_COUNT are incremented
  for each kept / discarded pair.

  Returns:
    A list of (position, position) tuples, each position being a 3-int tuple.
  """
  global GLOBAL_INCLUDED_COUNT
  global GLOBAL_REJECTED_COUNT

  entity_starts = []
  for raw_line in ner_lines:
    fields = raw_line.split(" ")
    # The line is not stripped, so the tag may still carry the newline.
    tag = fields[5].replace("\n", "")

    if tag == "O" or "-I" in tag:
      continue

    entity_starts.append((int(fields[0]), int(fields[1]), int(fields[2])))

  accepted = []
  for i, first in enumerate(entity_starts):
    for second in entity_starts[i + 1:]:
      first_tag = ner_dict[encode_key(first)]
      second_tag = ner_dict[encode_key(second)]

      forward = first_tag + "->" + second_tag
      backward = second_tag + "->" + first_tag

      if forward in RELATION_SET or backward in RELATION_SET:
        accepted.append((first, second))
        GLOBAL_INCLUDED_COUNT += 1
      else:
        GLOBAL_REJECTED_COUNT += 1

  return accepted

In [None]:
def construct_sentence(ner_lines, ner_dict, position, target_word_positions, typed=False):
  """Rebuild one sentence with entity markers around the target words.

  Only tokens whose first two position fields equal `position[0]` and
  `position[1]` are used (presumably these identify one sentence — confirm
  against the corpus format). Words are joined with single spaces. A token
  whose full position equals the N-th entry (1-based) of
  `target_word_positions` is prefixed with "<eN> "; the matching " </eN>" is
  emitted before the first following token whose tag is not the "-I" form of
  the opening tag, or at the end of the sentence.

  `typed` is accepted but not used.
  """
  target_sentence = (position[0], position[1])
  fragments = []

  open_marker = -1
  open_tag = None

  for line in ner_lines:
    fields = line.split(" ")
    token_pos = (int(fields[0]), int(fields[1]), int(fields[2]))

    if token_pos[:2] != target_sentence:
      continue

    if open_marker != -1:
      # The entity continues only while tokens carry the "-I" variant of its tag.
      continuation_tag = open_tag.replace('-B', '-I')
      if continuation_tag != ner_dict[encode_key(token_pos)]:
        fragments.append(' </e' + str(open_marker) + '>')
        open_marker, open_tag = -1, None

    # Every processed token appends at least one fragment, so a non-empty list
    # means this is not the first word of the sentence.
    if fragments:
      fragments.append(" ")

    token = fields[3]
    matched = False

    for marker_no, target in enumerate(target_word_positions, start=1):
      if target == token_pos:
        open_marker, open_tag = marker_no, ner_dict[encode_key(target)]
        fragments.append('<e' + str(open_marker) + '> ' + token)
        matched = True

    if not matched:
      fragments.append(token)

  if open_marker != -1:
    fragments.append(' </e' + str(open_marker) + '>')

  return "".join(fragments)

In [None]:
def check_sentence_sanity(sentence):
  """Validate the entity markers of a generated sentence.

  Raises:
    Exception: if an opening marker (<e1> or <e2>) has no corresponding
      closing marker anywhere in the sentence, or if neither opening marker
      is present.
  """
  marker_pairs = (('<e1>', '</e1>'), ('<e2>', '</e2>'))
  found_any = False

  for opener, closer in marker_pairs:
    if opener not in sentence:
      continue

    found_any = True
    if closer not in sentence:
      raise Exception(closer + " is missing!")

  if not found_any:
    raise Exception("no tags found!")

In [None]:
def construct_data():
  """Build the entity-marked sentence pairs and labels for all corpus files.

  For every (.list, .flow) file pair and every candidate pair returned by
  generate_pairs(), two marked sentences are built: the one containing the
  first word and the one containing the second word. When both are the same
  string, the second sentence is stored as None. The label is looked up in the
  flow annotations and defaults to 'non-edge'.

  Returns:
    (first_sentences, second_sentences, labels): three parallel lists.

  Raises:
    Exception: if the word and NER dictionaries disagree on their keys, or if
      a generated sentence fails check_sentence_sanity().
  """
  first_sentences, second_sentences, labels = [], [], []

  for ner_file, flow_file in zip(NER_FILES, FLOW_FILES):
    # Read everything up front inside a context manager so the handles are
    # released even when one of the checks below raises.
    with open(ner_file, "r", encoding="utf-8") as ner_data, \
         open(flow_file, "r", encoding="utf-8") as flow_data:
      ner_lines, flow_lines = ner_data.readlines(), flow_data.readlines()

    word_dict, ner_dict = construct_recipe_dict(ner_lines)
    label_dict = construct_label_dict(flow_lines)

    if word_dict.keys() != ner_dict.keys():
      raise Exception("Malformed Word and NER Dictionary")

    word_pairs = generate_pairs(ner_lines, ner_dict)

    for word_pair in word_pairs:
      sentence1 = construct_sentence(ner_lines, ner_dict, position=word_pair[0], target_word_positions=word_pair)
      sentence2 = construct_sentence(ner_lines, ner_dict, position=word_pair[1], target_word_positions=word_pair)

      check_sentence_sanity(sentence1)
      check_sentence_sanity(sentence2)

      first_sentences.append(sentence1)
      # Identical strings mean a single sentence already carries both markers.
      second_sentences.append(None if sentence1 == sentence2 else sentence2)

      labels.append(label_dict.get((word_pair[0], word_pair[1]), 'non-edge'))

  return first_sentences, second_sentences, labels

In [None]:
# Build the sentence pairs and labels for every file of the selected corpus.
first_sentences, second_sentences, labels = construct_data()

In [None]:
# Turn the three parallel lists into columns of a single table.
np_first_sentences = np.array(first_sentences)
np_second_sentences = np.array(second_sentences)
np_labels = np.array(labels)

data_matrix = np.column_stack(
    (np_first_sentences, np_second_sentences, np_labels)
)

df = pd.DataFrame(
    data_matrix,
    columns=['First Sentence', 'Second Sentence', 'Label'],
)

In [None]:
if TARGET_CORPUS != 'r-300':
  # Duplicate a 't-eq:RL' occurrence since there is only one occurrence in the r-100 and r-200 datasets
  # (presumably so the stratified split below has a row for each side — confirm).
  row = df.loc[df['Label'] == 't-eq:RL']
  # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
  df = pd.concat([df, row], ignore_index=True)

In [None]:
def undersample(df, undersample_factor):
  """Randomly drop a share of the 'non-edge' rows.

  Args:
    df: DataFrame with a 'Label' column.
    undersample_factor: fraction of the 'non-edge' rows to remove; the number
      of removed rows is truncated to an int.

  Returns:
    A new DataFrame without the dropped rows and with a fresh 0..n-1 index.
    Rows are chosen with NumPy's global random generator.
  """
  non_edge_rows = df.index[df['Label'] == 'non-edge']
  n_to_drop = int(len(non_edge_rows) * undersample_factor)
  dropped = np.random.choice(non_edge_rows, size=n_to_drop, replace=False)

  return df.drop(dropped).reset_index(drop=True)

In [None]:
# undersample() draws from NumPy's global RNG; seed it so the rows that get
# dropped (and therefore the saved dataset) are the same on every fresh run.
np.random.seed(SEED)
df = undersample(df, UNDERSAMPLE_FACTOR)

# Share of 'non-edge' rows that remain after undersampling.
df['Label'].value_counts(normalize=True)['non-edge']

0.7416943521594684

In [None]:
# Distinct label values currently present in df.
edge_label_list = df['Label'].unique()

In [None]:
# Wrap the DataFrame in a `datasets.Dataset` and turn the 'Label' column into a class-label feature.
dataset = Dataset.from_pandas(df)
ClassLabels = ClassLabel(num_classes=len(edge_label_list), names=list(edge_label_list))
# NOTE(review): in the `datasets` API the second positional parameter of
# class_encode_column appears to be `include_nulls`, not a ClassLabel, so the
# `ClassLabels` object above is probably not what defines the label ids here —
# confirm against the installed version (cast_column would apply it explicitly).
dataset = dataset.class_encode_column("Label", ClassLabels)

Casting to class labels:   0%|          | 0/61404 [00:00<?, ? examples/s]

non-edge        45543
t:RL             4258
t:LR             1859
o:RL             1516
o:LR             1260
d:RL             1125
f-eq:LR          1106
d:LR              951
f-part-of:LR      642
a:LR              635
v-tm:RL           579
t-comp:RL         548
t-eq:LR           314
f-comp:RL         272
a-eq:LR           223
t-part-of:LR      141
t-comp:LR         102
f-part-of:RL       95
v-tm:LR            61
t-part-of:RL       49
a:RL               39
f-eq:RL            33
a-eq:RL            14
f-comp:LR          14
f-set:LR           14
f-set:RL            9
t-eq:RL             2
Name: Label, dtype: int64

In [None]:
# Shuffle, then hold out 20% of the rows as the validation split, stratified on the label.
dataset = dataset.shuffle(seed=SEED)
# train_test_split shuffles on its own by default, so it needs the seed as well
# for the train/valid membership to be reproducible.
split_dataset = dataset.train_test_split(test_size=0.2, stratify_by_column="Label", seed=SEED)

# The held-out "test" part of the split is stored under the name "valid".
corpus_datasets = DatasetDict({
    "train": split_dataset["train"],
    "valid": split_dataset["test"],
})

corpus_datasets.save_to_disk(PROJECT_DIR + 'datasets/' + TARGET_CORPUS + '-entity-marked-flow')

Flattening the indices:   0%|          | 0/49123 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/49123 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/12281 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12281 [00:00<?, ? examples/s]

In [None]:
# Sanity check: the number of edge rows in df may exceed the number of flow
# lines read by at most one.
# NOTE(review): the comparison is one-sided, so it also passes when df holds
# far fewer edges than GLOBAL_EDGE_COUNT — confirm that this is intended.
label_counts = df['Label'].value_counts()
label_counts.sum() - label_counts['non-edge'] - GLOBAL_EDGE_COUNT <= 1

True