In [None]:
import csv
import pandas as pd

# Data preprocessing (if needed)

In [None]:
string_name_data = 'gpt2_without_finetune_first_sentence' # change this

start = 300 # change this
end = 399 # change this

# Load the two-column TSV (no header row): column A holds "<event>  <relation>  [GEN]",
# column B holds the text paired with it.
# Both columns are free text, so they are read as plain strings with pandas' default
# NA parsing switched off. Otherwise an empty cell, or a cell whose text happens to be
# "NA", "null", "None", ..., is silently converted to a float NaN.
df = pd.read_csv(
    f'{string_name_data}.tsv',
    sep='\t',
    names=['A', 'B'],
    dtype=str,
    keep_default_na=False,
)

# Keep only the rows at positions start..end (inclusive). The original row numbers are
# preserved as the index. If we want to divide the work of one output, adjust start/end.
df = df.iloc[start:end + 1]

df

Unnamed: 0,A,B
300,PersonX has some problems xEffect [GEN],xLabel [CHANGED] xLabel [CHANGED] xLabel [CHA...
301,PersonX dances at the party xReact [GEN],X
302,PersonX and PersonY live nearby xReact [GEN],is using a modified X3
303,PersonX thinks of someone else's bad decision ...,is too smart to tell his own story
304,PersonX finds a role model HinderedBy [GEN],
...,...,...
395,PersonX is very confident xReact [GEN],has a great performance: http://puu
396,PersonX just married PersonY xIntent [GEN],xIntent_x = Gen [2] xIntent_y = PersonX xInte...
397,PersonX wants to visit the zoo xNeed [GEN],[FADE] The game will be about playing a girl ...
398,PersonX receives and email xEffect [GEN],


In [None]:
# Relation templates.
# Maps a relation type to the connective text that is inserted between the event (A)
# and the tail (B) when a sample is shown as a sentence.
# Only these 7 relation types are handled for now; a sample whose relation type
# is not a key of this dict is skipped.

relations = dict(
    HinderedBy=", can be hindered by, ",
    xNeed=" but before, PersonX needed ",
    xWant=", as a result, PersonX wants ",
    xIntent=" because PersonX wanted ",
    xReact=", as a result, PersonX feels ",
    xAttr=" so, PersonX is seen as ",
    xEffect=", as a result, PersonX ",
)

In [None]:
def name_PX_PY(s):
    """Return `s` with every 'PersonX' replaced by 'Alice' and every 'PersonY' by 'Bob'."""
    renamed = s
    # Apply the substitutions one after the other, PersonX first.
    for placeholder, name in (('PersonX', 'Alice'), ('PersonY', 'Bob')):
        renamed = renamed.replace(placeholder, name)
    return renamed

In [None]:
# Build the working dict used for hand-labeling, keyed by "phrase<row index>".
#
# data = {
#   "phrase300": {
#     "B": text of column B (shown to the labeler),
#     "R": connective template looked up in `relations`,
#     "A": event text of column A without the trailing "<relation> [GEN]",
#     "label": 0 until the phrase has been hand-labeled, then 1,
#     "A_origin": column A exactly as read,
#     "B_origin": column B exactly as read,
#   },
#   ...
# }

data = {}

# Iterate over the rows that actually exist in `df` instead of range(start, end+1):
# if the file has fewer rows than `end`, df.loc[i] would raise a KeyError.
for i, row in df.iterrows():
  sample = row.tolist()
  head = sample[0]
  if not isinstance(head, str):
    continue  # empty cell (e.g. NaN): nothing to parse

  # Column A looks like "<event>  <relation>  [GEN]". Splitting on any run of
  # whitespace makes the parse independent of how many spaces separate the parts
  # (split(" ") with fixed offsets only worked for exactly two spaces).
  tokens = head.split()
  if len(tokens) < 2:
    continue  # too short to contain "<relation> [GEN]"
  rela_type = tokens[-2]
  if rela_type not in relations:
    continue  # unsupported relation type: omit the sample

  data["phrase" + str(i)] = {
    'B': sample[1],
    'R': relations[rela_type],
    'A': " ".join(tokens[:-2]),
    'label': 0,
    'A_origin': head,
    'B_origin': sample[1],
  }

In [None]:
# Display the parsed phrases to sanity-check the A / R / B split
data

{'phrase300': {'B': 'nfusing  I was running off of the main menu   and trying the mouse to see how I was able to jump. I was able to move all over this area',
  'R': ', as a result, PersonX ',
  'A': 'PersonX has some problems',
  'label': 0,
  'A_origin': 'PersonX has some problems  xEffect  [GEN]',
  'B_origin': 'nfusing  I was running off of the main menu   and trying the mouse to see how I was able to jump. I was able to move all over this area'},
 'phrase301': {'B': nan,
  'R': ', as a result, PersonX feels ',
  'A': 'PersonX dances at the party',
  'label': 0,
  'A_origin': 'PersonX dances at the party  xReact  [GEN]',
  'B_origin': nan},
 'phrase302': {'B': 'this to each xGen for X [GEN] and xAchievements  [GEN] : Add one entry to each xGen for the next xGen and',
  'R': ', as a result, PersonX feels ',
  'A': 'PersonX and PersonY live nearby',
  'label': 0,
  'A_origin': 'PersonX and PersonY live nearby  xReact  [GEN]',
  'B_origin': 'this to each xGen for X [GEN] and xAchievem

# Hand labeling

## Handling interruption

To avoid having to relabel everything from the start when the kernel is interrupted, each phrase in the data carries a `label` flag. The flag is initially 0 and is set to 1 once the phrase has been labeled, so rerunning the labeling cell skips the phrases that are already done.

In [None]:
# Do not rerun this unless you want to label from the start:
# it clears the tallies, the collected rows AND the per-phrase "done" flags.

# Answer categories, in the order of the numeric choices offered to the labeler.
categories = [
    "always/often",
    "sometimes/likely",
    "farfetched/never",
    "invalid",
    "too unfamiliar to judge",
]

# Tally of how many phrases were assigned to each category.
# Derived from `categories` so the two can never get out of sync.
labels = {category: 0 for category in categories}

# Column-wise accumulator for the labeled rows (later turned into a DataFrame).
new_data = {
    "A": [],
    "B": [],
    "label": [],
}


def reset_done_flags(phrases):
    """Set the 'label' flag of every entry of `phrases` back to 0 (not yet labeled).

    `phrases` is a dict of dicts, each inner dict having a 'label' key.
    The dict is modified in place; nothing is returned.
    """
    for entry in phrases.values():
        entry['label'] = 0


# The tallies and rows above were just emptied, so the "done" flags stored in `data`
# must be cleared too. Otherwise the labeling loop would skip every phrase that was
# already labeled and the saved file would silently miss those rows.
# globals().get(...) keeps this cell runnable even when `data` has not been built yet.
reset_done_flags(globals().get('data', {}))

In [None]:
# Program to assist hand-labeling of questions
# Rerun this cell if interrupted. DO NOT rerun the above cell which tracks the labeling, otherwise it will reset.

changename = False # change this for dataset 6

# enumerate() numbers each phrase by its position in `data`, so the numbering stays
# correct when this cell is rerun after an interruption (a manual counter would
# restart at 1).
for count, phrase in enumerate(data, start=1):
  entry = data[phrase]
  if entry['label'] == 1:
    continue  # already labeled in an earlier run of this cell

  # A in blue, the relation connective in the default color, B in magenta.
  assertion = f"Phrase {count}: " + f"\x1b[34m{entry['A']}\x1b[0m" + entry['R'] + f"\x1b[35m{entry['B']}\x1b[0m"
  if changename:
    assertion = name_PX_PY(assertion)
  print(assertion)
  print("\x1b[47;1mHow often does the assertion hold true?\x1b[0m")
  print("0: always/often, 1: sometimes/likely, 2: farfetched/never, 3: invalid, 4: too unfamiliar to judge")

  # Ask again until the answer is a valid category index. Without this check a typo
  # would crash the cell (ValueError / IndexError), and a negative number would
  # silently be counted as a category from the end of the list and recorded as 'yes'.
  while True:
    res = input("Enter your evaluation: ").strip()
    try:
      choice = int(res)
    except ValueError:
      choice = -1
    if 0 <= choice < len(categories):
      break
    print(f"Invalid evaluation {res!r}: enter a whole number from 0 to {len(categories) - 1}.")

  # Record the answer; only the first two categories count as 'yes'.
  labels[categories[choice]] += 1
  entry['label'] = 1
  new_data['A'].append(entry['A_origin'])
  new_data['B'].append(entry['B_origin'])
  new_data['label'].append('yes' if choice < 2 else 'no')

Phrase 1: [34mPersonX has some problems[0m, as a result, PersonX [35mnfusing  I was running off of the main menu   and trying the mouse to see how I was able to jump. I was able to move all over this area[0m
[47;1mHow often does the assertion hold true?[0m
0: always/often, 1: sometimes/likely, 2: farfetched/never, 3: invalid, 4: too unfamiliar to judge
Phrase 2: [34mPersonX dances at the party[0m, as a result, PersonX feels [35mnan[0m
[47;1mHow often does the assertion hold true?[0m
0: always/often, 1: sometimes/likely, 2: farfetched/never, 3: invalid, 4: too unfamiliar to judge
Phrase 3: [34mPersonX and PersonY live nearby[0m, as a result, PersonX feels [35mthis to each xGen for X [GEN] and xAchievements  [GEN] : Add one entry to each xGen for the next xGen and[0m
[47;1mHow often does the assertion hold true?[0m
0: always/often, 1: sometimes/likely, 2: farfetched/never, 3: invalid, 4: too unfamiliar to judge
Phrase 4: [34mPersonX thinks of someone else's bad decisio

In [None]:
# Extract labelings: print the tally of how many phrases were assigned to each category

print(labels)

{'always/often': 28, 'sometimes/likely': 36, 'farfetched/never': 25, 'invalid': 8, 'too unfamiliar to judge': 3}


In [None]:
# Turn the collected rows into a table (columns A, B, label) and write it to a TSV
# named after the source data set and the labeled row range.
output_path = f'{string_name_data}_with_label_from_{start}_to_{end}.tsv'
new_df = pd.DataFrame(new_data)

new_df.to_csv(output_path, sep='\t')