In [11]:
import json
import random
import re
from typing import Dict, List, Union

from tqdm import tqdm

In [2]:
# Load the raw generations: one JSON document per line (JSON Lines).
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (assumes the file was written as UTF-8 — confirm
# against the script that produced it).
with open("../results/gpt-3.5-turbo-0125/1shot/atomic_2020_dialogues.jsonl", encoding="utf-8") as f:
    # Skip whitespace-only lines; json.loads would raise on them.
    data = [json.loads(line) for line in f if line.strip()]

print("# total instances:", len(data))

# total instances: 57049


In [9]:
def _strip_speaker_label(utterance: str) -> str:
    """Remove one leading ``A:``/``B:`` label from a single utterance line."""
    # Anchored at the start so that "A:"/"B:" occurring inside the utterance
    # text (e.g. "I picked option B: the blue one") is left untouched.
    return re.sub(r"^[AB]:\s*", "", utterance.strip(), count=1).strip()


def parse_dialogue(example: str) -> Union[Dict[str, Union[List[str], str]], None]:
    """
    Parse a dialogue from a gpt-generated output.

    :param example: Generated output from ChatGPT with the following format
        (anything before the ``[with time elapsed]`` marker is discarded):

        [with time elapsed]
        A: I just arrived at the port now.
        B: What are you going to do?
        A: I'm going to take a walk around the port.
        B: I see. Have fun.
        [4 hours later]
        A: The view was amazing! I took a lot of pictures.

        [without time elapsed]
        A: I'll send you some pictures after I get back.
    :return: ``None`` when the output does not follow the expected format
        (missing marker, empty context, speakers that do not alternate around
        the time gap, or an immediate response by a different speaker).
        Otherwise a dict with the keys:

        - ``"context"``: list of utterances before the time gap, speaker
          labels removed, blank lines dropped
        - ``"time_elapsed"``: the text of the gap line before ``later]``,
          e.g. ``"4 hours"``
        - ``"delayed_response"``: first utterance after the time gap
        - ``"immediate_response"``: last line of the
          ``[without time elapsed]`` section
    """
    parts = example.split("[with time elapsed]")
    if len(parts) < 2:
        return None
    # Normalise numbered speaker labels ("A1:", "B2:") to plain "A:" / "B:".
    dialogue = re.sub(r"([AB])\d:", r"\1:", parts[1].strip())

    gap_parts = dialogue.split("later]\n")
    if len(gap_parts) < 2:
        return None

    # The last line before "later]" is the gap line itself (e.g. "[4 hours");
    # everything above it is the conversation context.
    lines_before_gap = gap_parts[0].split("\n")
    context = "\n".join(lines_before_gap[:-1]).strip()
    time_elapsed = lines_before_gap[-1].strip()
    if not context:
        # No utterance precedes the time gap; indexing below would fail.
        return None
    penultimate_speaker = context.split("\n")[-1][0]
    if penultimate_speaker not in ("A", "B"):
        return None

    delayed_response = gap_parts[1].split("\n")[0].strip()
    if not delayed_response:
        return None
    target_speaker = delayed_response[0]
    if target_speaker not in ("A", "B"):
        return None

    if target_speaker == penultimate_speaker:
        # The same speaker talks right before and right after the gap: drop
        # the last context line so that the context ends with the other speaker.
        context = "\n".join(context.split("\n")[:-1]).strip()
        if not context:
            return None
        penultimate_speaker = context.split("\n")[-1][0]
        if penultimate_speaker not in ("A", "B") or penultimate_speaker == target_speaker:
            return None

    no_gap_parts = dialogue.split("[without time elapsed]\n")
    if len(no_gap_parts) < 2:
        return None
    immediate_response = no_gap_parts[1].strip()
    if not immediate_response:
        return None

    # Only the last line of the section is used as the immediate response.
    immediate_response = immediate_response.split("\n")[-1].strip()
    if immediate_response[0] != target_speaker:
        return None

    # postprocess
    context_utterances = [
        _strip_speaker_label(line) for line in context.split("\n") if line.strip()
    ]
    time_elapsed = (
        time_elapsed.replace("[", "")
        .replace("{", "")
        .replace("(", "")
        .replace("A:", "")
        .replace("B:", "")
        .strip()
    )

    return {
        "context": context_utterances,
        "time_elapsed": time_elapsed,
        "delayed_response": _strip_speaker_label(delayed_response),
        "immediate_response": _strip_speaker_label(immediate_response),
    }

In [10]:
# Markers that every well-formed generation has to contain.
required_markers = ("[with time elapsed]\n", "later]\n", "[without time elapsed]\n")

instances = []
for record in data:
    # record[0] is the user request, record[1] is the API response
    output = record[1]["choices"][0]["message"]["content"]

    # drop malformed responses that lack any of the required markers
    if any(marker not in output for marker in required_markers):
        continue

    parsed = parse_dialogue(output)
    if parsed is None:
        continue
    instances.append(parsed)

print("# filtered instances:", len(instances))

# filtered instances: 36481


In [8]:
# Display the first parsed instance as a quick sanity check of the parser output.
# NOTE(review): the recorded output of this cell contains a 'length' key that
# parse_dialogue does not return, so it appears to stem from an earlier run —
# re-execute the notebook top to bottom to refresh it.
instances[0]

{'context': ["Have you ever experienced such a sudden blush? It's so embarrassing.",
  "Oh no, I hope you're feeling better now. How long does it usually last?",
  "Usually just a few minutes, it's slowly fading now.",
  "Take a deep breath, it'll pass."],
 'time_elapsed': 'a few minutes',
 'delayed_response': 'Phew, feeling better now. It was really intense for a moment there.',
 'immediate_response': 'I think the moment has passed, feeling like myself again.',
 'length': 5}

In [12]:
# Shuffle in place with a dedicated, seeded generator so that the
# train/valid/test split is identical on every Restart & Run All and the
# global `random` state is left untouched.
# Note: re-running only this cell shuffles the already-shuffled list again.
random.Random(42).shuffle(instances)

In [13]:
# 90% / 5% / 5% split; the boundaries are computed once and reused.
n_total = len(instances)
train_end = int(0.9 * n_total)
valid_end = int(0.95 * n_total)

train = instances[:train_end]
valid = instances[train_end:valid_end]
test = instances[valid_end:]

In [14]:
# Write each split to its own pretty-printed JSON file.
split_files = [
    ("../resources/data/train.json", train),
    ("../resources/data/valid.json", valid),
    ("../resources/data/test.json", test),
]
for split_path, split_instances in split_files:
    with open(split_path, "w") as f:
        json.dump(split_instances, f, indent=2)