In [1]:
import re
from swda import CorpusReader

In [2]:
def parse_history(history_field):
    """Split a raw history field into ``(speaker, text)`` pairs.

    The field is read as a ``|``-separated list of ``speaker: text`` turns.
    The literal ``"EMPTY"`` and blank/whitespace-only fields produce an empty
    list, and any turn that contains no ``:`` is dropped.

    Args:
        history_field: Raw history string.

    Returns:
        List of ``(speaker, text)`` tuples in their original order, with
        surrounding whitespace removed from both parts. Only the first ``:``
        of a turn separates speaker from text.
    """
    if history_field == "EMPTY" or not history_field.strip():
        return []

    pairs = []
    for chunk in history_field.split("|"):
        # partition() yields an empty separator when the chunk has no colon.
        who, colon, said = chunk.partition(":")
        if colon:
            pairs.append((who.strip(), said.strip()))
    return pairs


def preprocess_ftad_eot(path, prefix):
    """Load an FTAD end-of-turn file into a list of sample dicts.

    Each usable line holds three tab-separated fields: history, current
    text, and an integer label. Lines that do not split into exactly three
    fields are skipped silently.

    Args:
        path: Path of the UTF-8 text file to read.
        prefix: String used to build each sample's ``dialog_id``
            (``f"{prefix}_{line_index}"``).

    Returns:
        List of dicts with keys ``dialog_id``, ``turn_index``, ``speaker``,
        ``history``, ``text`` and ``label_eot``. ``turn_index`` is the
        zero-based line number in the file, counting skipped lines too.

    Raises:
        ValueError: If a three-field line carries a label ``int`` cannot parse.
    """
    dataset = []

    # Iterate over the open file directly instead of loading every line first.
    with open(path, "r", encoding="utf8") as f:
        for idx, line in enumerate(f):
            # Trim only the right-hand side. A full strip() would also remove
            # a leading tab, so a row with a blank history field would
            # collapse to two fields and be dropped.
            fields = line.rstrip().split("\t")
            if len(fields) != 3:
                continue

            history_raw, curr_text, label_raw = fields
            history = parse_history(history_raw)

            # The speaker is copied from the most recent history turn; with no
            # history there is nothing to infer it from.
            # NOTE(review): this presumes the current text belongs to the same
            # speaker as the last history turn — confirm against the FTAD format.
            speaker = history[-1][0] if history else None

            dataset.append({
                "dialog_id": f"{prefix}_{idx}",
                "turn_index": idx,
                "speaker": speaker,
                "history": history,
                "text": curr_text.strip(),
                "label_eot": int(label_raw),
            })

    return dataset

# Example usage for the FTAD splits (left disabled):
# train = preprocess_ftad_eot("FTAD/train.txt", "FTAD_train")
# val = preprocess_ftad_eot("FTAD/val.txt",   "FTAD_val")
# test = preprocess_ftad_eot("FTAD/test.txt", "FTAD_test")

In [None]:
def clean_swda_text(text):
    """Strip inline markup from an utterance string and normalise spacing.

    In order: removes every ``{...}`` span, removes every ``<...>`` span
    (both matched non-greedily and not across newlines), deletes all ``/``
    characters, collapses each whitespace run to a single space, and trims
    the ends.

    Args:
        text: Raw utterance text.

    Returns:
        The cleaned string.
    """
    no_braces = re.sub(r"{.*?}", "", text)
    no_angle_tags = re.sub(r"<.*?>", "", no_braces)
    no_slashes = no_angle_tags.replace("/", "")
    return re.sub(r"\s+", " ", no_slashes).strip()


def build_full_history(utts, i):
    """
    FTAD-style full dialogue history.

    Returns one ``(caller, cleaned text)`` tuple for each of the first ``i``
    items of ``utts``, in order; the item at position ``i`` itself is not
    included.
    """
    return [
        (utts[pos].caller, clean_swda_text(utts[pos].text))
        for pos in range(i)
    ]


def preprocess_swda_eot(swda_root):
    """Build end-of-turn samples from every transcript under ``swda_root``.

    For each transcript, utterances are ordered by ``utterance_index`` and
    one sample is produced per utterance. The end-of-turn label is 1 when
    the next utterance has a different ``caller`` (or when there is no next
    utterance) and 0 otherwise.

    Args:
        swda_root: Value handed to ``CorpusReader`` to locate the corpus.

    Returns:
        List of dicts with keys ``dialog_id`` (the transcript's
        ``swda_filename``), ``turn_index`` (position after sorting),
        ``speaker``, ``history`` (list of ``(caller, cleaned text)`` tuples
        for all earlier utterances of the same transcript), ``text``
        (cleaned utterance text) and ``label_eot``.
    """
    corpus = CorpusReader(swda_root)
    dataset = []

    for trans in corpus.iter_transcripts():
        dialog_id = trans.swda_filename
        utts = sorted(trans.utterances, key=lambda u: u.utterance_index)

        # Clean every utterance once up front. Building the history for
        # sample i is then a cheap slice instead of re-running the regexes
        # over all earlier utterances for every sample.
        turns = [(u.caller, clean_swda_text(u.text)) for u in utts]
        last_index = len(turns) - 1

        for i, (speaker, text) in enumerate(turns):
            # EoT label based on speaker change; the final utterance of a
            # transcript always counts as an end of turn.
            if i < last_index:
                label_eot = int(turns[i + 1][0] != speaker)
            else:
                label_eot = 1

            dataset.append({
                "dialog_id": dialog_id,
                "turn_index": i,
                "speaker": speaker,
                # Slicing yields a fresh list per sample, so samples never
                # share a mutable history object.
                "history": turns[:i],
                "text": text,
                "label_eot": label_eot,
            })

    return dataset

In [4]:
# Build the FTAD training samples; each dialog_id is prefixed with "FTAD_train".
ftad_train = preprocess_ftad_eot("FTAD/train.txt", "FTAD_train")
# Build the SWDA samples; "swda" is the corpus root passed to CorpusReader.
swda_data = preprocess_swda_eot("swda")

transcript 1155
