In [None]:
import json
import unicodedata

from pathlib import Path
from xxhash import xxh64_hexdigest

In [None]:
# Root of the data folder. The cells below expect one sub-directory per dataset,
# each holding a "raw" folder (inputs) and a "clean" folder (generated .jl output).
# The path is relative, so it is resolved against the kernel's working directory.
DATA_DIR = Path("../../data/")

In [None]:
def hash_text(text: str) -> str:
    """Return an xxh64 hex digest of ``text`` that ignores case and whitespace.

    The text is lowercased and all whitespace (leading, trailing and internal)
    is removed before hashing, so strings that differ only in casing or spacing
    produce the same digest. The hash seed is fixed at 42.
    """
    tokens = text.lower().split()
    squashed = "".join(tokens)
    return xxh64_hexdigest(squashed, seed=42)

# RIE

In [None]:
DATASET = "rie"

rie_raw_dir = DATA_DIR / DATASET / "raw"
rie_clean_dir = DATA_DIR / DATASET / "clean"
# Create the output folder up front so the cell also works on a fresh checkout.
rie_clean_dir.mkdir(parents=True, exist_ok=True)

# Convert every raw transcript into JSON-lines records, one record per non-blank line.
# Encodings are explicit so the result does not depend on the platform locale.
with open(rie_clean_dir / f"{DATASET}-data.jl", "wt", encoding="utf-8") as fho:
    # glob() yields files in filesystem order; sorting keeps the output reproducible.
    for input_file in sorted(rie_raw_dir.glob("*.markdown")):
        with open(input_file, "rt", encoding="utf-8") as fhi:
            # Drop blank lines, collapse whitespace runs to single spaces and
            # apply NFKD normalization to what remains.
            raw_text = [
                unicodedata.normalize("NFKD", " ".join(line.split()))
                for line in fhi
                if line.strip()
            ]

        # NOTE: lidx is the 1-based position among the non-blank lines, not the
        # physical line number in the file.
        for lidx, line in enumerate(raw_text, start=1):
            # The first space-delimited token is the speaker tag (a trailing ":"
            # is stripped); everything after it is the utterance.
            speaker, separator, text = line.partition(" ")
            if not separator:
                # Fail with context instead of an opaque tuple-unpacking error.
                raise ValueError(
                    f"{input_file.name}: non-blank line {lidx} has no text after the speaker: {line!r}"
                )
            speaker = speaker.rstrip(":")
            node = {
                "dataset": DATASET,
                "id": hash_text(speaker + text),
                "author": speaker,
                "text": text,
                "metadata": {
                    "file": input_file.name.split(".")[0],
                    "line": lidx
                }
            }

            print(json.dumps(node), file=fho)

# CEPS

In [None]:
DATASET = "ceps"

def get_clean_section_node(node, dataset, document, section, nodeidx, **kwargs):
    return {
        "dataset": dataset,
        "id": hash_text(node["author"] + node["text"]),
        "author": node["author"],
        "text": node["text"],
        "metadata": {
            "nidx": nodeidx,
            "document": document["document"],
            "debate": document["debate"],
            "conclusion": document["conclusion"],
            "section": section,
            **kwargs
        }
    }
        

ceps_clean_dir = DATA_DIR / DATASET / "clean"
# Create the output folder up front so the cell also works on a fresh checkout.
ceps_clean_dir.mkdir(parents=True, exist_ok=True)

# Read the input before opening the output, so a missing or invalid input file
# does not truncate an existing output file.
with open(DATA_DIR / DATASET / "raw" / "ceps.json", "rt", encoding="utf-8") as fhi:
    ceps_raw = json.load(fhi)

# Sections in output order, each with the node fields copied into the metadata.
CEPS_SECTION_FIELDS = (
    ("opening", ("stance",)),
    ("rebuttal", ("stance",)),
    ("questions", ("type", "to")),
    ("closing", ("stance",)),
)

with open(ceps_clean_dir / f"{DATASET}-data.jl", "wt", encoding="utf-8") as fho:
    for document in ceps_raw:
        # One running counter per document, starting at 1 and continuing across
        # sections. It does not rely on a loop variable surviving the loop, so an
        # empty section can neither raise NameError nor carry over a stale index
        # from the previous document.
        nodeidx = 0
        for section, extra_fields in CEPS_SECTION_FIELDS:
            for node in document[section]:
                nodeidx += 1
                extras = {field: node[field] for field in extra_fields}
                clean_node = get_clean_section_node(node, DATASET, document, section, nodeidx, **extras)
                print(json.dumps(clean_node), file=fho)