# Pilots Transformations

This notebook processes and transforms the Pilots data.

In [None]:
import json
import unicodedata

from pathlib import Path
from xxhash import xxh64_hexdigest

In [None]:
DATA_DIR = Path("../../data/")

In [None]:
def hash_text(text: str) -> str:
    """Return a hex digest of *text* that ignores case and whitespace.

    The text is stripped, lower-cased and has every run of whitespace
    removed before it is hashed with xxh64 using a fixed seed of 42.
    """
    lowered = text.strip().lower()
    # split() with no separator drops all whitespace runs; joining on the
    # empty string therefore removes whitespace entirely.
    squeezed = "".join(lowered.split())
    return xxh64_hexdigest(squeezed, seed=42)

# CEPS

In [None]:
# Dataset key: used to build the CEPS data directory / output file paths and
# passed as the "dataset" value of every exported node in this cell.
DATASET = "ceps"


def get_clean_section_node(node, dataset, document, section, nodeidx, **kwargs):
    """Build the output record for a single node of a debate section.

    Args:
        node: Mapping that provides the node's "author" and "text".
        dataset: Value stored under the record's "dataset" key.
        document: Mapping that provides "document", "debate" and
            "conclusion", all copied into the metadata.
        section: Section name stored in the metadata.
        nodeidx: Index stored in the metadata under "nidx".
        **kwargs: Extra metadata entries. A key equal to one of the
            standard metadata keys replaces that key's value.

    Returns:
        A dict with the keys "dataset", "id", "text" and "metadata", where
        "id" is ``hash_text`` applied to the author followed by the text.
    """
    author, text = node["author"], node["text"]
    node_id = hash_text(author + text)

    metadata = {
        "nidx": nodeidx,
        "author": author,
        "document": document["document"],
        "debate": document["debate"],
        "conclusion": document["conclusion"],
        "section": section,
    }
    # Merged last so caller-supplied entries win over the defaults above.
    metadata.update(kwargs)

    return {"dataset": dataset, "id": node_id, "text": text, "metadata": metadata}


# Sections of a CEPS document in the order they are exported, each paired
# with the node fields that are copied into that section's metadata.
CEPS_SECTIONS = (
    ("opening", ("stance",)),
    ("rebuttal", ("stance",)),
    ("questions", ("type", "to")),
    ("closing", ("stance",)),
)

# Convert the cleaned CEPS JSON into a JSON-lines file with one node per line.
with (
    open(DATA_DIR / DATASET / f"{DATASET}-data.jl", "wt") as fho,
    open(DATA_DIR / DATASET / "clean" / "ceps.json", "rt") as fhi,
):
    ceps_raw = json.load(fhi)

    for document in ceps_raw:
        # 1-based index that keeps counting across all sections of the
        # document. It is advanced per node rather than derived from the
        # loop variable after each section, so an empty section can neither
        # raise NameError nor reuse a stale index from a previous section
        # or document.
        nodeidx = 0
        for section, extra_fields in CEPS_SECTIONS:
            for node in document[section]:
                nodeidx += 1
                extras = {field: node[field] for field in extra_fields}
                clean_node = get_clean_section_node(
                    node, DATASET, document, section, nodeidx, **extras
                )
                print(json.dumps(clean_node), file=fho)

# RIE

In [None]:
# Dataset key: used to build the RIE data directory / output file paths and
# stored in the "dataset" field of every exported node.
DATASET = "rie"

# Convert every cleaned RIE markdown transcript into one JSON-lines file.
# Each non-blank line is expected to look like "<speaker>: <text>".
with open(DATA_DIR / DATASET / f"{DATASET}-data.jl", "wt") as fho:
    # glob() yields paths in a filesystem-dependent order; sorting makes the
    # output file reproducible (and matches the Taejae export below).
    for input_file in sorted((DATA_DIR / DATASET / "clean").glob("*.markdown")):
        with open(input_file, "rt") as fhi:
            # Drop blank lines, collapse whitespace runs to single spaces and
            # apply NFKD normalization to what is left.
            raw_text = [
                unicodedata.normalize("NFKD", " ".join(line.split()))
                for line in fhi
                if line.strip()
            ]

        # lidx counts non-blank lines only, starting at 1.
        for lidx, line in enumerate(raw_text, start=1):
            # The first space-delimited token is the speaker label.
            speaker, separator, text = line.partition(" ")
            if not separator:
                raise ValueError(
                    f"{input_file.name}: non-blank line {lidx} has no text "
                    f"after the speaker token: {line!r}"
                )
            speaker = speaker.rstrip(":")
            node = {
                "dataset": DATASET,
                "id": hash_text(speaker + text),
                "text": text,
                "metadata": {
                    "file": input_file.name.split(".")[0],
                    "author": speaker,
                    "line": lidx,
                },
            }

            print(json.dumps(node), file=fho)

# Taejae Academy

In [None]:
import cssutils

from bs4 import BeautifulSoup

In [None]:
# Turn every raw Taejae HTML transcript into a JSON list of
# {"speaker": ..., "text": ...} entries written to the "clean" directory.
for input_file in (DATA_DIR / "taejae" / "raw").glob("*.html"):
    soup = BeautifulSoup(input_file.read_text(), "html.parser")

    # Collect the CSS classes (selectors starting with ".c") whose rule sets
    # font-weight to 700; the leading dot is removed from the selector.
    css = cssutils.parseString(soup.select("style")[0].encode_contents())
    speaker_classes = {
        rule.selectorText[1:]
        for rule in css
        if rule.type == rule.STYLE_RULE
        and rule.selectorText.startswith(".c")
        and rule.style.fontWeight == "700"
    }

    # One entry per <p>: whitespace-collapsed, NFKD-normalized text plus the
    # first <span> carrying one of the speaker classes (None when absent).
    document_structure = [
        {
            "text": unicodedata.normalize("NFKD", " ".join(tag.text.split())),
            "is_name": tag.find("span", class_=speaker_classes),
        }
        for tag in soup.find_all("p")
    ]

    parsed_document = []
    for entry in document_structure:
        content = entry["text"]

        # A paragraph with a speaker-class span starts a new speaker turn.
        if entry["is_name"]:
            parsed_document.append({"speaker": content})
            continue

        # Text that appears before any speaker is attributed to "N/A".
        if not parsed_document:
            parsed_document.append({"speaker": "N/A", "text": content})
            continue

        # Otherwise the paragraph belongs to the most recent turn.
        current = parsed_document[-1]
        if "text" in current:
            current["text"] = current["text"] + " " + content
        else:
            current["text"] = content

    output_path = (
        DATA_DIR / "taejae" / "clean" / input_file.name.replace(".html", ".json")
    )
    with open(output_path, "wt") as fh:
        fh.write(json.dumps(parsed_document))

In [None]:
# Flatten the cleaned Taejae JSON transcripts into a single JSON-lines file,
# one node per speaker turn, visiting the input files in sorted order.
with open(DATA_DIR / "taejae" / "taejae-academy-data.jl", "wt") as fho:
    for input_file in sorted((DATA_DIR / "taejae" / "clean").glob("*.json")):
        data = json.loads(input_file.read_text())
        debate = input_file.name.replace(".json", "")

        # lidx is the 1-based position of the turn within its file.
        for lidx, line in enumerate(data, start=1):
            speaker = line["speaker"]
            text = line["text"]
            clean_node = {
                "dataset": "taejae-academy",
                "id": hash_text(speaker + text),
                "text": text,
                "metadata": {"debate": debate, "author": speaker, "line": lidx},
            }
            fho.write(json.dumps(clean_node) + "\n")