# Clean Corpus Compilation

In [1]:
import json

from pathlib import Path
from xxhash import xxh64_hexdigest

In [2]:
# Root directories of the two corpora handled by this notebook. The paths are
# relative, so they resolve against the current working directory.
# clean_corpus() reads the JSON files in each root's "clean" subdirectory and
# writes the compiled output file into the root itself.
SP_DIR = Path("../../data/swedish-parliament/")
TA_DIR = Path("../../data/taejae/")

In [3]:
def hash_text(text: str) -> str:
    """Return a case- and whitespace-insensitive 64-bit xxHash of *text*.

    The text is lower-cased and all whitespace (leading, trailing and
    interior) is removed before hashing, so inputs that differ only in
    letter case or spacing yield the same hex digest. The seed is fixed at
    42 so digests are reproducible between runs.
    """
    lowered = text.strip().lower()
    # split() with no separator breaks on any whitespace run; re-joining the
    # pieces with an empty string drops the whitespace entirely.
    squashed = "".join(lowered.split())
    return xxh64_hexdigest(squashed, seed=42)

In [4]:
def clean_corpus(directory: Path, output_filename: str, dataset: str) -> None:
    """Compile the cleaned debate files of one corpus into a JSON Lines file.

    Every ``*.json`` file in ``directory / "clean"`` is read in sorted path
    order. Each file must contain an iterable of records with string
    ``"speaker"`` and ``"text"`` entries. For every record one JSON object is
    written on its own line to ``directory / output_filename``, with the keys
    ``dataset``, ``id``, ``author``, ``text`` and ``metadata`` (the source
    file's stem as ``debate`` and the 1-based record position as ``line``).

    Args:
        directory: Corpus root; holds the ``clean`` input subdirectory and
            receives the output file.
        output_filename: Name of the JSON Lines file to create (an existing
            file with that name is overwritten).
        dataset: Label stored in the ``dataset`` field of every record.

    Raises:
        OSError: If the output file or an input file cannot be opened.
        json.JSONDecodeError: If an input file is not valid JSON.
        KeyError: If a record lacks ``"speaker"`` or ``"text"``.
    """
    # The encoding is given explicitly: JSON files are UTF-8, and relying on
    # the platform's locale encoding would mis-decode non-ASCII text (or
    # raise) on systems whose default is not UTF-8.
    with open(directory / output_filename, "wt", encoding="utf-8") as fho:
        for input_file in sorted((directory / "clean").glob("*.json")):
            with open(input_file, "rt", encoding="utf-8") as fhi:
                data = json.load(fhi)

            # Path.stem drops only the final ".json" suffix, whereas
            # str.replace would also remove ".json" occurring elsewhere in
            # the file name.
            debate = input_file.stem

            for lidx, line in enumerate(data, start=1):
                clean_node = {
                    "dataset": dataset,
                    # The id depends on speaker and text only, so identical
                    # utterances by the same speaker share an id.
                    "id": hash_text(line["speaker"] + line["text"]),
                    "author": line["speaker"],
                    "text": line["text"],
                    "metadata": {
                        "debate": debate,
                        "line": lidx,
                    },
                }
                fho.write(json.dumps(clean_node) + "\n")

In [5]:
# Compile each corpus into its own JSON Lines file. The output file is written
# into the corpus directory and every record is tagged with the dataset label
# given as the third argument.
clean_corpus(SP_DIR, "swedish-parliament-data.jl", "swedish-parliament")
clean_corpus(TA_DIR, "taejae-academy-data.jl", "taejae-academy")