# Swedish Parliament Process

This notebook processes the data coming from the Swedish parliament.

In [None]:
import json
import unicodedata

from pathlib import Path
from xxhash import xxh64_hexdigest

In [None]:
def hash_text(text: str) -> str:
    """Return the seeded xxHash64 hex digest of a whitespace-free, lowercased text.

    The input is stripped, lowercased and split on whitespace; the resulting
    tokens are glued back together without any separator before hashing, so
    casing and spacing differences do not affect the digest.
    """
    tokens = text.strip().lower().split()
    collapsed = "".join(tokens)
    return xxh64_hexdigest(collapsed, seed=42)

In [None]:
# Root folder of the Swedish parliament dataset (relative path).
DATA_DIR = Path("../../data/swedish-parliament/")
# Folder holding the anonymized ``*.txt`` transcripts that are read as input.
INPUT_DIR = DATA_DIR.joinpath("anonymized")
# Folder receiving the per-transcript JSON and Markdown files.
OUTPUT_DIR = DATA_DIR.joinpath("clean")

In [None]:
# Convert every anonymized transcript into a JSON file (a list of
# {"speaker", "text"} records) and a Markdown rendering of the same lines.

# open() does not create missing folders, so make sure the target exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

for input_file in INPUT_DIR.glob("*.txt"):
    # "utf-8-sig" decodes UTF-8 and drops a leading BOM when one is present.
    with open(input_file, "rt", encoding="utf-8-sig") as fh:
        # Skip blank lines, collapse whitespace runs into single spaces and
        # apply NFKD compatibility decomposition to each remaining line.
        raw_text = [
            unicodedata.normalize("NFKD", " ".join(line.split()))
            for line in fh
            if line.strip()
        ]

    json_document = []
    md_document = []

    for line in raw_text:
        # The first space-delimited token is the speaker label. partition()
        # yields an empty text for a single-token line instead of raising
        # ValueError the way a two-name unpack of split(" ", 1) would.
        speaker, _sep, text = line.partition(" ")
        json_document.append({"speaker": speaker.rstrip(":"), "text": text})
        md_document.append(f"*{speaker}* {text}")

    # with_suffix() swaps only the final extension, unlike str.replace(),
    # which rewrites every ".txt" occurring anywhere in the file name.
    json_path = OUTPUT_DIR / input_file.with_suffix(".json").name
    md_path = OUTPUT_DIR / input_file.with_suffix(".markdown").name

    # Explicit UTF-8: NFKD output contains combining characters that many
    # locale default encodings cannot represent.
    with open(json_path, "wt", encoding="utf-8") as fh:
        json.dump(json_document, fh)

    with open(md_path, "wt", encoding="utf-8") as fh:
        print("\n".join(md_document), file=fh)

In [None]:
def clean_corpus(directory: Path, output_filename: str, dataset: str) -> None:
    """Merge the per-debate JSON files into a single JSON Lines corpus.

    Every ``*.json`` file found in ``directory / "clean"`` is read in sorted
    order, and one JSON object per entry is written, one per line, to
    ``directory / output_filename``.

    Args:
        directory: Dataset root; it must contain the ``clean`` subfolder and
            is also where the output file is created.
        output_filename: Name of the JSON Lines file to write.
        dataset: Label stored in the ``dataset`` field of every record.

    Each record carries the entry text, an ``id`` computed by ``hash_text``
    over the speaker concatenated with the text, and metadata holding the
    debate name (the input file name without its extension), the speaker
    and the 1-based position of the entry within its file. Because the id
    depends only on speaker and text, identical utterances by the same
    speaker share an id.
    """
    # Explicit UTF-8 keeps reading and writing independent of the locale.
    with open(directory / output_filename, "wt", encoding="utf-8") as fho:
        for input_file in sorted((directory / "clean").glob("*.json")):
            with open(input_file, "rt", encoding="utf-8") as fhi:
                data = json.load(fhi)

            # stem removes only the final extension, whereas
            # name.replace(".json", "") would strip every occurrence.
            debate = input_file.stem

            for lidx, line in enumerate(data, start=1):
                clean_node = {
                    "dataset": dataset,
                    "id": hash_text(line["speaker"] + line["text"]),
                    "text": line["text"],
                    "metadata": {
                        "debate": debate,
                        "author": line["speaker"],
                        "line": lidx,
                    },
                }
                print(json.dumps(clean_node), file=fho)


# Build the JSON Lines corpus from the files under DATA_DIR / "clean".
clean_corpus(
    directory=DATA_DIR,
    output_filename="swedish-parliament-data.jl",
    dataset="swedish-parliament",
)