# Finding parallel text in the Mambai Language Manual through PDFMiner

- Inputs: `Mambai to English dict.docx`, `English to Mambai dict.docx`
- Outputs: dictionaries `mgm_eng.json`, `eng_mgm.json`

Requirements:

1. Setup Python requirements: `python3 -m venv .venv && source .venv/bin/activate && pip install -r requirements.txt`
2. Run this notebook


In [56]:
from dataclasses import dataclass
import re
import json


@dataclass
class DictionaryEntry:
    entry: str
    definition: str
    part_of_speech: str = None


@dataclass
class Dictionary:
    entries: list[DictionaryEntry]

    def save_as_json(self, path):
        with open(path, "w") as f:
            json.dump([entry.__dict__ for entry in self.entries], f, indent=2)

    def load_from_json(self, path):
        with open(path, "r") as f:
            entries = json.load(f)
            self.entries = [DictionaryEntry(**entry) for entry in entries]


def parse_key(key: str):
    keys = [k.strip() for k in key.split(", ")]
    return keys


def parse_value(value: str):
    """Parse a raw value into a list of definitions, each with a part of speech."""
    value = value.replace("\n", " ")
    # split to get multiple definitions
    values = re.split(r";|,", value)
    for value in values:
        # extract part of speech from the value, if present: max 3 letters, ends with a '.', e.g. "adj." or "n."
        value = value.strip(". ")
        match = re.match(r"([a-z]{1,3}\.)(.+)$", value)
        if match:
            part_of_speech = match.group(1).strip()
            value = match.group(2).strip()
            yield {"part_of_speech": part_of_speech, "value": value}
        else:
            yield {"part_of_speech": None, "value": value}


assert list(parse_value("n. person")) == [{"part_of_speech": "n.", "value": "person"}]
assert list(parse_value("n. (artificial) light.")) == [
    {"part_of_speech": "n.", "value": "(artificial) light"}
]
assert list(parse_value("adj. short; shallow.")) == [
    {"part_of_speech": "adj.", "value": "short"},
    {"part_of_speech": None, "value": "shallow"},
]

In [71]:
from docx import Document


def extract_raw_data_from_docx(docx_path) -> dict:
    doc = Document(docx_path)
    data = {}
    current_key = None

    for paragraph in doc.paragraphs:
        for run in paragraph.runs:
            if run.bold:
                if current_key:
                    # Finalize the previous entry
                    data[current_key] = data.get(current_key, "").strip()
                current_key = run.text
                data[current_key] = ""
            else:
                if current_key:
                    data[current_key] += run.text

    # Finalizing the last entry
    if current_key:
        data[current_key] = data.get(current_key, "").strip()

    return data


def get_dictionary_from_raw_data(raw_data: dict) -> Dictionary:
    dictionary = Dictionary([])

    for key, value in raw_data.items():
        keys = parse_key(key)
        values = list(parse_value(value))
        # create a new entry for each key/value combination
        for key in keys:
            for value in values:
                part_of_speech = value["part_of_speech"]
                value = value["value"]
                if key and value:
                    dictionary.entries.append(
                        DictionaryEntry(
                            entry=key, definition=value, part_of_speech=part_of_speech
                        )
                    )
    return dictionary


def get_dictionary(path):
    raw_data = extract_raw_data_from_docx(file_path)
    dictionary = get_dictionary_from_raw_data(raw_data)
    return dictionary

In [72]:
file_path = "/Users/raphaelmerx/Downloads/Mambai/Mambai to English dict.docx"
mgm_eng_dictionary = get_dictionary(file_path)
mgm_eng_dictionary.save_as_json("mgm_eng.json")

In [73]:
file_path = "/Users/raphaelmerx/Downloads/Mambai/English to Mambai dict.docx"
eng_mgm_dictionary = get_dictionary(file_path)
eng_mgm_dictionary.save_as_json("eng_mgm.json")

import random

random.sample(eng_mgm_dictionary.entries, 10)

[DictionaryEntry(entry='too', definition='~', part_of_speech=None),
 DictionaryEntry(entry="pôs\nlet's", definition='ma', part_of_speech=None),
 DictionaryEntry(entry='indeed', definition='didi', part_of_speech=None),
 DictionaryEntry(entry='witch', definition='sabu', part_of_speech=None),
 DictionaryEntry(entry='a ~,', definition='(after a vowel) kene', part_of_speech=None),
 DictionaryEntry(entry='help', definition='ajuda', part_of_speech=None),
 DictionaryEntry(entry='blue', definition='moro', part_of_speech=None),
 DictionaryEntry(entry='bother', definition='dlai', part_of_speech=None),
 DictionaryEntry(entry='baby', definition='an-koso', part_of_speech=None),
 DictionaryEntry(entry='feather', definition='man-hulu', part_of_speech=None)]

In [74]:
# build mgm-eng.stem.dic for hunalign
# note that eng is target but format with be "eng_value @ mgm_key"

with open("mgm-eng.stem.dic", "w") as f:
    for entry in mgm_eng_dictionary.entries:
        f.write(f"{entry.definition} @ {entry.entry}\n")