# Finding parallel text in the Mambai Language Manual through PDFMiner

- Inputs: `Mambai to English dict.docx`, `English to Mambai dict.docx`
- Outputs: dictionaries `mgm_eng.json`, `eng_mgm.json`

Requirements:

1. Setup Python requirements: `python3 -m venv .venv && source .venv/bin/activate && pip install -r requirements.txt`
2. Run this notebook


In [3]:
preds, refs = [
    "almeida du santus jame du santus jeamen",
    "niapreni bua tototu sine mai ubuka no transforma nia sai ema nee be ia valor kadidade",
    "loska sala se los siradn los sala deansira bel hadia",
    "ita aberta kole aberta ba sirab hodi nune sira mas bele komprende sira mose bele hala",
    "ita timor dia klivita tur hatur no nok atu hasilesion ne te inpase politikeda ne hodi hare dezenvolvimentu nasional no hasoru se karik buat ne akontesia ita nerai",
    "agora fakata dalan ba mi bobu kikina kasian",
    "sasan nebe agora ami uza ne ida neebe mak  sira sames lorio ne mai iha fula kotuk neebe ami uza sito agora",
    "hantanin liki rasi kosar na hakee mu",
    "diskulfami en komodaitu<pad>",
    "haufiar hane maunbotsananahaufiar sn pursentuhaufiar ma maun bot sa nana serake i haoko se ne bele ga lae<pad>",
    "hanesan iha batu lari neba ne emania aman saiusin natar laran maidean covid laimos pulisi baba brikatin seriak covid nebitataunsa<pad>",
    "ami buko ki pova kik na balin futur<pad>",
    "ho volumi mota neebe bot hakotu hotu kanu kanalizasaun be mos i hihaparte maloanian<pad>",
    "importante haturba tutu hotu ho komitmentu diakontade bot atu hala mudansa<pad>",
    "<pad>",
    "maisr nain tolu hanuin katakokt iha si iha mrafat<pad>",
], [
    "almeido dos santos jaime dos santos gmn",
    "nia aprende buat hotuhotu husi nia mai eduka no transforma nia sai ema neebe iha valor karidade",
    "loos ka sala se loos sira dehan loos sala dehan sira bele hadia",
    "ita aberta koalia aberta ba sira hodi nunee sira mos bele komprende sira mos bele halao",
    "ita timor diak liu ita tuur ita tuur nonook atu solusiona impasse politika ida nee hodi haree dezenvolvimentu nasional no hasoru se karik buat nee akontese iha itania rai",
    "agora taka dalan ba ami povu kiik nee kasihan",
    "sasan neebe agora ami uza nee ida neebe maka sira sames lori ona mai iha fulan kotuk neebe ami uza sei too agora",
    "ami paling liki rasik kosar par han hemu",
    "deskulpa ami inkomoda uitoan",
    "hau fiar haunia maun boot xanana hau fiar cen porsentu hau fiar mak maun boot xanana serake iha okos nee bele ka lae",
    "hanesan iha uatolari neeba nee ema nia aman sai husi natar laran mai dehan covid laiha mos polisia ba obriga tenke ser iha covid neebe ita atu halo nuusa",
    "ami povu kiik nee paling futuru",
    "ho volume mota neebe boot hakotu hotu kanu kanalizasaun bee moos iha parte maloha nian",
    "importante hatuur buat hotuhotu ho kometimentu no iha vontade boot atu halo mudansa",
    "la iha buat ida",
    "mais sira nain tolu hanoin katak covid iha sei iha nafatin",
]

for pred, ref in zip(preds, refs):
    print(pred)
    print(ref)
    print()

almeida du santus jame du santus jeamen
almeido dos santos jaime dos santos gmn

niapreni bua tototu sine mai ubuka no transforma nia sai ema nee be ia valor kadidade
nia aprende buat hotuhotu husi nia mai eduka no transforma nia sai ema neebe iha valor karidade

loska sala se los siradn los sala deansira bel hadia
loos ka sala se loos sira dehan loos sala dehan sira bele hadia

ita aberta kole aberta ba sirab hodi nune sira mas bele komprende sira mose bele hala
ita aberta koalia aberta ba sira hodi nunee sira mos bele komprende sira mos bele halao

ita timor dia klivita tur hatur no nok atu hasilesion ne te inpase politikeda ne hodi hare dezenvolvimentu nasional no hasoru se karik buat ne akontesia ita nerai
ita timor diak liu ita tuur ita tuur nonook atu solusiona impasse politika ida nee hodi haree dezenvolvimentu nasional no hasoru se karik buat nee akontese iha itania rai

agora fakata dalan ba mi bobu kikina kasian
agora taka dalan ba ami povu kiik nee kasihan

sasan nebe agora 

In [56]:
from dataclasses import dataclass
import re
import json


@dataclass
class DictionaryEntry:
    entry: str
    definition: str
    part_of_speech: str = None


@dataclass
class Dictionary:
    entries: list[DictionaryEntry]

    def save_as_json(self, path):
        with open(path, "w") as f:
            json.dump([entry.__dict__ for entry in self.entries], f, indent=2)

    def load_from_json(self, path):
        with open(path, "r") as f:
            entries = json.load(f)
            self.entries = [DictionaryEntry(**entry) for entry in entries]


def parse_key(key: str):
    keys = [k.strip() for k in key.split(", ")]
    return keys


def parse_value(value: str):
    """Parse a raw value into a list of definitions, each with a part of speech."""
    value = value.replace("\n", " ")
    # split to get multiple definitions
    values = re.split(r";|,", value)
    for value in values:
        # extract part of speech from the value, if present: max 3 letters, ends with a '.', e.g. "adj." or "n."
        value = value.strip(". ")
        match = re.match(r"([a-z]{1,3}\.)(.+)$", value)
        if match:
            part_of_speech = match.group(1).strip()
            value = match.group(2).strip()
            yield {"part_of_speech": part_of_speech, "value": value}
        else:
            yield {"part_of_speech": None, "value": value}


assert list(parse_value("n. person")) == [{"part_of_speech": "n.", "value": "person"}]
assert list(parse_value("n. (artificial) light.")) == [
    {"part_of_speech": "n.", "value": "(artificial) light"}
]
assert list(parse_value("adj. short; shallow.")) == [
    {"part_of_speech": "adj.", "value": "short"},
    {"part_of_speech": None, "value": "shallow"},
]

In [71]:
from docx import Document


def extract_raw_data_from_docx(docx_path) -> dict:
    doc = Document(docx_path)
    data = {}
    current_key = None

    for paragraph in doc.paragraphs:
        for run in paragraph.runs:
            if run.bold:
                if current_key:
                    # Finalize the previous entry
                    data[current_key] = data.get(current_key, "").strip()
                current_key = run.text
                data[current_key] = ""
            else:
                if current_key:
                    data[current_key] += run.text

    # Finalizing the last entry
    if current_key:
        data[current_key] = data.get(current_key, "").strip()

    return data


def get_dictionary_from_raw_data(raw_data: dict) -> Dictionary:
    dictionary = Dictionary([])

    for key, value in raw_data.items():
        keys = parse_key(key)
        values = list(parse_value(value))
        # create a new entry for each key/value combination
        for key in keys:
            for value in values:
                part_of_speech = value["part_of_speech"]
                value = value["value"]
                if key and value:
                    dictionary.entries.append(
                        DictionaryEntry(
                            entry=key, definition=value, part_of_speech=part_of_speech
                        )
                    )
    return dictionary


def get_dictionary(path):
    raw_data = extract_raw_data_from_docx(file_path)
    dictionary = get_dictionary_from_raw_data(raw_data)
    return dictionary

In [72]:
file_path = "/Users/raphaelmerx/Downloads/Mambai/Mambai to English dict.docx"
mgm_eng_dictionary = get_dictionary(file_path)
mgm_eng_dictionary.save_as_json("mgm_eng.json")

In [73]:
file_path = "/Users/raphaelmerx/Downloads/Mambai/English to Mambai dict.docx"
eng_mgm_dictionary = get_dictionary(file_path)
eng_mgm_dictionary.save_as_json("eng_mgm.json")

import random

random.sample(eng_mgm_dictionary.entries, 10)

[DictionaryEntry(entry='too', definition='~', part_of_speech=None),
 DictionaryEntry(entry="p√¥s\nlet's", definition='ma', part_of_speech=None),
 DictionaryEntry(entry='indeed', definition='didi', part_of_speech=None),
 DictionaryEntry(entry='witch', definition='sabu', part_of_speech=None),
 DictionaryEntry(entry='a ~,', definition='(after a vowel) kene', part_of_speech=None),
 DictionaryEntry(entry='help', definition='ajuda', part_of_speech=None),
 DictionaryEntry(entry='blue', definition='moro', part_of_speech=None),
 DictionaryEntry(entry='bother', definition='dlai', part_of_speech=None),
 DictionaryEntry(entry='baby', definition='an-koso', part_of_speech=None),
 DictionaryEntry(entry='feather', definition='man-hulu', part_of_speech=None)]

In [74]:
# build mgm-eng.stem.dic for hunalign
# note that eng is target but format with be "eng_value @ mgm_key"

with open("mgm-eng.stem.dic", "w") as f:
    for entry in mgm_eng_dictionary.entries:
        f.write(f"{entry.definition} @ {entry.entry}\n")