# Finding parallel text in the Mambai Language Manual

Input: `Mambai Language Manual.docx`
Output: `mambai_parallel_text.csv`

Requirements:

1. Set up the Python environment: `python3 -m venv .venv && source .venv/bin/activate && pip install -r requirements.txt`
2. Put your OPENAI_API_KEY in `.env`
3. Run this notebook


In [7]:
import docx2txt
from dotenv import load_dotenv
import random

# Load the variables defined in the .env file; the OpenAI API key
# should be stored there under OPENAI_API_KEY
load_dotenv()

True

In [9]:
filename = "Mambai Language Manual.docx"

# Extract the plain text of the .docx with docx2txt, then discard everything
# before the sentence that opens the Grammar section.
# (str.index raises ValueError if that sentence is not found.)
text = docx2txt.process(filename)
grammar_start = text.index("Mambai has a very simple grammatical structure,")
text = text[grammar_start:]

print(f"Total text length: {len(text)}")
print(f"Total words: {len(text.split())}")

Total text length: 144322
Total words: 25560


In [10]:
# Split the text into windows of 500 words, each overlapping the previous one by 250 words
from typing import List
from dataclasses import dataclass

# Split the manual text on whitespace; the sliding windows below are cut from this word list
words = text.split()
print(f"Total words: {len(words)}")


@dataclass
class Section:
    """One sliding-window chunk of the manual and the pairs extracted from it.

    Attributes:
        text: The words of this window joined by single spaces.
        parallel_text: The Mambai/English pairs parsed from the model's JSON
            reply, one dict per pair with "Mambai" and "English" keys. It is
            an empty list until the section has been processed.
    """

    text: str
    # Entries are dicts (they are indexed by "Mambai" / "English" when the
    # results are collected), so List[str] was the wrong annotation.
    parallel_text: List[dict]


sliding_window = 500
overlap = 250
# The window start advances by (sliding_window - overlap) words, so the step must be positive.
assert 0 <= overlap < sliding_window, "overlap must be smaller than sliding_window"

sections = []
for i in range(0, len(words), sliding_window - overlap):
    # Use a dedicated name for the window text. Rebinding `text` here would
    # replace the full manual text with the last window, and re-running this
    # cell would then rebuild `words` from that fragment only.
    section_text = " ".join(words[i : i + sliding_window])
    sections.append(Section(text=section_text, parallel_text=[]))

print(f"Total sections: {len(sections)}")
print(
    f'Total words in all sections combined: {len(" ".join([s.text for s in sections]).split())}'
)

Total words: 25560
Total sections: 103
Total words in all sections combined: 50870


In [12]:
from openai import OpenAI
import os
import json

# OpenAI client. Passing the key explicitly is optional (the client defaults
# to the OPENAI_API_KEY environment variable), but it makes the dependency visible.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


# System message sent with every extraction request
system_prompt = "You are a research assistant in linguistics, rigorously sorting through text."

# User prompt for one section. The {text} placeholder is filled in with
# str.format, which is why the literal braces of the JSON example are
# doubled ({{ and }}).
prompt_template = """The following is extracted from a Mambai language manual:

START
{text}
END

Give me the parallel Mambai/English words or phrases mentioned in this section, in json format. Ignore other text, and ignore phrases that are not parallel.

Example:
START
The following conjunctions are best introduced in context:
Atmen im lao
Im lao hal
Arpi'l im lao
if
if you go
when you go
END

Expected result:
[
  {{"Mambai": "Atmen im lao", "English": "if"}},
  {{"Mambai": "Im lao hal", "English": "if you go"}},
  {{"Mambai": "Arpi'l im lao", "English": "when you go"}}
]
"""

def _extract_json_payload(raw):
    """Return the JSON text in `raw`, without any Markdown code fence around it."""
    fence = "```"
    start = raw.find(fence)
    if start == -1:
        # No fence at all: the reply is expected to be the JSON itself.
        return raw
    payload = raw[start + len(fence) :]
    # Cut at the last closing fence if there is one; a reply with only an
    # opening fence keeps everything after it.
    end = payload.rfind(fence)
    if end != -1:
        payload = payload[:end]
    # Skip an optional language tag (e.g. "json") that directly follows the
    # opening fence, instead of assuming it is always exactly four letters.
    tag_length = 0
    while tag_length < len(payload) and payload[tag_length].isalpha():
        tag_length += 1
    return payload[tag_length:]


def get_section_parallel_text(section):
    """Ask the model for the Mambai/English pairs in one section and store them.

    Args:
        section: Object with a `text` string and a `parallel_text` list. The
            parsed pairs are written to `section.parallel_text`.

    Returns:
        The parsed list of pairs, or None when the section already has
        parallel text and is skipped.

    Raises:
        ValueError: If the reply has no text content, is not valid JSON
            (json.JSONDecodeError is a ValueError subclass) or is not a JSON list.
    """
    if len(section.parallel_text) > 0:
        print("Already have parallel text for this section, skipping")
        return
    prompt = prompt_template.format(text=section.text)

    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        # Both models were observed to take over 3 minutes per section.
        model="gpt-4-1106-preview",
        # model="gpt-4"
    )

    result = chat_completion.choices[0].message.content
    if result is None:
        raise ValueError("The model returned no text content for this section")

    result = json.loads(_extract_json_payload(result))
    if not isinstance(result, list):
        # Anything other than a list would be stored silently and only break
        # later, when the pairs of all sections are collected.
        raise ValueError(
            f"Expected a JSON list of pairs, got {type(result).__name__}"
        )
    section.parallel_text = result
    return result

In [1]:
from tqdm import tqdm

# Only sections that do not have any parallel text yet are sent to the model,
# so this cell can be re-run to resume an interrupted pass.
sections_to_process = [s for s in sections if not s.parallel_text]

# Takes a few hours, depending on OpenAI response times
for section in tqdm(sections_to_process):
    get_section_parallel_text(section)

In [16]:
import pickle

# Checkpoint the processed sections so the slow extraction step does not have to be repeated
with open("mambai_sections.pickle", "wb") as pickle_file:
    pickle.dump(sections, pickle_file)

# To restore a previous run instead, load the checkpoint back
# (only unpickle files you created yourself):
# with open("mambai_sections.pickle", "rb") as pickle_file:
#     sections = pickle.load(pickle_file)

In [30]:
# Flatten the parallel text of all sections into a single list

all_parallel_text = [
    pair for section in sections for pair in section.parallel_text
]

print(f"Total of {len(all_parallel_text)} parallel text items")

Total of 6670 parallel text items


In [31]:
# Spot-check a few extracted pairs. A dedicated, seeded generator makes the
# sample reproducible without touching the global random state, and the size
# is capped so that a list with fewer than 10 items does not raise ValueError.
random.Random(0).sample(all_parallel_text, min(10, len(all_parallel_text)))

Total of 980 lists of strings


[{'Mambai': 'kapé', 'English': 'coffee'},
 {'Mambai': 'Au hakarak kuartu nor kama kid, nei riu-hati nor sentina.',
  'English': 'I want a single room with bath and toilet.'},
 {'Mambai': 'gme adj', 'English': 'yellow'},
 {'Mambai': 'mro', 'English': 'thirsty'},
 {'Mambai': 'gal n', 'English': 'bag'},
 {'Mambai': 'ble kek', 'English': 'awake'},
 {'Mambai': 'Rom akuza urá dêssáp?',
  'English': 'Have you been accused of something?'},
 {'Mambai': 'pun-klao', 'English': 'to damage; to harm,hurt'},
 {'Mambai': 'id', 'English': 'a'},
 {'Mambai': 'mendai', 'English': 'like this, thus, so'}]

In [35]:
# only keep unique items, as (Mambai, English) tuples in first-seen order
def _pair_field(item, key):
    """Return item[key] as text; a missing key or a null value becomes ""."""
    value = item.get(key)
    return "" if value is None else str(value)


# dict.fromkeys keeps the first occurrence of each tuple and preserves order,
# replacing the quadratic "not in list" scan. Values are converted to text so
# that every tuple is hashable.
unique_parallel_text = list(
    dict.fromkeys(
        (_pair_field(item, "Mambai"), _pair_field(item, "English"))
        for item in all_parallel_text
    )
)

print(f"Total of {len(unique_parallel_text)} unique parallel text items")

# filter out pairs where either side is empty; this now also covers items
# whose "Mambai" or "English" key was missing or null, which used to raise
# KeyError or slip through the filter
unique_parallel_text = [
    pair for pair in unique_parallel_text if pair[0] != "" and pair[1] != ""
]

print(f"Total of {len(unique_parallel_text)} unique parallel text items")

Total of 5170 unique parallel text items
Total of 5164 unique parallel text items


In [36]:
# Save the de-duplicated, non-empty pairs to a CSV file. Writing
# all_parallel_text instead would repeat most pairs, because the
# sliding windows overlap.
import csv

# newline="" is required by the csv module to avoid extra blank rows on some
# platforms; UTF-8 is set explicitly because the Mambai text contains accented
# characters that the platform default encoding may not handle.
with open("mambai_parallel_text.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Mambai", "English"])
    # unique_parallel_text holds (Mambai, English) tuples, matching the header order
    writer.writerows(unique_parallel_text)