# Prompt tuning for translating English > Mambai

Translate an English sentence to Mambai by:

1. Find closest sentences using TF-IDF / semantic embeddings / both
2. Find dictionary entries for words in sentence
3. Construct prompt, with a mix of example sentences and dict entries

TODO:

- Clean up Mambai corpus
  - Some dict entries are missing because extraction relies on font weight, which is not always OCRed correctly
    - others need to be separated (e.g. "sit; live")
  - Some sentences poorly aligned
- Get similar sentences based on syntactic similarity, instead of `get_sentences_starting_with_same_words`


In [None]:
import dotenv

dotenv.load_dotenv()

### Get Mambai corpus, split between sentences and dict entries


In [None]:
import csv
import json
import random

# Load the test sentences and the full parallel corpus.
# The files are opened explicitly as UTF-8 so accented characters decode the
# same way on every platform instead of depending on the locale default.
# NOTE(review): assumes both files are UTF-8 encoded; confirm.
with open("test_leo.json", encoding="utf-8") as f:
    test_data = json.load(f)
print(f"For use in the test set, we have {len(test_data)} sentences.")

# newline="" lets the csv module do its own line-ending handling, which keeps
# quoted fields that contain line breaks intact.
with open("mambai_parallel_eng_mgm.csv", encoding="utf-8", newline="") as f:
    reader = csv.DictReader(f)
    data = list(reader)


print(f"Total of {len(data)} rows in the dataset.")

train_data = [r for r in data if r["split"] == "train"]

print(f"Total of {len(train_data)} rows in the training set.")


# Average number of whitespace-separated words per sentence in the data set.

avg_words_mgm = sum(len(r["Mambai (mgm)"].split()) for r in data) / len(data)
avg_words_eng = sum(len(r["English (eng)"].split()) for r in data) / len(data)

print(f"Average number of Mambai words per sentence: {avg_words_mgm:.2f}")
print(f"Average number of English words per sentence: {avg_words_eng:.2f}")

In [None]:
# Restrict the corpus to rows whose source is the Mambai Language Manual.

print(f"Total of {len(data)} rows in the dataset.")
data = list(
    filter(lambda d: d["source"].startswith("Mambai Language Manual"), data)
)
print(
    f"Total of {len(data)} rows in the dataset after filtering for the Mambai Language Manual."
)


# Count how many of the remaining rows belong to the dev or test split.
dev_or_test = [d for d in data if d["split"] == "dev" or d["split"] == "test"]
print(f"Total of {len(dev_or_test)} rows in the dev or test split.")

In [None]:
# TMP: use all data as train_data
# NOTE(review): this replaces the earlier `split == "train"` selection with
# every remaining row, including dev/test rows. If any sentence in test_data
# also occurs in `data`, retrieval can surface its reference translation in
# the prompt -- confirm there is no overlap before reporting scores.

train_data = data

In [None]:
from unidecode import unidecode

# Column names of the parallel-text fields that get normalized.
keys = ["Mambai (mgm)", "English (eng)"]


def make_lowercase_and_remove_accents(s):
    """Lowercase the first character of *s* and transliterate it to ASCII.

    Only the first character is lowercased; the rest of the string keeps its
    case. The result is then passed through ``unidecode``.

    Args:
        s: Text to normalize.

    Returns:
        The normalized text. Empty (or ``None``) input yields ``""`` instead
        of raising on ``s[0]``, so an empty model output does not abort a
        translation run.
    """
    if not s:
        return ""
    return unidecode(s[0].lower() + s[1:])


def make_lowercase_and_remove_accents_for_keys(row, keys):
    """Normalize, in place, the values of *row* stored under each of *keys*.

    Each selected value is replaced by the result of
    ``make_lowercase_and_remove_accents``. Nothing is returned.
    """
    for column in keys:
        original_text = row[column]
        row[column] = make_lowercase_and_remove_accents(original_text)


# Normalize the text columns of the training rows first, then the test rows.
for dataset in (train_data, test_data):
    for row in dataset:
        make_lowercase_and_remove_accents_for_keys(row, keys)

# Show a few normalized training rows as a sanity check.
random.sample(train_data, 5)

In [None]:
import json

# Load the English > Mambai dictionary entries.
# Opened explicitly as UTF-8 so decoding does not depend on the locale default.
# NOTE(review): assumes the file is UTF-8 encoded; confirm.
with open("eng_mgm.json", encoding="utf-8") as f:
    dict_entries = json.load(f)

print(f"Total of {len(dict_entries)} entries in the English > Mambai dictionary.")

# Show a couple of entries as a sanity check.
random.sample(dict_entries, 2)

In [None]:
# experiment tracking in https://docs.google.com/spreadsheets/d/1wP0tDiPqmS8UWNiyY4oSAZn3Mzw2Z4FTZZOQ5lf-NHQ/edit#gid=0
# NOTE(review): get_gpt4_translation hard-codes model="gpt-4-turbo-preview";
# "model" here only selects the backend (via startswith("gpt-4") in
# translate_data), so the value recorded here can differ from the model called.

config = {
    # When truthy, get_prompt returns the fixed baseline prompt with no retrieval.
    "baseline": False,
    "model": "gpt-4",
    "train_rows": len(train_data),
    # Number of example sentences to retrieve with each method.
    "retrieval_sentences": {"tfidf": 5, "semantic_laser": 5},
    # Whether get_prompt adds matching dictionary entries to the prompt.
    "retrieval_dict": True,
}

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


def find_top_k_tfidf(sentence, eng_sentences, top_k=5):
    """Return the ``top_k`` training rows most TF-IDF-similar to *sentence*.

    A TF-IDF model is fitted on *sentence* plus *eng_sentences* on every call.

    Args:
        sentence: The English query sentence.
        eng_sentences: English sentences to compare against. Must be in the
            same order as the module-level ``train_data``, because the rows
            returned are looked up there by position.
        top_k: Maximum number of rows to return.

    Returns:
        Rows of ``train_data`` ordered from most to least similar. Empty when
        ``top_k`` is not positive.
    """
    # A slice of [-0:] would select every row, so handle this case explicitly.
    if top_k <= 0:
        return []

    # The query goes first so that it shares one vocabulary with the corpus.
    documents = [sentence, *eng_sentences]
    tfidf_matrix = TfidfVectorizer().fit_transform(documents)

    # TfidfVectorizer L2-normalizes rows by default, so the dot product from
    # linear_kernel is the cosine similarity. Row 0 (the query) is compared
    # only against the corpus rows, which keeps indices aligned with train_data.
    similarities = linear_kernel(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    # argsort is ascending: take the last top_k and reverse for best-first.
    top_k_indices = similarities.argsort()[-top_k:][::-1]

    return [train_data[index] for index in top_k_indices]


# Example usage:
# NOTE: `eng_sentences` is not just demo state -- get_prompt reads this
# module-level list when it calls find_top_k_tfidf.
eng_sentences = [row["English (eng)"] for row in train_data]
input_sentence = "we will be sitting there having coffee"

# Retrieve as many sentences as the experiment config asks for.
num_to_retrieve = config["retrieval_sentences"]["tfidf"]
top_tfidf = find_top_k_tfidf(input_sentence, eng_sentences, num_to_retrieve)

print(f"Top {num_to_retrieve} similar sentences for '{input_sentence}'")
for sentence in top_tfidf:
    print(sentence["English (eng)"])

### Get LASER encoder, encode English sentences from Mambai corpus


In [None]:
from laser_encoders import LaserEncoderPipeline

# LASER sentence encoder configured for English (Latin script).
encoder = LaserEncoderPipeline(lang="eng_Latn")

# One embedding per training row, in train_data order; find_top_k_semantic_laser
# relies on that ordering to map similarity indices back to rows.
embeddings = encoder.encode_sentences([row["English (eng)"] for row in train_data])

### Construct prompt


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

import spacy

# spaCy English pipeline; used by get_relevant_dict_entries to lemmatize input.
nlp = spacy.load("en_core_web_sm")


def find_top_k_semantic_laser(input, top_k=5):
    """Return the ``top_k`` training rows closest to *input* in LASER space.

    Args:
        input: The English query sentence.
        top_k: Maximum number of rows to return.

    Returns:
        Rows of the module-level ``train_data`` ordered from most to least
        similar, using cosine similarity against the precomputed
        ``embeddings``. Empty when ``top_k`` is not positive.
    """
    # A slice of [-0:] would select every row, so handle this case explicitly.
    if top_k <= 0:
        return []

    embedded_input = encoder.encode_sentences([input])
    similarities = cosine_similarity(embedded_input, embeddings)[0]
    # argsort is ascending: take the last top_k and reverse for best-first.
    closest_indices = similarities.argsort()[-top_k:][::-1]
    return [train_data[i] for i in closest_indices]


def get_sentences_starting_with_same_words(input):
    """Yield training rows whose English text starts with the same words.

    The first two whitespace-separated words of *input* are compared (just the
    first one if *input* has a single word) against the leading words of each
    row of the module-level ``train_data``.

    Comparison is done on whole words, so ``"we will"`` does not match a
    sentence starting with ``"we willingly"``. Blank input yields nothing; a
    plain string-prefix test would match every row in that case.

    Args:
        input: The English query sentence.

    Yields:
        Matching rows of ``train_data`` in corpus order.
    """
    leading_words = input.split()[:2]
    if not leading_words:
        return
    word_count = len(leading_words)
    for row in train_data:
        if row["English (eng)"].split()[:word_count] == leading_words:
            yield row


def get_relevant_dict_entries(sent):
    """Yield dictionary entries whose headword is a lemma of a word in *sent*.

    *sent* is lemmatized with the module-level spaCy pipeline ``nlp`` and each
    distinct lemma is matched exactly against the ``"entry"`` field of the
    module-level ``dict_entries``.

    A lemma that occurs several times in the sentence contributes its entries
    only once, so the prompt does not repeat the same entry.

    Args:
        sent: The English sentence to look up.

    Yields:
        Matching rows of ``dict_entries``, ordered by first occurrence of the
        lemma in the sentence and then by dictionary order.
    """
    if not sent or not sent.strip():
        return

    doc = nlp(sent)
    # dict.fromkeys removes repeated lemmas while preserving sentence order.
    unique_lemmas = dict.fromkeys(token.lemma_ for token in doc)

    # Group the dictionary by headword once rather than rescanning it per lemma.
    entries_by_headword = {}
    for row in dict_entries:
        entries_by_headword.setdefault(row["entry"], []).append(row)

    for lemma in unique_lemmas:
        yield from entries_by_headword.get(lemma, ())

In [None]:
# Retrieval prompt. format_prompt fills {sentences_section} and {dict_section}
# (each is either empty or a titled section ending in blank lines) and {input}
# (the English sentence to translate).
prompt_template = """You are a translator for the Mambai language, originally from Timor-Leste.

{sentences_section}{dict_section}Please provide the translation for the following sentence. Do not provide any explanations or text apart from the translation.

English: {input}
Mambai:"""

# Baseline prompt used by get_prompt when config["baseline"] is truthy: one
# fixed example pair and no retrieved sentences or dictionary entries.
baseline_prompt = """You are a translator for the Mambai language, originally from Timor-Leste.

English: Whom have you told about his imprisonment?
Mambai: It tou It kaben ni problema lao sen?

Please provide the translation for the following sentence. Do not provide any explanations or text apart from the translation.

English: {input}
Mambai:"""


def format_prompt(sentences_str, dict_str, input):
    """Fill ``prompt_template`` with the optional sections and the input.

    A section is emitted (with its ``###`` title) only when its text is
    non-empty; otherwise it contributes nothing to the prompt.
    """
    sentences_section = ""
    if sentences_str:
        sentences_section = f"### Example sentences ###\n{sentences_str}\n\n"

    dict_section = ""
    if dict_str:
        dict_section = f"### Dictionary entries ###\n{dict_str}\n\n"

    return prompt_template.format(
        input=input,
        sentences_section=sentences_section,
        dict_section=dict_section,
    )


def get_sentences_str(rows):
    """Render parallel rows as ``English:`` / ``Mambai:`` pairs.

    Each row becomes two labelled lines followed by a blank line; the result
    is the concatenation of those chunks (an empty string for no rows).
    """
    return "".join(
        f"English: {row['English (eng)']}\nMambai: {row['Mambai (mgm)']}\n\n"
        for row in rows
    )


def get_dict_str(dict_entries):
    """Render dictionary rows as ``English:`` / ``Mambai:`` pairs.

    The ``entry`` field is shown as the English side and ``definition`` as
    the Mambai side; each pair is followed by a blank line.
    """
    return "".join(
        f"English: {row['entry']}\nMambai: {row['definition']}\n\n"
        for row in dict_entries
    )


def get_prompt(input):
    """Build the translation prompt for *input* according to ``config``.

    With ``config["baseline"]`` truthy, the fixed ``baseline_prompt`` is
    returned. Otherwise example sentences are retrieved with each method
    listed in ``config["retrieval_sentences"]`` (TF-IDF first, then LASER),
    and dictionary entries are added when ``config["retrieval_dict"]`` is
    truthy.

    A row found by both retrieval methods is included only once, at the
    position of its first occurrence.

    Args:
        input: The English sentence to translate.

    Returns:
        The complete prompt string.
    """
    if config.get("baseline"):
        return baseline_prompt.format(input=input)

    retrieval = config["retrieval_sentences"]
    candidates = []
    if "tfidf" in retrieval:
        candidates.extend(find_top_k_tfidf(input, eng_sentences, retrieval["tfidf"]))
    if "semantic_laser" in retrieval:
        candidates.extend(
            find_top_k_semantic_laser(input, retrieval["semantic_laser"])
        )

    # Both retrievers return row objects taken from train_data, so identity is
    # enough to detect a row that was retrieved twice (rows are unhashable dicts).
    sentences = []
    seen_row_ids = set()
    for row in candidates:
        if id(row) not in seen_row_ids:
            seen_row_ids.add(id(row))
            sentences.append(row)

    if config["retrieval_dict"]:
        relevant_entries = list(get_relevant_dict_entries(input))
    else:
        relevant_entries = []

    return format_prompt(
        sentences_str=get_sentences_str(sentences),
        dict_str=get_dict_str(relevant_entries),
        input=input,
    )

In [None]:
# Inspect the full prompt produced for a sample sentence.
print(get_prompt("we will be sitting there having coffee"))

In [None]:
import os
from openai import AsyncOpenAI
import replicate
import re

# nltk: sent
from nltk.tokenize import sent_tokenize

# Async OpenAI client shared by get_gpt4_translation; the key is read from the
# OPENAI_API_KEY environment variable.
openai_client = AsyncOpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)


# assumes REPLICATE_API_TOKEN is set in env
def get_replicate_translation(
    sentence, model="mistralai/mixtral-8x7b-instruct-v0.1"
) -> str:
    """Translate *sentence* with a Replicate-hosted model and clean the output.

    The prompt from ``get_prompt`` is sent to *model*; the streamed pieces are
    joined and reduced to the first sentence, since the model tends to append
    explanations despite the prompt. A parenthesised ``(note: ...)`` is then
    removed, and only the text after the last ``"Mambai: "`` label is kept.

    Returns:
        The cleaned translation, or ``""`` when the model produced no text.
    """
    chunks = replicate.run(model, input={"prompt": get_prompt(sentence)})
    raw_text = "".join(chunks).strip()

    tokenized = sent_tokenize(raw_text)
    if not tokenized:
        return ""

    # Keep only the first sentence; anything after it is treated as commentary.
    first_sentence = tokenized[0]
    # Drop a trailing "(note: ...)" remark if the model added one.
    without_note = re.sub(r"\([nN]ote:.*\)", "", first_sentence).strip()
    # If the model echoed the "Mambai: " label, keep what follows it.
    return without_note.split("Mambai: ")[-1]


async def get_gpt4_translation(sentence, model="gpt-4-turbo-preview") -> str:
    """Translate *sentence* with an OpenAI chat model.

    Args:
        sentence: The English sentence to translate.
        model: OpenAI model name. Defaults to the model that was previously
            hard-coded here, so existing callers are unaffected.

    Returns:
        The model's reply with any echoed ``"Mambai: "`` label removed, or
        ``""`` when the reply has no text content.
    """
    chat_completion = await openai_client.chat.completions.create(
        messages=[{"role": "user", "content": get_prompt(sentence)}],
        model=model,
    )
    # The message content can be None (e.g. a refusal or filtered reply);
    # treat that as an empty translation instead of failing on .split().
    translation = chat_completion.choices[0].message.content or ""
    # if "Mambai: " in translation, keep the part of the string after it
    return translation.split("Mambai: ")[-1]

In [None]:
# Rows of the (manual-filtered) corpus that belong to the "dev" split.
dev_data = [r for r in data if r["split"] == "dev"]
print(f"Total of {len(dev_data)} rows in the validation set.")

In [None]:
import asyncio


async def process_batch_gpt4(sentences):
    """Translate *sentences* concurrently with ``get_gpt4_translation``.

    Returns the translations in the same order as *sentences*.
    """
    pending = []
    for sentence in sentences:
        pending.append(get_gpt4_translation(sentence))
    results = await asyncio.gather(*pending)
    return results


async def process_batch_replicate(sentences, model):
    """Translate *sentences* concurrently with ``get_replicate_translation``.

    The Replicate call is synchronous, so each one is handed to the running
    loop's default executor to run in a separate thread. Results come back in
    the same order as *sentences*.
    """
    running_loop = asyncio.get_running_loop()
    futures = []
    for sentence in sentences:
        future = running_loop.run_in_executor(
            None, get_replicate_translation, sentence, model
        )
        futures.append(future)
    results = await asyncio.gather(*futures)
    return results


async def translate_data(dev_data):
    """Translate every row of *dev_data* in place, in batches of 10.

    The backend is chosen from ``config["model"]``: names starting with
    ``"gpt-4"`` use OpenAI, while ``"mixtral"`` and ``"llama"`` use Replicate.
    Each row receives its normalized translation under ``"mgm_translation"``.
    A batch whose rows all have that key already is skipped, so an
    interrupted run can be resumed.

    Args:
        dev_data: List of row dicts with an ``"English (eng)"`` field.

    Raises:
        ValueError: If ``config["model"]`` names no supported backend. Without
            this check ``translations`` would be unbound, or silently reused
            from the previous batch.
    """
    batch_size = 10
    # Replicate model identifiers for the short names accepted in config.
    replicate_models = {
        "mixtral": "mistralai/mixtral-8x7b-instruct-v0.1",
        "llama": "meta/llama-2-70b-chat",
    }
    for i in range(0, len(dev_data), batch_size):
        batch = dev_data[i : i + batch_size]
        # Report the real end of the batch; the last one may be shorter.
        print(f"Processing batch {i+1} to {i+len(batch)}")
        if all("mgm_translation" in row for row in batch):
            print("All batch elements already have the 'mgm_translation' key, skipping")
            continue

        model_name = config["model"]
        sentences = [row["English (eng)"] for row in batch]
        if model_name.startswith("gpt-4"):
            translations = await process_batch_gpt4(sentences)
        elif model_name in replicate_models:
            translations = await process_batch_replicate(
                sentences, replicate_models[model_name]
            )
        else:
            raise ValueError(f"Unsupported config['model']: {model_name!r}")

        # for each row, add the translation under key 'mgm_translation'
        for row, translation in zip(batch, translations):
            row["mgm_translation"] = make_lowercase_and_remove_accents(translation)

In [None]:
# Function to run asyncio tasks from Jupyter Notebook
async def run(coroutine):
    """Await *coroutine* and return its result.

    This is itself a coroutine function, so an event loop is always running
    by the time its body executes. The former "create a new loop if none is
    running" fallback could never trigger and has been removed.
    """
    return await coroutine


# Translate the test sentences in place (adds "mgm_translation" to each row).
# NOTE(review): top-level `await` presumably relies on the notebook's event
# loop; it is not valid in a plain Python script.
await run(translate_data(test_data))

In [None]:
import evaluate

chrf = evaluate.load("chrf")
bleu = evaluate.load("bleu")

# Model outputs and gold translations; each prediction has exactly one
# reference, wrapped in its own list.
predictions = [row["mgm_translation"] for row in test_data]
references = [[row["Mambai (mgm)"]] for row in test_data]

# Calculate metrics
bleu_results = bleu.compute(predictions=predictions, references=references)
chrf_results = chrf.compute(predictions=predictions, references=references)
# The same chrF metric with word_order=2 is reported below as chrF++.
chrfpp_results = chrf.compute(
    predictions=predictions, references=references, word_order=2
)

print(f"BLEU score: {bleu_results['bleu']}")
print(f"ChrF score: {chrf_results['score']}")
print(f"Chrf++ score: {chrfpp_results['score']}")

In [None]:
# valid set
# NOTE(review): despite the name and the printed label, this selects rows with
# split == "test" (dev_data above selects "dev" and prints the same
# "validation set" label) -- confirm which split is intended.

valid_data = [row for row in data if row["split"] == "test"]
print(f"Total of {len(valid_data)} rows in the validation set.")

# Show a couple of rows as a sanity check.
random.sample(valid_data, 2)