In [None]:
import random
import csv
import json
import time
from typing import List, Literal, Optional
from tqdm import tqdm
import spacy
import os

from sacrebleu import corpus_chrf

from dotenv import load_dotenv
load_dotenv()

from dataclasses import dataclass

from translator_utils import Translator, Glossary, Line, LineManager, Message
from gemini_aistudio import GenerativeModel


# --- Run configuration -------------------------------------------------------
# MODE selects the experiment:
#   "post-edit" -> Gemini corrects an existing machine translation
#   "translate" -> Gemini translates the English source directly
# ZERO_SHOT is only consulted when MODE == "translate"; it switches between the
# zero-shot prompt and the few-shot (example pairs) prompt.
MODE: Literal["translate", "post-edit"] = "post-edit"
ZERO_SHOT: bool = False

if MODE == "post-edit":
    # Earlier prompt variant, kept for reference:
    # gemini = GenerativeModel(system_instruction="You are a linguist helping to post-edit translations from English to Tetun. Candidate translations are provided by Google Translate, and you are asked to correct them, if necessary, using examples and glossary entries. Only use examples and glossary entries to correct the translations.")
    system_instruction = "You are an expert translator. I am going to give you relevant glossary entries, and relevant past translations, where the first is the English source, the second is a machine translation of the English to Tetun, and the third is the Tetun reference translation. The sentences will be written English: <sentence> MT: <machine translated sentence> Tetun: <translated sentence>. After the example pairs, I am going to provide another sentence in English and its machine translation, and I want you to translate it into Tetun. Give only the translation, and no extra commentary, formatting, or chattiness. Translate the text from English to Tetun."

elif MODE == "translate":
    if ZERO_SHOT:
        system_instruction = "You are an expert translator. I am going to give you text in English, and would like you to translate it to Tetun. Give only the translation, and no extra commentary, formatting, or chattiness. Translate the text from English to Tetun."
    else:
        system_instruction = "You are an expert translator. I am going to give you some example pairs of text snippets where the first is in English and the second is a translation of the first snippet into Tetun. The sentences will be written English: <first sentence> Tetun: <translated first sentence> After the example pairs, I am going to provide another sentence in English and I want you to translate it into Tetun. Give only the translation, and no extra commentary, formatting, or chattiness. Translate the text from English to Tetun."

else:
    # Without this branch a typo in MODE leaves `system_instruction` unbound and
    # the failure surfaces below as an unrelated-looking NameError.
    raise ValueError(f"Unsupported MODE {MODE!r}; expected 'translate' or 'post-edit'.")

# Model wrapper used by the post-edit path; it receives the system prompt once here.
gemini = GenerativeModel(system_instruction=system_instruction)

# Which MT system's output is read from the `tgt_pred_<name>` attribute of each line.
TRANSLATE_WITH: Literal["google", "madlad", "opusmt"] = "madlad"

In [None]:

# Parallel data: LineManager exposes the train/test split as two attributes.
line_manager = LineManager.load_from_csv()
train_lines = line_manager.train_lines
test_lines = line_manager.test_lines
# Alternative data sources (disabled):
# train_lines = LineManager.load_from_csv(filename='datafiles/tetun_parallel.csv').lines
# test_lines = LineManager.load_from_csv(filename='datafiles/parallel_lines_A2.csv').lines
print(f"Total of {len(train_lines)} train lines and {len(test_lines)} test lines loaded.")

# Translator for the configured MT backend, with its BM25 index built from the train lines
translator = Translator(translate_with=TRANSLATE_WITH)
translator.init_bm25(train_lines)

# Glossary used to look up entries for each input sentence
glossary = Glossary()
glossary.load_entries()

#### Machine translation baseline (e.g. Google Translate, "GT") + Gemini

In [None]:

# Google predictions are generated here; for the other backends the
# `tgt_pred_<backend>` attribute is presumably already set on the loaded
# lines (TODO confirm against LineManager).
if TRANSLATE_WITH == "google":
    for line in test_lines:
        line.tgt_pred_google = translator.translate(line.en)

if MODE == 'post-edit':
    # Attribute holding the raw MT output that Gemini will post-edit.
    pred_key = f'tgt_pred_{TRANSLATE_WITH}'

    # Baseline: score the unedited MT output against the references.
    mt_hypotheses = [getattr(line, pred_key) for line in test_lines]
    references = [line.tgt for line in test_lines]
    mt_chrf = corpus_chrf(mt_hypotheses, [references], word_order=2)
    print(f"CHRF for MT: {mt_chrf.score:.2f}")

In [None]:
import random
import litellm

# Fixed seed so the same few-shot examples are drawn on every run.
FEW_SHOT_SEED = 42
# Maximum number of train lines used as few-shot examples in translate mode.
NUM_FEW_SHOT_EXAMPLES = 10

random.seed(FEW_SHOT_SEED)

# random.sample raises ValueError when asked for more items than the population
# holds, so clamp the sample size for small training sets. With 10 or more
# train lines the drawn sample is unchanged.
train_sample = random.sample(train_lines, min(NUM_FEW_SHOT_EXAMPLES, len(train_lines)))

def format_messages_for_gemini(messages: List[Message]) -> List[dict]:
    """Convert prompt messages into role/parts dicts for ``gemini.generate_content``.

    Messages whose role is ``'system'`` are skipped (the system prompt is
    presumably supplied through the model's ``system_instruction`` instead).
    A ``'user'`` role is kept as ``"user"``; every other role becomes ``"model"``.
    Each message's content is wrapped in a one-element ``"parts"`` list.

    Args:
        messages: Objects exposing ``role`` and ``content`` attributes.

    Returns:
        One ``{"role": ..., "parts": [content]}`` dict per non-system message,
        in the original order.
    """
    formatted: List[dict] = []
    for msg in messages:
        if msg.role == 'system':
            continue
        gemini_role = "user" if msg.role == "user" else "model"
        formatted.append({"role": gemini_role, "parts": [msg.content]})
    return formatted

def get_post_edited_translation_gemini(input_text: str, top_n: int = 10, verbose: bool = False) -> str:
    """Ask Gemini to post-edit the machine translation of ``input_text``.

    Looks up glossary entries for the input, retrieves the ``top_n`` most
    similar sentences via ``translator.get_top_similar_sentences_bm25``, builds
    the post-edit prompt with ``translator.construct_prompt_post_edit`` and
    sends the non-system messages to the module-level ``gemini`` model.

    Args:
        input_text: English source sentence.
        top_n: Number of similar sentences to retrieve for the prompt.
        verbose: When True, print the content of the first prompt message.
            Off by default because this function runs once per test line
            from a thread pool, where the prompt dump floods the cell output.

    Returns:
        The model response with surrounding whitespace stripped (assumes
        ``gemini.generate_content`` returns a string, as the original code did).
    """
    # get glossary entries and similar sentences
    glossary_entries = glossary.get_entries(input_text)
    similar_sentences = translator.get_top_similar_sentences_bm25(input_text, top_n=top_n)

    messages = translator.construct_prompt_post_edit(
        input_text,
        similar_sentences,
        glossary_entries=glossary_entries,
    )

    if verbose and messages:
        print(messages[0].content)

    response = gemini.generate_content(
        format_messages_for_gemini(messages),
    )
    return response.strip()

def get_final_translation_gemini(
    input_text: str,
    model: str = "gemini/gemini-2.0-flash",
    temperature: float = 0.5,
) -> str:
    """Translate ``input_text`` from English to Tetun through litellm.

    The prompt is built by ``translator.construct_prompt_translation`` from the
    module-level ``system_instruction`` and, unless ``ZERO_SHOT`` is set, the
    ``train_sample`` few-shot examples.

    Args:
        input_text: English source sentence.
        model: litellm model identifier.
        temperature: Sampling temperature passed to ``litellm.completion``.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped, matching what the post-edit path returns.

    Raises:
        ValueError: If the response carries no text content. Raising here keeps
            ``None`` out of the stored predictions, where it would only fail
            later during scoring.
    """
    few_shot_examples = [] if ZERO_SHOT else train_sample
    messages = translator.construct_prompt_translation(
        input_text,
        few_shot_examples,
        system_instruction=system_instruction,
    )

    response = litellm.completion(
        model=model,
        messages=[m.to_dict() for m in messages],
        temperature=temperature,
    )

    content = response.choices[0].message.content
    if content is None:
        raise ValueError(f"Model returned no text content for input: {input_text!r}")
    return content.strip()

In [None]:
# Single-sentence smoke test of the post-edit path before the full batch run.
# Translate-mode equivalent (disabled):
# print(f'src: {test_lines[1].en}')
# print(f'pred: {get_final_translation_gemini(test_lines[1].en)}')
demo_sentence = "Always check burn again a couple of hours after first assessment unless burn has been dressed"
demo_output = get_post_edited_translation_gemini(demo_sentence)
print(demo_output)

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

# Optional cap on how many test lines are processed; a falsy value (None or 0)
# means "process every test line".
MAX_LINES = None

if MODE == 'post-edit':
    def process_line(line):
        """Set ``line.tgt_pred_post_edited`` unless it already holds a truthy value.

        Lines that already have a prediction are skipped, so the cell can be
        re-run to retry only the lines that failed. Returns the same line.
        """
        if not getattr(line, 'tgt_pred_post_edited', None):
            line.tgt_pred_post_edited = get_post_edited_translation_gemini(line.en)
        return line
elif MODE == 'translate':
    def process_line(line):
        """Set ``line.tgt_pred_gemini`` unless it already holds a truthy value.

        Lines that already have a prediction are skipped, so the cell can be
        re-run to retry only the lines that failed. Returns the same line.
        """
        if not getattr(line, 'tgt_pred_gemini', None):
            line.tgt_pred_gemini = get_final_translation_gemini(line.en)
        return line

# Same selection as the former if/else: a falsy MAX_LINES means no limit.
lines_to_process = test_lines[:MAX_LINES] if MAX_LINES else test_lines

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(process_line, line) for line in lines_to_process]

    # The bar total must be the number of submitted futures; len(test_lines)
    # is wrong whenever MAX_LINES truncates the work and the bar never fills.
    for future in tqdm(as_completed(futures), total=len(futures)):
        try:
            future.result()
        except Exception as e:
            # Best effort: report the failure and keep going with other lines.
            print(f"Error processing line: {e}")

In [None]:

def chrf_for_key(key: str, lines: Optional[list] = None):
    """Corpus-level chrF (``word_order=2``) of one prediction attribute.

    Args:
        key: Name of the attribute on each line holding the hypothesis,
            e.g. ``'tgt_pred_post_edited'``.
        lines: Lines to score. Defaults to every line in ``test_lines``,
            which is what the function scored before this parameter existed.

    Returns:
        The ``score`` of the ``corpus_chrf`` result, using each line's ``tgt``
        as the single reference.
    """
    if lines is None:
        lines = test_lines
    chrf = corpus_chrf(
        [getattr(line, key) for line in lines],
        [[line.tgt for line in lines]],
        word_order=2,
    )
    return chrf.score


def _lines_with_prediction(key: str) -> list:
    """Return the test lines whose ``key`` attribute holds a truthy prediction.

    Uses the same truthiness test as the batch cell's ``process_line`` guard,
    so lines skipped by ``MAX_LINES`` or lost to a failed request are left out
    of scoring instead of crashing it. Prints a note when lines are left out.

    Raises:
        ValueError: If no test line has a prediction under ``key``.
    """
    scored = [line for line in test_lines if getattr(line, key, None)]
    if not scored:
        raise ValueError(f"No test line has a '{key}' prediction; run the translation cell first.")
    skipped = len(test_lines) - len(scored)
    if skipped:
        print(f"Note: scoring {len(scored)} of {len(test_lines)} test lines; {skipped} have no '{key}' prediction.")
    return scored


if MODE == 'post-edit':
    # Score MT and APE on the same lines so the two numbers are comparable.
    scored_lines = _lines_with_prediction('tgt_pred_post_edited')

    mt_chrf = chrf_for_key(pred_key, scored_lines)
    print(f"CHRF for MT: {mt_chrf:.2f}")

    ape_chrf = chrf_for_key('tgt_pred_post_edited', scored_lines)
    print(f"CHRF for APE: {ape_chrf:.2f}")
elif MODE == 'translate':
    scored_lines = _lines_with_prediction('tgt_pred_gemini')

    gemini_chrf = chrf_for_key('tgt_pred_gemini', scored_lines)
    print(f"CHRF for Gemini: {gemini_chrf:.2f}")