In [14]:
import unicodedata
from datasets import load_dataset
import re
import ollama
import torch
import json
from tqdm import tqdm
from pathlib import Path

In [15]:
def normalize_text(text):
    """Return ``text`` in a canonical form for corpus cleaning.

    Steps, applied in this order:
      1. Unicode NFC normalization.
      2. Every run of whitespace becomes a single space; both ends are trimmed.
      3. The space in front of a comma or a period is removed.
    """
    composed = unicodedata.normalize("NFC", text)
    cleaned = re.sub(r"\s+", " ", composed).strip()
    # Re-attach punctuation that was separated from the preceding word.
    for detached, attached in ((" ,", ","), (" .", ".")):
        cleaned = cleaned.replace(detached, attached)
    return cleaned

def clean_pair(src, tgt, min_words=2, max_words=150, max_ratio=3.0):
    """Filter a sentence pair by length and by source/target length ratio.

    Lengths are counted in whitespace-separated tokens.

    Args:
        src: Source-side sentence.
        tgt: Target-side sentence.
        min_words: Minimum token count required on each side.
        max_words: Maximum token count allowed on each side.
        max_ratio: Largest allowed ratio between the two lengths, in either
            direction (the pair is dropped when src/tgt exceeds ``max_ratio``
            or falls below ``1 / max_ratio``).

    Returns:
        The ``(src, tgt)`` tuple when the pair passes every filter,
        otherwise ``None``.
    """
    # Tokenize each side once instead of re-splitting for every check.
    n_src = len(src.split())
    n_tgt = len(tgt.split())

    if min(n_src, n_tgt) < min_words or max(n_src, n_tgt) > max_words:
        return None

    # max(..., 1) keeps the division safe when a caller passes min_words=0.
    ratio = n_src / max(n_tgt, 1)
    if ratio > max_ratio or ratio < 1 / max_ratio:
        return None

    return src, tgt

def dump_jsonl(path, data):
    """Write ``data`` to ``path`` in JSON Lines format.

    The file is opened for writing as UTF-8 (replacing existing content).
    Each element of ``data`` becomes one line; non-ASCII characters are
    written as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(path, "w", encoding="utf8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )

def prepare_wmt16(output_root="data/raw"):
    """Export cleaned WMT16 ro-en sentence pairs as JSONL, one file per split.

    For each of the ``train``, ``validation`` and ``test`` splits, the English
    side is used as ``src`` and the Romanian side as ``tgt``. Both sides go
    through ``normalize_text``; pairs rejected by ``clean_pair`` and exact
    duplicates within the same split are dropped. The survivors are written
    to ``{output_root}/{split}.jsonl`` and the kept count is printed.

    Args:
        output_root: Directory for the output files; created if missing.
    """
    Path(output_root).mkdir(parents=True, exist_ok=True)

    corpus = load_dataset("wmt16", "ro-en")

    for split in ("train", "validation", "test"):
        already_kept = set()  # (src, tgt) pairs seen so far in this split
        records = []

        for example in tqdm(corpus[split]):
            translation = example["translation"]
            src = normalize_text(translation["en"])
            tgt = normalize_text(translation["ro"])

            # Skip pairs that fail the length / ratio filters.
            if not clean_pair(src, tgt):
                continue

            # Skip exact duplicates.
            pair = (src, tgt)
            if pair in already_kept:
                continue
            already_kept.add(pair)

            records.append({"src": src, "tgt": tgt})

        dump_jsonl(f"{output_root}/{split}.jsonl", records)
        print(f"> Saved {len(records)} items for split={split}")

In [3]:
# Write data/raw/{train,validation,test}.jsonl using the default output_root.
prepare_wmt16()

100%|██████████| 610320/610320 [00:20<00:00, 29085.59it/s]


> Saved 596819 items for split=train


100%|██████████| 1999/1999 [00:00<00:00, 28667.99it/s]


> Saved 1996 items for split=validation


100%|██████████| 1999/1999 [00:00<00:00, 28906.59it/s]

> Saved 1994 items for split=test





## Baseline Translation Using an Open LLM

In [57]:
# Torch device used by `translate`: CUDA when available, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device: ", device)

# Prompt sent to the LLM; `{src}` is filled in with the English sentence.
PROMPT_TEMPLATE = """
Translate the following English sentence into natural Romanian.
Return ONLY the final translation. Do NOT include chain-of-thought or explanations.

Sentence:
{src}
"""

# Model identifiers passed as `model_name` to the Ollama helpers.
# NOTE(review): only GEMMA3 is used by a visible cell; the others are
# presumably Ollama tags for alternative baselines — confirm.
LLAMA_3_LATEST = "llama3:latest"
QWEN_3 = "qwen3:4b"
DEEP_SEEK_R1 = "deepseek-r1:8b"
GPT_OS_20 = "gpt-oss:20b"
GEMMA3 = "gemma3:4b"


Using device:  cpu


In [61]:
def load_jsonl(path):
    """Lazily yield one decoded JSON value per non-blank line of ``path``.

    The file is read as UTF-8. Lines that are empty or contain only
    whitespace (for example a trailing blank line) are skipped instead of
    being handed to ``json.loads``, which would raise on them.

    Args:
        path: Location of the JSON Lines file.

    Yields:
        The object decoded from each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            if not line.strip():
                continue
            yield json.loads(line)

def translate(model, tokenizer, text):
    """Generate a continuation of ``text`` with ``model`` and decode it.

    ``text`` is tokenized to PyTorch tensors and moved to the module-level
    ``device``. Generation runs without gradient tracking, samples with
    temperature 0.7 and produces at most 200 new tokens. The first returned
    sequence is decoded with special tokens removed.

    NOTE(review): for decoder-only models the generated sequence presumably
    starts with the prompt tokens, so the returned string would contain
    ``text`` as well — confirm before using this for evaluation.
    """
    encoded = tokenizer(text, return_tensors="pt").to(device)
    sampling = dict(max_new_tokens=200, temperature=0.7, do_sample=True)
    with torch.no_grad():
        generated = model.generate(**encoded, **sampling)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def save_jsonl(path, items):
    """Serialize ``items`` to ``path`` as JSON Lines (UTF-8, one value per line).

    Existing file content is replaced. Non-ASCII characters are kept
    verbatim instead of being escaped.
    """
    with open(path, "w", encoding="utf8") as out:
        for entry in items:
            encoded = json.dumps(entry, ensure_ascii=False)
            out.write(f"{encoded}\n")

def translate_text(model_name, src):
    """Ask an Ollama-served model to translate ``src`` and return its reply.

    ``src`` is inserted into ``PROMPT_TEMPLATE`` and sent as a single user
    message via ``ollama.chat`` with the option ``num_predict=200``.

    Args:
        model_name: Model identifier passed to ``ollama.chat``.
        src: Sentence to translate.

    Returns:
        The ``content`` field of the response message, unmodified.
    """
    user_message = {"role": "user", "content": PROMPT_TEMPLATE.format(src=src)}
    reply = ollama.chat(
        model=model_name,
        messages=[user_message],
        options={"num_predict": 200},
    )
    return reply["message"]["content"]

def batch_translate_ollama(input_path, output_path, model_name="llama3:latest"):
    """Translate every record of a JSONL file with an Ollama model.

    Each input record must provide ``src`` and ``tgt``. The output record
    keeps ``src``, stores the input ``tgt`` under ``ref`` and the model reply
    under ``mt``. All results are collected in memory and written to
    ``output_path`` once the whole input has been processed.

    Args:
        input_path: JSONL file to read.
        output_path: JSONL file to write; its parent directory is created.
        model_name: Model identifier forwarded to ``translate_text``.
    """
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    def _translated_records():
        for entry in tqdm(load_jsonl(input_path)):
            hypothesis = translate_text(model_name, entry["src"])
            yield {"src": entry["src"], "ref": entry["tgt"], "mt": hypothesis}

    results = list(_translated_records())

    save_jsonl(output_path, results)
    print(f"> saved {len(results)} translations to {output_path}")

In [62]:
# Baseline run: translate the prepared test split with the GEMMA3 model.
batch_translate_ollama(
    input_path="data/raw/test.jsonl",
    output_path="experiments/baseline/GEMMA_3_translations.jsonl",
    model_name=GEMMA3,
)

1994it [15:49,  2.10it/s]

> saved 1994 translations to experiments/baseline/GEMMA_3_translations.jsonl





In [63]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import math

# Device string used by the perplexity scorer: CUDA when available, else CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

class MGPTPerplexityScorer:
    """Scores text by its perplexity under a causal language model.

    The tokenizer and model are loaded with ``from_pretrained`` and the model
    is moved to ``DEVICE``.
    """

    def __init__(self, model_name="ai-forever/mGPT"):
        """Load the tokenizer and causal LM identified by ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE)

    def sentence_ppl(self, text):
        """Return ``exp(loss)`` for ``text``, using its own tokens as labels.

        Args:
            text: The sentence to score.

        Returns:
            The exponential of the loss reported by the model, as a float.
            ``float("nan")`` when ``text`` tokenizes to fewer than two
            tokens: there is then no next-token target to score, so the
            model is not called at all.
        """
        enc = self.tokenizer(text, return_tensors="pt")
        # Empty or single-token input (e.g. an empty model reply) has no
        # defined perplexity; report NaN instead of running the model on it.
        if enc["input_ids"].shape[-1] < 2:
            return float("nan")

        enc = enc.to(DEVICE)
        with torch.no_grad():
            loss = self.model(**enc, labels=enc["input_ids"]).loss
        return math.exp(loss.item())

def compute_tsr_spans(sentence, cognates=None):
    """Placeholder heuristic: flag words that appear in a small cognate list.

    The sentence is split on whitespace. Each token is stripped of leading
    and trailing non-word characters and lower-cased before lookup, so that
    a token such as ``"important."`` still matches ``"important"``.

    Args:
        sentence: Text to inspect.
        cognates: Optional collection of lower-case words to flag. Defaults
            to a small built-in set.

    Returns:
        A ``(spans, tsr)`` tuple. ``spans`` is a list of ``[i, i + 1]``
        token-index pairs, one per flagged token; ``tsr`` is the number of
        flagged tokens divided by the token count (``0.0`` for an empty
        sentence).
    """
    if cognates is None:
        # naive heuristic: English cognates or obvious calques
        cognates = {"informativ", "accident", "activitate", "popular", "important"}

    words = sentence.split()
    spans = []
    for i, w in enumerate(words):
        # Drop punctuation glued to either edge of the token before matching.
        core = re.sub(r"^\W+|\W+$", "", w).lower()
        if core in cognates:
            spans.append([i, i + 1])

    # max(..., 1) avoids dividing by zero for an empty sentence.
    tsr = len(spans) / max(len(words), 1)
    return spans, tsr

In [64]:
import json
from tqdm import tqdm
from pathlib import Path

def load_jsonl(path):
    """Lazily yield one decoded JSON value per non-blank line of ``path``.

    This redefines the identical ``load_jsonl`` from the earlier translation
    cell. The file is read as UTF-8. Lines that are empty or contain only
    whitespace (for example a trailing blank line) are skipped instead of
    being handed to ``json.loads``, which would raise on them.

    Args:
        path: Location of the JSON Lines file.

    Yields:
        The object decoded from each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            if not line.strip():
                continue
            yield json.loads(line)

def run_analysis(in_path, out_path, scorer=None):
    """Annotate each translation in a JSONL file with perplexity and TSR data.

    Every input record must provide ``src``, ``ref`` and ``mt``. For each
    one, ``mt`` is scored with ``scorer.sentence_ppl`` and passed to
    ``compute_tsr_spans``; the enriched record is written to ``out_path``
    immediately, one JSON object per line.

    Args:
        in_path: JSONL file with translation records.
        out_path: JSONL file to write; its parent directory is created.
        scorer: Optional object exposing ``sentence_ppl(text)``. When omitted
            a new ``MGPTPerplexityScorer`` is built, which loads the model;
            pass an existing scorer to avoid reloading it when analysing
            several files.
    """
    if scorer is None:
        scorer = MGPTPerplexityScorer()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

    with open(out_path, "w", encoding="utf8") as fout:
        for item in tqdm(load_jsonl(in_path)):
            sentence = item["mt"]
            ppl = scorer.sentence_ppl(sentence)
            spans, tsr = compute_tsr_spans(sentence)

            rec = {
                "src": item["src"],
                "ref": item["ref"],
                "mt": sentence,
                "ppl": ppl,
                "tsr": tsr,
                "spans": spans
            }
            fout.write(json.dumps(rec, ensure_ascii=False) + "\n")

In [19]:
# NOTE(review): no visible cell writes LLAMA_3_translations.jsonl; it was
# presumably produced by an earlier batch_translate_ollama run with
# LLAMA_3_LATEST — confirm, or the cell fails on a fresh checkout.
run_analysis(
    in_path="experiments/baseline/LLAMA_3_translations.jsonl",
    out_path="experiments/baseline/LLAMA_3_translationese.jsonl",
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
0it [00:00, ?it/s]`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
1994it [14:38,  2.27it/s]


In [65]:
# Score the GEMMA3 baseline translations written by batch_translate_ollama.
run_analysis(
    in_path="experiments/baseline/GEMMA_3_translations.jsonl",
    out_path="experiments/baseline/GEMMA_3_translationese.jsonl",
)

0it [00:00, ?it/s]`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
1994it [15:23,  2.16it/s]
