# Deep Past Initiative – Machine Translation (Training Notebook)

This notebook is a **starter / baseline** for this Kaggle competition.

Main ideas:
- Use **ByT5** to handle noisy Akkadian transliterations at the character level
- Perform **simple sentence alignment** to increase training data
- Fine-tune using HuggingFace `Trainer`


Inference Code is [here](https://www.kaggle.com/code/takamichitoda/dpc-starter-infer).

In [1]:
!pip install evaluate sacrebleu

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.6.0-py3-none-any.whl.metadata (39 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.6.0-py3-none-any.whl (100 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, sacrebleu, evaluate
Successfully installed evaluate-0.4.6 portalocker-3.2.0 sacrebleu-2.6.0


In [2]:
import os
import gc
import re
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    DataCollatorForSeq2Seq, 
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer
)
from sentence_transformers import SentenceTransformer, util
import evaluate

2026-02-20 17:54:37.644159: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1771610078.000791      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1771610078.087870      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1771610078.957435      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771610078.957509      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771610078.957513      24 computation_placer.cc:177] computation placer alr

In [3]:
class Config:
    # Akkadian transliteration contains a lot of noise and many unknown words, so
    # ByT5, which processes text at the character (byte) level rather than the word level, is the strongest choice.
    # MODEL_NAME = "google/byt5-small" 
    MODEL_NAME = "/kaggle/input/notebooks/shwesh/dpc-starter-train/byt5-akkadian-model/" 
    
    # ByT5 tends to produce longer token sequences, but 512 tokens is enough at the sentence level.
    MAX_LENGTH = 512
    
    BATCH_SIZE = 8       # Adjust depending on GPU memory (on a P100 you can usually go with 8–16).
    EPOCHS = 10
    LEARNING_RATE = 2e-4
    OUTPUT_DIR = "./byt5-akkadian-model"

In [4]:
# Fix the seed (for reproducibility).
def seed_everything(seed=42):
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed(seed)
    
seed_everything()

In [5]:
INPUT_DIR = "/kaggle/input/deep-past-initiative-machine-translation"
train_df = pd.read_csv(f"{INPUT_DIR}/train.csv")
test_df = pd.read_csv(f"{INPUT_DIR}/test.csv")

In [6]:
print(f"Original Train Data: {len(train_df)} docs")

Original Train Data: 1562 docs


In [7]:
def simple_sentence_aligner(df):
    """
    【戦略の肝】
    Trainデータの「文書(複数文)」を、Testデータと同じ「文(1文)」に分割します。
    ここでは「英語の文数」と「アッカド語の行数」が一致する場合のみ分割する
    というヒューリスティック（簡易ルール）を使います。
    """
    aligned_data = []
    
    for idx, row in df.iterrows():
        src = str(row['transliteration'])
        tgt = str(row['translation'])
        
        # Split the English text by sentence-ending punctuation.
        tgt_sents = [t.strip() for t in re.split(r'(?<=[.!?])\s+', tgt) if t.strip()]
        
        # Assume the Akkadian text is often separated by newlines and split accordingly.
        src_lines = [s.strip() for s in src.split('\n') if s.strip()]
        
        # If the counts match, trust it as 1-to-1 pairs and use the split version.
        if len(tgt_sents) > 1 and len(tgt_sents) == len(src_lines):
            for s, t in zip(src_lines, tgt_sents):
                if len(s) > 3 and len(t) > 3: # Remove junk/noisy data.
                    aligned_data.append({'transliteration': s, 'translation': t})
        else:
            # If splitting fails (counts don't match), keep the original document pair as-is (safe fallback).
            aligned_data.append({'transliteration': src, 'translation': tgt})
            
    return pd.DataFrame(aligned_data)

def instance_crossover_augmentation(df, num_augmentations=1000):
    """
    Take two random datapoints, cut around midway through it, and stitch it back as two new datapoints. Append a marker at the front to signify it is stitched and false.
    """
    augmented_data = []
    
    for _ in range(num_augmentations): # Generate augmented data.
        rows = df.sample(2)
        row1 = rows.iloc[0]
        row2 = rows.iloc[1]
        src1 = str(row1['transliteration'])
        tgt1 = str(row1['translation'])
        src2 = str(row2['transliteration'])
        tgt2 = str(row2['translation'])
        # randomly choose a point between 0.1 and 0.9 to put the slice
        slice_point = np.random.uniform(0.1, 0.9)


        # splice
        idx1 = int(len(src1) * slice_point)
        idx2 = int(len(src2) * slice_point)
        idx3 = int(len(tgt1) * slice_point)
        idx4 = int(len(tgt2) * slice_point)

        new_src1 = src1[idx1:] + src2[:idx2]
        new_src2 = src2[idx2:] + src1[:idx1]
        new_tgt1 = tgt1[idx3:] + tgt2[:idx4]
        new_tgt2 = tgt2[idx4:] + tgt1[:idx3]

        # mark
        new_src1 = "[AUG]" + new_src1
        new_src2 = "[AUG]" + new_src2

        augmented_data.append({'transliteration': new_src1, 'translation': new_tgt1})
        augmented_data.append({'transliteration': new_src2, 'translation': new_tgt2})
    return pd.DataFrame(augmented_data)

In [8]:
# Run data augmentation.
train_expanded = simple_sentence_aligner(train_df)
print(f"Expanded Train Data: {len(train_expanded)} sentences (Alignment applied)")
print(train_expanded.head())

Expanded Train Data: 1562 sentences (Alignment applied)
                                     transliteration  \
0  KIŠIB ma-nu-ba-lúm-a-šur DUMU ṣí-lá-{d}IM KIŠI...   
1               1 TÚG ša qá-tim i-tur4-DINGIR il5-qé   
2  <gap> TÚG u-la i-dí-na-ku-um i-tù-ra-ma 9 GÍN ...   
3  KIŠIB šu-{d}EN.LÍL DUMU šu-ku-bi-im KIŠIB ṣí-l...   
4  um-ma šu-ku-tum-ma a-na IŠTAR-lá-ma-sí ù ni-ta...   

                                         translation  
0  Seal of Mannum-balum-Aššur son of Ṣilli-Adad, ...  
1  Itūr-ilī has received one textile of ordinary ...  
2   <gap> he did not give you a textile. He retur...  
3  Seal of Šu-Illil son of Šu-Kūbum, seal of Ṣilū...  
4  From Šukkutum to Ištar-lamassī and Nitahšušar:...  


In [9]:
# More data augmentation
augmented_df = instance_crossover_augmentation(train_expanded, num_augmentations=len(train_expanded))
print(f"Augmented Train Data: {len(augmented_df)} sentences (Instance crossover applied)")
print(augmented_df.head())

# Combine 
train_expanded = pd.concat([train_expanded, augmented_df], ignore_index=True)



Augmented Train Data: 3124 sentences (Instance crossover applied)
                                     transliteration  \
0  [AUG]a a-šur-re-ṣí ŠU.NÍGIN 4 ri-ik-sú ša e-lá...   
1  [AUG]a-ar-gu5-ma-ni ta-ša-qá-al a-pu-tum ṭup-p...   
2  [AUG]> a-we-lu-ú ú-nu-hu a-ṣé-er 40 GÚ URUDU b...   
3  [AUG]> ša a-lá-hi-im ša e-li-tí-šu ku-nu-ki-a ...   
4  [AUG]í-bi4-ma šu-ma na-áš-pé-er-tum iš-tù a-li...   

                                         translation  
0  r-rēṣī, in all 4 packets of Elamma, all this b...  
1  ill pay in accordance with what you claimed fr...  
2  lents of copper, our fathers assets, but they ...  
3  silver belonging to Ali-ahum from his caravan,...  
4  ur: Ask whether they have brought a missive fr...  


In [10]:



# Convert to Hugging Face Dataset format & split into Train/Val.
dataset = Dataset.from_pandas(train_expanded)
# Create a validation set with test_size=0.1.
split_datasets = dataset.train_test_split(test_size=0.1, seed=42)
# After splitting, the keys are 'train' and 'test' (we'll use 'test' as validation).


In [11]:
# ==========================================
# 3. Tokenization & preprocessing
# ==========================================
tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME, local_files_only=True)

# Fix the corresponding section in dpc-starter-train.
PREFIX = "translate Akkadian to English: "

def preprocess_function(examples):
    inputs = [PREFIX + str(ex) for ex in examples["transliteration"]]
    targets = [str(ex) for ex in examples["translation"]]
    
    model_inputs = tokenizer(inputs, max_length=Config.MAX_LENGTH, truncation=True)
    labels = tokenizer(targets, max_length=Config.MAX_LENGTH, truncation=True)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = split_datasets["train"].map(preprocess_function, batched=True)
tokenized_val = split_datasets["test"].map(preprocess_function, batched=True)


Map:   0%|          | 0/4217 [00:00<?, ? examples/s]

Map:   0%|          | 0/469 [00:00<?, ? examples/s]

In [12]:
# ==========================================
# 4. Model training (fine-tuning)
# ==========================================
gc.collect()
torch.cuda.empty_cache()
model = AutoModelForSeq2SeqLM.from_pretrained(Config.MODEL_NAME, local_files_only=True)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Metric (chrF++ is part of the competition metric and measures character-level precision/overlap).
metric = evaluate.load("chrf")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple): preds = preds[0]
    try:
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    except:
        print(f"The bad preds are: {preds}")
        print(f"with type:{preds.__class__.__name__}")
        print("Ignoring computing metrics and continuing onward")
        return {"chrf": 0} 
    # Ignore -100 in the labels.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"chrf": result["score"]}

args = Seq2SeqTrainingArguments(
    output_dir=Config.OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=Config.LEARNING_RATE,
    
    # === Key fixes ===
    fp16=False,                     # ★Set to False to prevent a NaN error (required).
    per_device_train_batch_size=4,  # ★fp32 uses more memory, so reduce the batch size (8 -> 4).
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,  # ★To compensate, accumulate gradients to keep the effective batch size at 8.
    # ======================
    
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=Config.EPOCHS,
    predict_with_generate=True,
    logging_steps=10,               # Inspect logs in more detail.
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("Starting Training (FP32 mode)...")
trainer.train()


The module name  (originally ) is not a valid Python identifier. Please rename the original module to avoid import issues.


Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Seq2SeqTrainer(


Starting Training (FP32 mode)...


Epoch,Training Loss,Validation Loss,Chrf
1,0.0977,0.180776,4.146113
2,0.0815,0.181671,4.151254
3,0.0891,0.179462,4.29164
4,0.0752,0.17956,4.283685
5,0.0797,0.174133,4.319517
6,0.0695,0.176645,4.318218
7,0.0844,0.17317,4.269257
8,0.0746,0.172672,4.265569
9,0.0679,0.165812,4.302302
10,0.0886,0.161863,4.365128


TrainOutput(global_step=5280, training_loss=0.08047292969669356, metrics={'train_runtime': 7536.0903, 'train_samples_per_second': 5.596, 'train_steps_per_second': 0.701, 'total_flos': 3.816785055140966e+16, 'train_loss': 0.08047292969669356, 'epoch': 10.0})

In [13]:
# --- Save Model ---
# Important: the model saved here will be loaded in the next notebook.
trainer.save_model(Config.OUTPUT_DIR)
tokenizer.save_pretrained(Config.OUTPUT_DIR)
print(f"Model saved to {Config.OUTPUT_DIR}")


Model saved to ./byt5-akkadian-model
