# Deep Past Challenge: ByT5 Inference

Kaggle submission notebook. Loads the fine-tuned ByT5 model and generates translations.

**Requirements**:
- Add your trained model as a Kaggle dataset (e.g. `your-username/byt5-akkadian-final`) and set `MODEL_PATH` in the Configuration cell to the path where it is mounted
- No internet access needed (offline inference)
- Must complete within 9 hours

In [None]:
# ============================================================
# Configuration
# ============================================================
# NOTE(review): every name assigned in this cell is assigned again by the
# following "Configuration" cell, so when the notebook is run top to bottom
# the values below (MODEL_PATH at the dataset root, MAX_*_LEN = 384) are
# overwritten before any later cell reads them.
#
# NOTE(review): re, unicodedata, pd, torch, tqdm, AutoTokenizer,
# T5ForConditionalGeneration and DEVICE are used by later cells but are not
# defined anywhere in the visible notebook — confirm an imports cell exists.
import os  # this cell calls os.path.exists and no earlier cell imports os

# The presence of /kaggle/input is used as the signal that we run on Kaggle.
IS_KAGGLE = os.path.exists("/kaggle/input")

if IS_KAGGLE:
    COMP_DATA = "/kaggle/input/deep-past-initiative-machine-translation"
    # Trained model uploaded as Kaggle dataset
    MODEL_PATH = "/kaggle/input/byt5-akkadian-final"
else:
    COMP_DATA = "data"
    MODEL_PATH = "trained_model/byt5_stage2_final"

# Prefix prepended to every cleaned source text before tokenization.
PREFIX = "translate Akkadian to English: "
# Tokenizer truncation length for inputs / max_new_tokens for generation.
MAX_SOURCE_LEN = 384
MAX_TARGET_LEN = 384
# Beam-search width and repetition penalty passed to model.generate.
BEAM_WIDTH = 4
REP_PENALTY = 1.2
# Number of test rows translated per generate() call.
BATCH_SIZE = 16

In [None]:
# ============================================================
# Configuration
# ============================================================
# Paths differ between a Kaggle kernel and a local checkout; the presence of
# /kaggle/input is used to tell the two apart.
IS_KAGGLE = os.path.exists("/kaggle/input")

# (competition data directory, fine-tuned model directory).
# On Kaggle, update the second entry to your uploaded model dataset path.
COMP_DATA, MODEL_PATH = (
    (
        "/kaggle/input/deep-past-initiative-machine-translation",
        "/kaggle/input/byt5-akkadian-final/byt5_stage2_final",
    )
    if IS_KAGGLE
    else ("data", "output/byt5_stage2_final")
)

# Prefix prepended to every cleaned source text before tokenization.
PREFIX = "translate Akkadian to English: "

# Tokenizer truncation length for inputs and max_new_tokens for generation.
MAX_SOURCE_LEN = MAX_TARGET_LEN = 512

# Decoding settings handed to model.generate, and rows per generate() call.
BEAM_WIDTH, REP_PENALTY, BATCH_SIZE = 4, 1.2, 16

In [None]:
# ============================================================
# Preprocessing (must match training)
# ============================================================
# Table for str.translate: the ten Unicode subscript digits U+2080..U+2089
# map to the ASCII digits "0".."9" (code point -> code point).
SUBSCRIPT_MAP = {0x2080 + digit: ord("0") + digit for digit in range(10)}

# ASCII spelling -> single diacritic letter. Insertion order is significant:
# normalize_ascii applies the entries one after another, so for example "sz"
# is consumed before ".s" gets a chance to match inside ".sz".
ASCII_TO_DIACRITIC = {
    # s with caron, written "sz" ...
    "sz": "\u0161",
    "SZ": "\u0160",
    "Sz": "\u0160",
    # ... or "sh"
    "sh": "\u0161",
    "SH": "\u0160",
    "Sh": "\u0160",
    # trailing-comma spellings of s/t with dot below
    "s,": "\u1E63",
    "S,": "\u1E62",
    "t,": "\u1E6D",
    "T,": "\u1E6C",
    # leading-dot spellings of s/t with dot below
    ".s": "\u1E63",
    ".S": "\u1E62",
    ".t": "\u1E6D",
    ".T": "\u1E6C",
    # h with breve below, both spellings
    "h,": "\u1E2B",
    "H,": "\u1E2A",
    ".h": "\u1E2B",
    ".H": "\u1E2A",
}


def normalize_ascii(text):
    """Rewrite ASCII digraph spellings in *text* as their diacritic letters.

    Each key of ASCII_TO_DIACRITIC is replaced everywhere it occurs, one key
    at a time in the table's insertion order, and the result is returned.
    """
    for digraph in ASCII_TO_DIACRITIC:
        text = text.replace(digraph, ASCII_TO_DIACRITIC[digraph])
    return text


def normalize_gaps(text):
    """Replace damage markers in *text* with <gap> / <big_gap> tokens.

    The rules run in the listed order: a bracketed single "x" becomes
    "<gap>"; a bracket starting with three or more dots (plus anything up to
    the closing bracket), any bare run of three or more dots, and the
    ellipsis character U+2026 each become "<big_gap>".
    """
    substitutions = (
        (r'\[x\]', '<gap>'),
        (r'\[\.{3,}[^\]]*\]', '<big_gap>'),
        (r'\.{3,}', '<big_gap>'),
        (r'\u2026', '<big_gap>'),
    )
    for pattern, token in substitutions:
        text = re.sub(pattern, token, text)
    return text


def clean_akkadian(text):
    """Normalize one raw transliteration into the model's input form.

    Returns "" for a missing value (per pd.isna) or a blank string.
    Otherwise the steps run in this fixed order: NFC normalization; removal
    of "!" and "?"; removal of U+02F9 / U+02FA; square brackets dropped while
    their content is kept; ASCII digraphs turned into diacritics; gap markers
    turned into tokens; subscript digits turned into ASCII digits; "/", ":"
    and "." (when not followed by a digit) turned into spaces; whitespace
    collapsed and trimmed.

    NOTE(review): brackets are stripped before normalize_gaps runs, so the
    bracketed patterns in normalize_gaps cannot match here ("[x]" ends up as
    "x"). The order is left untouched because preprocessing must match
    training — confirm against the training pipeline.
    """
    if pd.isna(text):
        return ""
    raw = str(text)
    if not raw.strip():
        return ""

    cleaned = unicodedata.normalize("NFC", raw)

    # Drop the "!" / "?" marks.
    for mark in ("!", "?"):
        cleaned = cleaned.replace(mark, "")

    # Drop U+02F9 / U+02FA, then unwrap [...] keeping the inner text.
    cleaned = re.sub(r'[\u02F9\u02FA]', '', cleaned)
    cleaned = re.sub(r'\[([^\]]*)\]', r'\1', cleaned)

    cleaned = normalize_gaps(normalize_ascii(cleaned))
    cleaned = cleaned.translate(SUBSCRIPT_MAP)

    # Separator characters become spaces unless a digit follows them.
    cleaned = re.sub(r'[/:.](?![\d])', ' ', cleaned)
    return re.sub(r'\s+', ' ', cleaned).strip()

In [None]:
# ============================================================
# Load Model
# ============================================================
print(f"Loading model from {MODEL_PATH}...")

# Tokenizer and weights are both read from the same local directory.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Load the weights, move them to DEVICE and switch the module to eval mode;
# .to() and .eval() both return the module, so the calls can be chained.
model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH).to(DEVICE).eval()

print("Model loaded successfully!")

In [None]:
# ============================================================
# Load Test Data
# ============================================================
# test.csv is read from the competition data directory chosen in Configuration.
test_df = pd.read_csv(os.path.join(COMP_DATA, "test.csv"))
print(f"Test data: {len(test_df)} rows")
print(test_df.head())

# Preprocess: run every raw transliteration through clean_akkadian and keep
# the result next to the original in a new 'clean_src' column.
test_df["clean_src"] = test_df["transliteration"].map(clean_akkadian)
print("\nSample cleaned:")
print(test_df[["transliteration", "clean_src"]].head())

In [None]:
# ============================================================
# Generate Translations
# ============================================================
predictions = []

# Decoding options are identical for every batch, so build them once.
generation_kwargs = dict(
    max_new_tokens=MAX_TARGET_LEN,
    num_beams=BEAM_WIDTH,
    repetition_penalty=REP_PENALTY,
    length_penalty=1.0,
)

# Inference only: the whole loop runs with gradient tracking disabled.
with torch.no_grad():
    for start in tqdm(range(0, len(test_df), BATCH_SIZE), desc="Translating"):
        # Slice the next BATCH_SIZE cleaned sources and prepend the prefix.
        batch_sources = test_df['clean_src'].iloc[start:start + BATCH_SIZE]
        batch_texts = [PREFIX + source for source in batch_sources]

        # Pad to the longest item in the batch, truncate at MAX_SOURCE_LEN,
        # and move the resulting tensors to DEVICE.
        inputs = tokenizer(
            batch_texts,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=MAX_SOURCE_LEN,
        ).to(DEVICE)

        outputs = model.generate(**inputs, **generation_kwargs)

        # Decode without special tokens and append in input order.
        predictions += tokenizer.batch_decode(outputs, skip_special_tokens=True)

print(f"\nGenerated {len(predictions)} translations")

In [None]:
# ============================================================
# Create Submission
# ============================================================
# One row per test id, paired positionally with the generated translation.
submission = pd.DataFrame({"id": test_df["id"], "translation": predictions})

# Ensure no NaN translations
submission = submission.fillna({"translation": ""})

# Written to the working directory without the index column.
submission.to_csv("submission.csv", index=False)

print("Submission saved to submission.csv")
print(f"Shape: {submission.shape}")
print(submission.head(10))

In [None]:
# Show sample translations
print("\n" + "="*60)
print("  Sample Translations")
print("="*60)
# Walk the first (up to) five raw source rows in lockstep with their
# predictions. The raw value is passed through str() before slicing because a
# missing transliteration is a float NaN, which cannot be sliced;
# clean_akkadian tolerates such rows, so this preview must not crash on them.
for i, (src, pred) in enumerate(zip(test_df['transliteration'].head(5), predictions)):
    print(f"\n--- Test {i} ---")
    print(f"SRC: {str(src)[:200]}")
    print(f"PRED: {pred[:200]}")