In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# The algorithm in this notebook is adapted from this research paper: https://arxiv.org/abs/2303.14588

/kaggle/input/val-nlp-project/val.txt
/kaggle/input/train-nlp-project/train.txt


In [2]:
# Cell 1: CHECK GPU
# List the GPU(s) Kaggle assigned to this session.
# `|| true` makes the shell line succeed even when nvidia-smi fails or is missing
# (for example when no GPU is enabled for the session).
import os
# IPython shell capture: the command's output lines are collected into a list.
gpu_info = !nvidia-smi -L || true
print("GPU info (if available):")
print("\n".join(gpu_info))

GPU info (if available):
GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-aa9372c0-5736-d2ba-a5e4-29eaa5d4070f)


In [3]:
# Step 1: uninstall conflicting packages first
!pip uninstall -y pyarrow datasets transformers
!pip uninstall -y pyarrow datasets fastparquet

# Step 2: install compatible versions (single line)
!pip install -q pyarrow==19.0.0 datasets==2.14.7 transformers==4.35.2 accelerate==0.24.1 peft==0.7.1 sentencepiece evaluate



Found existing installation: pyarrow 19.0.0
Uninstalling pyarrow-19.0.0:
  Successfully uninstalled pyarrow-19.0.0
Found existing installation: datasets 2.14.7
Uninstalling datasets-2.14.7:
  Successfully uninstalled datasets-2.14.7
Found existing installation: transformers 4.35.2
Uninstalling transformers-4.35.2:
  Successfully uninstalled transformers-4.35.2
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.26.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
sentence-transformers 5.1.1 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.35.2 which is incompatible.
bigframes 2.26.0 requires rich<14,>=12.4.4, but you have rich 14.2.0 which is incompatible.[0m[31m
[0m

In [4]:
# Cell 3: Imports and device setup
import os
import re
import math
from pathlib import Path
from typing import List, Dict

import torch
from datasets import Dataset, DatasetDict
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
# Accuracy metric loaded through `evaluate`.
metric = evaluate.load("accuracy")

# Torch device: CUDA when a GPU is visible, CPU otherwise.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Device:", DEVICE)

# Checkpoint to fine-tune: the small ByT5 variant (recommended starting point).
MODEL_NAME = "google/byt5-small"

# ByT5 uses byte-level tokenization; the non-fast tokenizer is requested explicitly.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)

# Load the seq2seq weights, then move the model onto the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model = model.to(DEVICE)
print("Loaded model:", MODEL_NAME)



  _torch_pytree._register_pytree_node(
2025-12-17 10:50:24.215103: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765968624.361858     254 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765968624.408916     254 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1765968624.779161     254 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1765968624.779202     254 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1765968624.779205     254 computatio

Downloading builder script: 0.00B [00:00, ?B/s]

Device: cuda




tokenizer_config.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Loaded model: google/byt5-small


In [5]:
# Why normalization matters: mapping every character variant to one standard Arabic form lets the model learn from consistent input.
# Cell 4: Unicode diacritics removal and normalization helpers

# Arabic diacritics (Tashkeel) common ranges we remove:

import re
import unicodedata

# Code points treated as Arabic diacritics (harakat / tashkeel) throughout the notebook.
DIACRITICS = re.compile(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]")

# One diacritic immediately followed by one or more copies of itself.
# Built from DIACRITICS.pattern (the pattern *string*); interpolating the compiled
# object itself would embed its repr ("re.compile('...')") into the regex.
_REPEATED_DIACRITIC = re.compile(f"({DIACRITICS.pattern})\\1+")

# Letter variants folded onto one canonical letter (single-character mapping).
_LETTER_VARIANTS = str.maketrans({
    "\u0622": "\u0627",  # Alef with madda       -> Alef
    "\u0623": "\u0627",  # Alef with hamza above -> Alef
    "\u0625": "\u0627",  # Alef with hamza below -> Alef
    "\u0671": "\u0627",  # Alef wasla            -> Alef
    "\u0649": "\u064A",  # Alef maqsura          -> Yeh
    "\u06CC": "\u064A",  # Persian Yeh           -> Arabic Yeh
    "\u0643": "\u0643",  # Arabic Kaf (identity, kept for completeness)
    "\u06A9": "\u0643",  # Persian Kaf           -> Arabic Kaf
})

# Zero-width and bidirectional control characters that are deleted outright.
_INVISIBLE_CHARS = str.maketrans({
    "\u200C": None,  # Zero-width non-joiner
    "\u200D": None,  # Zero-width joiner
    "\u200E": None,  # Left-to-right mark
    "\u200F": None,  # Right-to-left mark
    "\u202A": None,
    "\u202B": None,
    "\u202C": None,
    "\u202D": None,
    "\u202E": None,
})


def remove_diacritics(text: str) -> str:
    """Remove Arabic diacritics (tashkeel) from text."""
    return DIACRITICS.sub("", text)


def normalize_text_advanced(text: str) -> str:
    """Normalize Arabic text into one consistent surface form.

    Steps, applied in this order:
      0. Unicode NFKC normalization.
      1. Expand Lam-Alef ligatures into Lam + Alef.
      2. Remove tatweel (kashida).
      3. Fold Alef / Yeh / Kaf variants onto one canonical letter.
      4. Remove page references such as "( 21 / 227 )".
      5. Remove purely numeric parentheses such as "( 48 )".
      6. Remove any parenthesized span that contains a digit.
      7. Collapse an immediately repeated diacritic into a single occurrence.
         Different marks in a row (e.g. shadda + fatha) are left untouched.
      8. Delete zero-width / bidirectional control characters.
      9. No space before punctuation, exactly one space after it.
     10. Collapse runs of whitespace and strip both ends.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string.
    """
    # 0. Unicode normalize
    text = unicodedata.normalize("NFKC", text)

    # 1. Lam-Alef ligatures -> Lam + Alef
    text = text.replace("\uFEFB", "\u0644\u0627").replace("\uFEFC", "\u0644\u0627")

    # 2. Remove tatweel
    text = text.replace("\u0640", "")

    # 3. Normalize letter shapes in one pass
    text = text.translate(_LETTER_VARIANTS)

    # 4. Page number references like ( 21 / 227 )
    text = re.sub(r"\(\s*\d+\s*/\s*\d+\s*\)", " ", text)

    # 5. Pure numeric brackets like ( 48 )
    text = re.sub(r"\(\s*\d+\s*\)", " ", text)

    # 6. Any remaining parenthesized span that contains a number
    text = re.sub(r"\(\s*[^()]*?\d+[^()]*?\)", " ", text)

    # 7. Collapse doubled diacritics (same mark repeated back to back)
    text = _REPEATED_DIACRITIC.sub(r"\1", text)

    # 8. Remove invisible characters
    text = text.translate(_INVISIBLE_CHARS)

    # 9. Spacing around punctuation: '.', ',', Arabic semicolon, Arabic comma,
    #    Arabic question mark.
    text = re.sub(r"\s+([.,\u061B\u060C\u061F])", r"\1", text)  # no space before
    text = re.sub(r"([.,\u061B\u060C\u061F])\s*", r"\1 ", text)  # one space after

    # 10. Collapse multiple spaces
    text = re.sub(r"\s{2,}", " ", text)

    return text.strip()

# Smoke test: show one sentence as-is, with diacritics stripped, and after normalization.
sample = "وَلَوْ جَمَعَ ثُمَّ عَلِمَ"
sample_stripped = remove_diacritics(sample)
print("orig:", sample)
print("no diac:", sample_stripped)
print("norm:", normalize_text_advanced(sample_stripped))

orig: وَلَوْ جَمَعَ ثُمَّ عَلِمَ
no diac: ولو جمع ثم علم
norm: ولو جمع ثم علم


In [6]:
from datasets import load_dataset
# Cell 5: Load files and create pairs (input = undiacritized, target = original)
# The paths below point at the Kaggle input datasets listed by the first cell's output;
# change them if train.txt / val.txt are stored somewhere else.


TRAIN_PATH = "/kaggle/input/train-nlp-project/train.txt"
VAL_PATH = "/kaggle/input/val-nlp-project/val.txt"

def load_pairs_from_file(path):
    """Build (model input, target) pairs from a UTF-8 text file, one pair per line.

    Every non-empty line, stripped of surrounding whitespace, is kept unchanged
    as the target. The matching input is that same line with diacritics removed
    and then normalized.

    Returns:
        (inputs, targets): two parallel lists of strings.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        # Blank lines are skipped; everything else is a target as-is.
        targets = [entry for entry in stripped_lines if entry]

    # No BOS/EOS markers are added here; that is left to the tokenizer.
    inputs = [normalize_text_advanced(remove_diacritics(entry)) for entry in targets]
    return inputs, targets

# Build the (undiacritized input, diacritized target) pairs for both splits.
train_srcs, train_tgts = load_pairs_from_file(TRAIN_PATH)
val_srcs, val_tgts = load_pairs_from_file(VAL_PATH)

# Report the split sizes, then show the first training pair (input, then target).
n_train, n_val = len(train_srcs), len(val_srcs)
print("Loaded samples:", n_train, "train,", n_val, "val")
print("example input -> target:")
print(train_srcs[0], train_tgts[0], sep="\n")



Loaded samples: 50000 train, 2500 val
example input -> target:
ولو جمع ثم علم ترك ركن من الاولي بطلتا ويعيدهما جامعا، او من الثانية، فان لم يطل تدارك، والا فباطلة ولا جمع، ولو جهل اعادهما لوقتيهما
وَلَوْ جَمَعَ ثُمَّ عَلِمَ تَرْكَ رُكْنٍ مِنْ الْأُولَى بَطَلَتَا وَيُعِيدُهُمَا جَامِعًا ، أَوْ مِنْ الثَّانِيَةِ ، فَإِنْ لَمْ يَطُلْ تَدَارَكَ ، وَإِلَّا فَبَاطِلَةٌ وَلَا جَمَعَ ، وَلَوْ جَهِلَ أَعَادَهُمَا لِوَقْتَيْهِمَا


In [7]:
from datasets import Dataset, DatasetDict
# Cell 6: Make HuggingFace datasets
# The training split uses the loaded pairs as they are.
final_inputs, final_targets = train_srcs, train_tgts

train_dataset = Dataset.from_dict(
    dict(input_text=final_inputs, target_text=final_targets)
)
val_dataset = Dataset.from_dict(
    dict(input_text=val_srcs, target_text=val_tgts)
)

# Both splits bundled under the names the rest of the notebook indexes by.
datasets = DatasetDict({"train": train_dataset, "validation": val_dataset})

print(datasets)

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 2500
    })
})


In [None]:
# # Cell 8: Tokenize examples for seq2seq training
# # We need to tokenize inputs and targets; set reasonable max lengths.

# ByT5 tokenizes at the byte level, so these limits count UTF-8 bytes, not characters.
# NOTE(review): Arabic letters and diacritics take 2 bytes each in UTF-8, so 512 covers
# roughly 256 characters *including* diacritics; longer lines are truncated. Confirm this
# is acceptable for the diacritized targets.
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 512

    
def preprocess_batch(examples):
    """Tokenize one batch of input/target strings for seq2seq training.

    Args:
        examples: dict with the list-valued keys "input_text" and "target_text".

    Returns:
        The tokenizer output for the inputs, plus a "labels" entry holding the
        token ids of the targets.

    No padding is applied here. Padding every example to the maximum length
    would fill the labels with the pad token id, so padding positions would be
    scored as real targets, and every sequence would be processed at full
    length. Padding is left to the batch collator, which pads per batch and
    uses its configured label pad id for the labels.
    """
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
    )
    # `text_target` tokenizes the strings as targets; it replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["target_text"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Step 1: tokenize both splits in batches; the original text columns are removed
# so only the tokenizer outputs and the labels remain.
tokenized = datasets.map(
    preprocess_batch,
    batched=True,
    remove_columns=datasets["train"].column_names,
)


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]



In [None]:
# Largest label id in the training split, computed as a running maximum so the
# label ids of all examples are never collected into one giant list.
max_label_id = max(
    max(example["labels"])
    for example in tokenized["train"]
    if example["labels"]  # skip empty label lists instead of failing on max([])
)

print("Max label ID:", max_label_id)
print("Tokenizer vocab size:", tokenizer.vocab_size)


In [None]:
!pip install jiwer

In [None]:
# Cell 9: Data collator and metrics

from transformers import DataCollatorForSeq2Seq
from evaluate import load   
# no more load_metric

# Batch collator for seq2seq training. Label padding uses -100 (this is important:
# it keeps padded label positions distinguishable from real token ids).
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
)

# Word error rate metric, loaded through `evaluate.load`.
wer_metric = load("wer")


In [None]:
# Cell 10: DER implementation (character-level)
# DER = (# incorrect diacritic characters predicted) / (# characters that should have diacritics) * 100

def calc_der_for_pair(gold: str, pred: str):
    """Compute DER counts for one (gold, prediction) pair.

    Args:
        gold: the reference string, with diacritics.
        pred: the predicted string.

    Returns:
        (total_diacritics_in_gold, wrong_diacritics_predicted)

    This used to be an unfinished stub that fell through and returned None.
    It now delegates to `der_counts`, which implements the same contract by
    pairing each base character with the diacritics that follow it.
    """
    return der_counts(gold, pred)

# Simpler robust implementation: expand each string into sequence of (base_char, diacritic_str)
def expand_to_bases_and_diacritics(s: str):
    """Split a string into base characters and the diacritics attached to each.

    Returns:
        (base_chars, diacritic_strs): two lists of equal length. Entry i of
        diacritic_strs is the concatenation of the diacritics that directly
        follow base character i, or "" when there are none. A diacritic that
        appears before any base character is ignored.
    """
    bases = []
    attached = []  # attached[i] gathers the diacritics that follow bases[i]
    for symbol in s:
        if DIACRITICS.match(symbol) is None:
            # Every non-diacritic character opens a new (base, diacritics) slot.
            bases.append(symbol)
            attached.append([])
        elif attached:
            attached[-1].append(symbol)
        # else: stray diacritic with no preceding base character; dropped
    return bases, ["".join(group) for group in attached]

def der_counts(gold: str, pred: str):
    """Count diacritic errors of `pred` against `gold`, position by position.

    Only positions whose gold base character carries at least one diacritic are
    counted. Such a position is wrong when the prediction's diacritics at the
    same index differ, or when the prediction has no character at that index.

    Returns:
        (total_diac, wrong): number of counted positions and number of wrong ones.
    """
    _, gold_marks = expand_to_bases_and_diacritics(gold)
    _, pred_marks = expand_to_bases_and_diacritics(pred)
    n_pred = len(pred_marks)

    total_diac = 0
    wrong = 0
    for idx, expected in enumerate(gold_marks):
        if expected == "":
            continue  # gold has no diacritic here, so the position is not scored
        total_diac += 1
        # Positions beyond the end of the prediction count as errors.
        if idx >= n_pred or pred_marks[idx] != expected:
            wrong += 1
    return total_diac, wrong

# Quick check: score an undiacritized prediction against a diacritized gold string.
g = "وَلَوْ جَمَعَ"
p = "ولو جمع"
quick_counts = der_counts(g, p)
print("DER counts:", quick_counts)

In [None]:
# Cell 11: Training arguments
# The per-device batch size is chosen from the GPU name reported by nvidia-smi:
# - V100 / A100 / P100: 4
# - anything else (e.g. T4): 2
# Gradient accumulation (4 steps) is used on top to simulate a larger effective batch.

# IPython shell capture; `|| true` keeps the line from failing when nvidia-smi is missing.
GPU_INFO = !nvidia-smi -L || true
gpu_str = "\n".join(GPU_INFO)
print("GPU:", gpu_str)

# Heuristics: plain substring match on the GPU description.
if "V100" in gpu_str or "A100" in gpu_str or "P100" in gpu_str:
    per_device_train_batch_size = 4
else:
    per_device_train_batch_size = 2  # safer for T4

training_args = Seq2SeqTrainingArguments(
    output_dir="byt5_diacritizer",
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_train_batch_size,
    predict_with_generate=True,
    evaluation_strategy="steps",
    eval_steps=500,        # evaluate every 500 steps (tune for dataset size)
    logging_steps=100,
    save_total_limit=3,
    save_steps=500,        # same interval as eval_steps
    num_train_epochs=5,
    learning_rate=3e-4,    # ByT5 often likes 1e-4 - 5e-4; tune if needed
    weight_decay=0.01,
    warmup_steps=500,
    fp16=False,             # fp16 mixed precision is turned off
    bf16=False,             # bf16 mixed precision is turned off as well
    gradient_accumulation_steps=4,  # increase effective batch
    gradient_checkpointing=True,           # enabled to reduce memory use
    load_best_model_at_end=True,
    metric_for_best_model="char_acc",  # key returned by compute_metrics
    greater_is_better=True,
    eval_accumulation_steps=4,
    generation_max_length=MAX_TARGET_LENGTH,
    report_to="none",
)
print(training_args)


In [None]:
# Cell 12: compute_metrics function for Trainer
import numpy as np
from transformers import EvalPrediction

def postprocess_text(preds, labels):
    """Decode predicted and reference token ids into strings.

    Args:
        preds: array-like of predicted token ids.
        labels: array-like of reference token ids.

    Returns:
        (decoded_preds, decoded_labels): two lists of strings.

    -100 is a padding marker rather than a real token id, so it cannot be
    decoded. It is replaced with the pad token id in the labels and also in
    the predictions, which may carry the same marker when batches of different
    lengths are gathered.
    """
    pad_id = tokenizer.pad_token_id
    preds = np.where(np.asarray(preds) == -100, pad_id, preds)
    labels = np.where(np.asarray(labels) == -100, pad_id, labels)
    # decode token ids to strings
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return decoded_preds, decoded_labels

def compute_metrics(eval_preds):
    """Compute WER, DER and character accuracy for the Trainer.

    Args:
        eval_preds: (pred_ids, label_ids) as passed by the Trainer; pred_ids may
            itself be a tuple, in which case its first element is used.

    Returns:
        dict with:
          "wer":      word error rate as returned by the WER metric,
          "DER":      percentage of gold diacritic positions predicted wrongly,
          "char_acc": percentage of gold characters whose base character AND
                      diacritics both match the prediction at the same index.
    """
    preds_ids, labels_ids = eval_preds
    if isinstance(preds_ids, tuple):
        preds_ids = preds_ids[0]
    decoded_preds, decoded_labels = postprocess_text(preds_ids, labels_ids)

    # Word-level metric
    wer = wer_metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Character-level counters accumulated over the whole evaluation set
    total_diac = 0
    wrong_diac = 0
    total_chars = 0
    correct_chars = 0
    for gold_text, pred_text in zip(decoded_labels, decoded_preds):
        td, wd = der_counts(gold_text, pred_text)
        total_diac += td
        wrong_diac += wd

        # Expand each string once per pair (not once per character).
        gold_bases, gold_diacs = expand_to_bases_and_diacritics(gold_text)
        pred_bases, pred_diacs = expand_to_bases_and_diacritics(pred_text)

        # Every gold character counts towards the denominator, so characters
        # missing from a short prediction are scored as incorrect, in line with
        # how der_counts treats them.
        total_chars += len(gold_bases)
        correct_chars += sum(
            g_base == p_base and g_diac == p_diac
            for g_base, g_diac, p_base, p_diac in zip(
                gold_bases, gold_diacs, pred_bases, pred_diacs
            )
        )

    der = 100.0 * wrong_diac / total_diac if total_diac > 0 else 0.0
    char_acc = 100.0 * correct_chars / total_chars if total_chars > 0 else 0.0

    return {
        "wer": wer,
        "DER": der,
        "char_acc": char_acc,
    }


In [None]:
# Inspect the first tokenized training example: container type, sequence lengths,
# and the text obtained by decoding the ids back into strings.
example = tokenized['train'][0]
input_ids, label_ids = example['input_ids'], example['labels']
print("Input IDs type:", type(input_ids))
print("Input IDs length:", len(input_ids))
print("Labels length:", len(label_ids))
decoded_input = tokenizer.decode(input_ids, skip_special_tokens=True)
decoded_label = tokenizer.decode(label_ids, skip_special_tokens=True)
print("Decoded input:", decoded_input)
print("Decoded target:", decoded_label)

In [None]:
# Tokenizer / label sanity checks: the pad token and its id, the label ids of the
# second training example, the unknown-token id, and the tokenizer length.
second_example = tokenized["train"][1]
print(tokenizer.pad_token, tokenizer.pad_token_id)
print(second_example["labels"])

print(tokenizer.unk_token_id, len(tokenizer), sep="\n")


In [None]:
# Cell 13: Create Seq2SeqTrainer and train

import os

from transformers.trainer_utils import get_last_checkpoint

output_dir = "/kaggle/working/byt5_diacritizer"

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Auto resume: look for the newest checkpoint in the Trainer's own output directory
# and continue from it. Calling trainer.train() without `resume_from_checkpoint`
# always starts from scratch, even when checkpoints exist.
checkpoint_root = training_args.output_dir
if os.path.isdir(checkpoint_root):
    last_checkpoint = get_last_checkpoint(checkpoint_root)
else:
    last_checkpoint = None  # nothing has been saved yet

if last_checkpoint is not None:
    print("Resuming from checkpoint:", last_checkpoint)
trainer.train(resume_from_checkpoint=last_checkpoint)


# Save final state
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
# Cell 14: Save and quick test
from transformers import pipeline

# Write the trainer's current model to its own directory.
best_model_path = "byt5_diacritizer/best_model"
trainer.save_model(best_model_path)
print("Saved best model to", best_model_path)

# Inference pipeline over the saved model: device 0 when CUDA is available, -1 otherwise.
if torch.cuda.is_available():
    pipeline_device = 0
else:
    pipeline_device = -1
pipe = pipeline(
    "text2text-generation",
    model=best_model_path,
    tokenizer=tokenizer,
    device=pipeline_device,
)

def diacritize(text: str, max_length: int = 512):
    """Return the pipeline's generated text for `text`.

    The input is first stripped of diacritics and normalized, then passed to
    the text2text pipeline with sampling disabled.

    Args:
        text: the input string (existing diacritics are removed first).
        max_length: forwarded to the pipeline as `max_length`.

    Returns:
        The "generated_text" field of the first pipeline result.
    """
    stripped = remove_diacritics(text)
    src = normalize_text_advanced(stripped)
    results = pipe(src, max_length=max_length, do_sample=False)
    return results[0]["generated_text"]

# Example: print the plain sentence, then the model output for its diacritized form
# (diacritize strips the diacritics again before generation).
example_plain = "ولو جمع ثم علم ترك ركن"
example_diacritized = "وَلَوْ جَمَعَ ثُمَّ عَلِمَ تَرْكَ رُكْنٍ"
print("Input:", example_plain)
print("Output:", diacritize(example_diacritized))
