In [None]:
# =========================
# 1) Install libs
# =========================
!pip install -q transformers datasets seqeval accelerate

# =========================
# 2) Imports & data loader
# =========================
import numpy as np
from collections import defaultdict
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
)

# ---------- read .conll ----------
def read_conll(path):
    """Parse a two-column, tab-separated CoNLL file into sentences.

    Each non-blank line is expected to be ``word<TAB>tag``. A blank
    (whitespace-only) line closes the current sentence. Lines that do not
    split into exactly two tab-separated fields are skipped.

    Args:
        path: Location of the file; it is read as UTF-8.

    Returns:
        A list of ``{"tokens": [...], "tags": [...]}`` dicts, one per
        sentence, in file order. Empty sentences are never emitted.
    """
    sentences = []
    words, labels = [], []

    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                fields = stripped.split("\t")
                # Only well-formed "word<TAB>tag" rows contribute.
                if len(fields) == 2:
                    words.append(fields[0])
                    labels.append(fields[1])
            elif words:
                # Sentence boundary: store what was collected and start anew.
                sentences.append({"tokens": words, "tags": labels})
                words, labels = [], []

    # The file may end without a trailing blank line.
    if words:
        sentences.append({"tokens": words, "tags": labels})
    return sentences

# Parse the three splits; the .conll files must already be present in the
# working directory (e.g. uploaded to Colab).
train_data, dev_data, test_data = (
    read_conll("train.conll"),
    read_conll("dev.conll"),
    read_conll("test.conll"),
)

# Report the number of sentences found in each split.
print("#train sentences:", len(train_data))
print("#dev sentences:", len(dev_data))
print("#test sentences:", len(test_data))

# =========================
# 3) Labels & HF Dataset
# =========================
# Gather every tag that occurs in any of the three splits.
label_set = {
    tag
    for sentences in (train_data, dev_data, test_data)
    for sentence in sentences
    for tag in sentence["tags"]
}

# Sorting makes the tag <-> id assignment deterministic.
label_list = sorted(label_set)
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = dict(enumerate(label_list))

print("Labels:", label_list)
print("Num labels:", len(label_list))

# Wrap each split in a Hugging Face Dataset and group them under the
# conventional split names.
train_ds, dev_ds, test_ds = (
    Dataset.from_list(split) for split in (train_data, dev_data, test_data)
)

raw_datasets = DatasetDict(
    {"train": train_ds, "validation": dev_ds, "test": test_ds}
)
raw_datasets

# =========================
# 4) Tokenizer & alignment
# =========================
# Checkpoint identifier; the same name is used for the tokenizer here and for
# the token-classification model created later in this file.
model_name = "bert-base-multilingual-cased"
# Tokenizer matching the checkpoint, loaded via AutoTokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(example):
    """Tokenize one pre-split sentence and align its tags to sub-word pieces.

    Only the first piece of every word receives the word's label id. Special
    tokens, padding, and the remaining pieces of a word get ``-100`` so the
    loss and the metrics count each word exactly once. This matches
    ``predict_sentence``, which reads the prediction of the first piece only.

    Args:
        example: Mapping with ``"tokens"`` (list of words) and ``"tags"``
            (list of tag strings of the same length).

    Returns:
        The tokenizer output with an added ``"labels"`` list that has one
        entry per position of the encoded sequence.

    Raises:
        KeyError: If a tag is missing from the module-level ``label_to_id``.
    """
    # Fixed-length padding keeps inputs and labels the same length for every
    # example, so they can be batched without a dedicated
    # token-classification collator.
    tokenized = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=256,
    )

    tags = example["tags"]
    labels = []
    previous_word_id = None
    for word_id in tokenized.word_ids():
        if word_id is None or word_id == previous_word_id:
            # [CLS], [SEP], [PAD], or a continuation piece of the same word.
            labels.append(-100)
        else:
            labels.append(label_to_id[tags[word_id]])
        previous_word_id = word_id

    tokenized["labels"] = labels
    return tokenized

# Apply the alignment function to every split. batched=False because
# tokenize_and_align_labels handles a single example at a time; the raw
# "tokens"/"tags" columns are dropped from the result.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=False,
    remove_columns=["tokens", "tags"],
)
# Bare expression: shows the object when it is the last line of a notebook
# cell; it has no effect otherwise.
tokenized_datasets

# =========================
# 5) Model & metrics
# =========================
# One output class per distinct tag collected from the data.
num_labels = len(label_list)

# Load the pretrained checkpoint with a token-classification head sized to
# num_labels, and hand it both directions of the tag <-> id mapping.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
)

def compute_metrics(p):
    """Compute accuracy over the labelled positions of an evaluation run.

    Positions whose gold label is ``-100`` are ignored.

    Args:
        p: Either an object exposing ``predictions`` and ``label_ids`` or a
            ``(predictions, label_ids)`` pair. ``predictions`` holds
            per-position class scores (argmax is taken over the last axis);
            ``label_ids`` holds the gold ids with the same leading shape.

    Returns:
        ``{"accuracy": value}`` where ``value`` is a Python float. If no
        position is labelled, the accuracy is reported as ``0.0`` instead of
        producing NaN.
    """
    # Accept both the attribute-style object and the plain tuple form.
    preds = p.predictions if hasattr(p, "predictions") else p[0]
    labels = p.label_ids if hasattr(p, "label_ids") else p[1]

    gold_ids = np.asarray(labels)
    pred_ids = np.argmax(preds, axis=-1)

    # Comparing ids directly is equivalent to comparing tag strings and avoids
    # a Python-level loop over every position.
    labelled = gold_ids != -100
    total = int(labelled.sum())
    if total == 0:
        return {"accuracy": 0.0}

    correct = int((pred_ids[labelled] == gold_ids[labelled]).sum())
    return {"accuracy": correct / total}

# =========================
# 6) TrainingArguments (compatible with older versions)
# =========================
import inspect

# `do_eval=True` and `eval_steps` do not by themselves schedule evaluation
# during training: the evaluation strategy defaults to "no". The keyword was
# renamed from `evaluation_strategy` to `eval_strategy` across transformers
# releases, so use whichever name the installed version accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_schedule = {}
for _candidate in ("eval_strategy", "evaluation_strategy"):
    if _candidate in _training_arg_names:
        _eval_schedule[_candidate] = "steps"
        break

training_args = TrainingArguments(
    output_dir="./bert_pos_arabic",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=100,
    save_steps=500,
    eval_steps=500,  # validation runs every 500 steps once the strategy is "steps"
    **_eval_schedule,
)

# =========================
# 7) Trainer & training
# =========================
import inspect

# Newer transformers releases deprecate Trainer's `tokenizer=` keyword in
# favour of `processing_class=`; older ones only know `tokenizer=`. Pass the
# tokenizer under whichever name the installed version declares.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = (
    "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# Fine-tune on the training split.
trainer.train()

# =========================
# 8) Evaluation on test set
# =========================
# Score the held-out test split. The dataset is passed explicitly because the
# Trainer was constructed with the validation split as its eval_dataset.
results = trainer.evaluate(tokenized_datasets["test"])
print("Test results:", results)

# =========================
# 9) Predict on a custom sentence (optional)
# =========================
import torch

def predict_sentence(tokens):
    """Tag a single pre-tokenized sentence with the fine-tuned model.

    Args:
        tokens: list of Arabic words,
            e.g. ["سوريا", "تستقبل", "وفدا", "رسميا", "."]

    Returns:
        A list of ``(word, tag)`` pairs in sentence order. The tag of a word
        is the prediction at its first sub-word piece. Words cut off by
        truncation at 256 pieces are absent from the result. An empty input
        yields an empty list.
    """
    if not tokens:
        return []

    # A single sentence needs no padding, so only truncation is requested.
    encoding = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=256,
    ).to(model.device)

    # Run in eval mode so dropout cannot perturb the prediction, then restore
    # whatever mode the model was in before the call.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**encoding).logits
    finally:
        model.train(was_training)

    pred_ids = logits.argmax(-1)[0].tolist()

    result = []
    seen_words = set()
    for p_id, w_id in zip(pred_ids, encoding.word_ids()):
        # Skip special tokens and every piece after a word's first one.
        if w_id is None or w_id in seen_words:
            continue
        seen_words.add(w_id)
        result.append((tokens[w_id], id_to_label[p_id]))
    return result

# Smoke test: tag a short pre-tokenized Arabic sentence and print the
# (word, tag) pairs returned by predict_sentence.
example = ["سوريا", "تستقبل", "وفدا", "رسميا", "."]
print("Example prediction:", predict_sentence(example))


#train sentences: 6075
#dev sentences: 909
#test sentences: 680
Labels: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
Num labels: 17


Map:   0%|          | 0/6075 [00:00<?, ? examples/s]

Map:   0%|          | 0/909 [00:00<?, ? examples/s]

Map:   0%|          | 0/680 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.5514
200,0.2311
300,0.1977
400,0.1783
500,0.1414
600,0.1359
700,0.1249
800,0.1078
900,0.0955
1000,0.0832


Test results: {'eval_loss': 0.1719983071088791, 'eval_accuracy': 0.9562103041740224, 'eval_runtime': 9.3096, 'eval_samples_per_second': 73.043, 'eval_steps_per_second': 4.619, 'epoch': 3.0}
Example prediction: [('سوريا', 'X'), ('تستقبل', 'VERB'), ('وفدا', 'NOUN'), ('رسميا', 'ADJ'), ('.', 'PUNCT')]
