In [1]:
# ============================
# Arabic NER with CRF (WikiANN)
# ============================

!pip install -q datasets seqeval sklearn-crfsuite

from datasets import load_dataset
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from seqeval.metrics import classification_report, f1_score

# ----------------------------
# 1) Load Arabic WikiANN NER
# ----------------------------
# Load the Arabic ("ar") configuration of WikiANN; the returned object is
# indexed by split name ("train" / "validation" / "test") below.
dataset = load_dataset("wikiann", "ar")

train_ds = dataset["train"]
val_ds   = dataset["validation"]
test_ds  = dataset["test"]

# Sizes of the full splits as loaded, i.e. BEFORE the optional subsetting below.
print("Train size:", len(train_ds))
print("Val size:  ", len(val_ds))
print("Test size: ", len(test_ds))

# Label id -> name, e.g. 0:'O', 1:'B-PER', ...
label_names = train_ds.features["ner_tags"].feature.names
print("Labels:", label_names)

# (Optional) use a smaller subset to train faster while testing
# Comment these 3 lines if you want full dataset
# Each cap is clamped with min() so that a split holding fewer rows than the
# cap is kept whole instead of making select() fail on out-of-range indices.
train_ds = train_ds.select(range(min(20000, len(train_ds))))
val_ds   = val_ds.select(range(min(2000, len(val_ds))))
test_ds  = test_ds.select(range(min(2000, len(test_ds))))

# ----------------------------
# 2) Convert HF dataset to lists of tokens + labels
# ----------------------------
def hf_to_sentences(ds):
    """
    Turn a dataset split into per-sentence lists of (word, label) pairs.

    ds: iterable of examples, each exposing 'tokens' (list of words) and
        'ner_tags' (list of integer label ids)
    Returns: list with one entry per example; each entry is a list of
             (word, label_str) tuples, where label_str is looked up in the
             module-level `label_names` by label id
    """
    return [
        list(zip(row["tokens"], [label_names[tag_id] for tag_id in row["ner_tags"]]))
        for row in ds
    ]

# Convert the three splits, in train / validation / test order.
train_sents, val_sents, test_sents = map(
    hf_to_sentences, (train_ds, val_ds, test_ds)
)

print("#train sentences:", len(train_sents))
print("#val sentences:  ", len(val_sents))
print("#test sentences: ", len(test_sents))


# ----------------------------
# 3) Features for each token
# ----------------------------
def word2features(sent, i):
    """
    Build the CRF feature dict for one token of a sentence.

    sent: list of (word, label) tuples; only the word part is read
    i: position of the token inside `sent`
    Returns: dict mapping feature name -> value, describing the token itself
             plus its left and right neighbour (or a BOS / EOS marker when
             there is no neighbour on that side)
    """
    token = sent[i][0]

    # Features of the token itself.
    feats = {"bias": 1.0}
    feats["word"] = token
    feats["word.lower()"] = token.lower()
    feats["word.isupper()"] = token.isupper()
    feats["word.istitle()"] = token.istitle()
    feats["word.isdigit()"] = token.isdigit()
    feats["suffix3"] = token[-3:]
    feats["suffix2"] = token[-2:]

    # Left context: sentence-start marker, or the previous word's features.
    if i <= 0:
        feats["BOS"] = True
    else:
        left = sent[i - 1][0]
        feats["-1:word"] = left
        feats["-1:word.lower()"] = left.lower()
        feats["-1:word.istitle()"] = left.istitle()
        feats["-1:word.isupper()"] = left.isupper()

    # Right context: sentence-end marker, or the next word's features.
    if i >= len(sent) - 1:
        feats["EOS"] = True
    else:
        right = sent[i + 1][0]
        feats["+1:word"] = right
        feats["+1:word.lower()"] = right.lower()
        feats["+1:word.istitle()"] = right.istitle()
        feats["+1:word.isupper()"] = right.isupper()

    return feats

def sent2features(sent):
    """Return one feature dict per token of `sent`, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label half of every (word, label) pair in `sent`."""
    tags = []
    for _word, tag in sent:
        tags.append(tag)
    return tags

def sent2tokens(sent):
    """Return the word half of every (word, label) pair in `sent`."""
    words = []
    for token, _tag in sent:
        words.append(token)
    return words


# ----------------------------
# 4) Prepare data for CRF
# ----------------------------
# X_* hold per-sentence lists of feature dicts, y_* the matching label lists.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_val   = list(map(sent2features, val_sents))
y_val   = list(map(sent2labels, val_sents))

X_test  = list(map(sent2features, test_sents))
y_test  = list(map(sent2labels, test_sents))


# ----------------------------
# 5) Train CRF model
# ----------------------------
# Configure the CRF tagger. Parameter meanings per the sklearn-crfsuite API
# documentation:
#   algorithm="lbfgs"             - train with L-BFGS gradient descent
#   c1 / c2                       - L1 / L2 regularisation coefficients
#   max_iterations                - cap on optimiser iterations
#   all_possible_transitions=True - also create transition features for label
#                                   pairs that never occur in the training data
crf = sklearn_crfsuite.CRF(
    algorithm="lbfgs",
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)

print("Training CRF NER model...")
# Fit on the training sentences only; validation/test are used for scoring later.
crf.fit(X_train, y_train)
print("Training done.")

# ----------------------------
# 6) Evaluate on validation + test
# ----------------------------
# Predict one label sequence per sentence for the held-out splits.
y_val_pred = crf.predict(X_val)
y_test_pred = crf.predict(X_test)

# f1_score / classification_report are the seqeval versions imported at the
# top of the cell; they compare the gold and predicted tag sequences, and the
# report is broken down per entity type (LOC / ORG / PER in the output below).
print("Validation F1:", f1_score(y_val, y_val_pred))
print("Test F1:", f1_score(y_test, y_test_pred))
print("\nDetailed classification report (test):")
print(classification_report(y_test, y_test_pred))


# ----------------------------
# 7) Predict on a custom sentence
# ----------------------------
def predict_sentence_crf(tokens):
    """
    Tag a single, already tokenised sentence with the trained CRF.

    tokens: list of Arabic words
    Returns: list of (token, predicted_label) tuples
    """
    # The feature extractor expects (word, label) pairs, so attach a
    # placeholder label to every token; the label itself is never read.
    placeholder_sent = []
    for token in tokens:
        placeholder_sent.append((token, "O"))

    predicted = crf.predict_single(sent2features(placeholder_sent))
    return list(zip(tokens, predicted))

# Demo input: an already tokenised Arabic sentence, one list item per token
# (the final "." is a token of its own).
example = ["سافر", "الرئيس", "إلى", "دمشق", "أمس", "."]
print("\nExample prediction:")
# Prints the list of (token, predicted_label) pairs returned by the tagger.
print(predict_sentence_crf(example))


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 43.6/43.6 kB 1.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 23.1 MB/s eta 0:00:00
  Building wheel for seqeval (setup.py) ... done


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

ar/validation-00000-of-00001.parquet:   0%|          | 0.00/643k [00:00<?, ?B/s]

ar/test-00000-of-00001.parquet:   0%|          | 0.00/648k [00:00<?, ?B/s]

ar/train-00000-of-00001.parquet:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Train size: 20000
Val size:   10000
Test size:  10000
Labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
#train sentences: 20000
#val sentences:   2000
#test sentences:  2000
Training CRF NER model...
Training done.
Validation F1: 0.8264236902050114
Test F1: 0.8311273391145594

Detailed classification report (test):
              precision    recall  f1-score   support

         LOC       0.87      0.83      0.85       764
         ORG       0.86      0.79      0.82       724
         PER       0.85      0.80      0.82       774

   micro avg       0.86      0.81      0.83      2262
   macro avg       0.86      0.80      0.83      2262
weighted avg       0.86      0.81      0.83      2262


Example prediction:
[('سافر', 'O'), ('الرئيس', 'O'), ('إلى', 'O'), ('دمشق', 'B-LOC'), ('أمس', 'I-LOC'), ('.', 'O')]
