In [None]:
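# Uncomment to install everything this notebook imports (evaluate, tensorboard, onnx and onnxruntime are used further down):
# %pip install transformers datasets evaluate seqeval accelerate tensorboard onnx onnxruntime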

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub up front; the training cell sets push_to_hub=True.
notebook_login()

In [None]:
def read_ner_file(filepath):
    """Parse a tab-separated NER file into sentences of ``(word, tag)`` pairs.

    The file is read as UTF-8, one token per line. Each line is stripped of
    surrounding whitespace; a line that is empty after stripping closes the
    sentence collected so far. A non-empty line is split on tabs and kept only
    when it yields exactly two fields; any other line is skipped.

    Args:
        filepath: Path of the file to read.

    Returns:
        A list of sentences, each a non-empty list of ``(word, tag)`` tuples.
    """
    collected = []
    current = []
    with open(filepath, encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                fields = stripped.split("\t")
                if len(fields) == 2:
                    current.append(tuple(fields))
                continue
            # Blank line: close the running sentence, if there is one.
            if current:
                collected.append(current)
                current = []
    # The file may end without a trailing blank line.
    if current:
        collected.append(current)
    return collected


root_data_Path = "NERDataset/"
shard_numbers = range(1, 6)

# Read the five dataset shards in order and pool their sentences into one list.
raw_data_merged = []
for i in shard_numbers:
    dataPath = "{}Persian-NER-part{}.txt".format(root_data_Path, i)
    print(f"Reading data from: {dataPath}")
    raw_data_merged += read_ner_file(dataPath)

print(f"Total sentences read: {len(raw_data_merged)}")

# Preview: print the first sentence as tab-separated word/tag rows.
preview = raw_data_merged[:1]
for i, sentence in enumerate(preview):
    print(f"Sentence {i + 1}:")
    for word, tag in sentence:
        print(word, tag, sep="\t")
    print()

from collections import Counter

# Frequency tables over every (word, tag) pair in the merged corpus.
word_count = Counter(
    word for sentence in raw_data_merged for word, _tag in sentence
)
tag_count = Counter(
    tag for sentence in raw_data_merged for _word, tag in sentence
)
# The distinct tags are exactly the keys of the tag counter.
tags = set(tag_count)

print(f"Total unique words: {len(word_count)}")
print(f"Total unique tags: {len(tags)}")
print(f"Most common words: {word_count.most_common(10)}")

In [None]:
from datasets import Dataset
import random

# Fixed seed so the shuffle, and therefore the train/test split and every metric
# computed on it, is the same on each run of the notebook.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.9

# Tags are sorted before numbering so the tag <-> id mapping is deterministic.
unique_tags = sorted(tags)
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

# One record per sentence: the word list and the parallel list of tag ids.
tokenized_sentences = [
    {
        "tokens": [word for word, _tag in sentence],
        "ner_tags": [tag2id[tag] for _word, tag in sentence],
    }
    for sentence in raw_data_merged
]

# A dedicated seeded generator keeps the split reproducible without touching the
# state of the global `random` module.
random.Random(SPLIT_SEED).shuffle(tokenized_sentences)
split_at = int(TRAIN_FRACTION * len(tokenized_sentences))
train_data = tokenized_sentences[:split_at]
test_data = tokenized_sentences[split_at:]

train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)

In [None]:
from transformers import AutoTokenizer

# Base checkpoint name; the tokenizer here and the model in a later cell are both loaded from it.
model_ckpt = "shekar-ai/albert-base-v2-persian-zwnj-naab-mlm"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize_and_align_labels(example):
    """Tokenize one pre-split sentence and align its tag ids to the sub-tokens.

    The module-level ``tokenizer`` is called on ``example["tokens"]`` with
    ``truncation=True`` and ``is_split_into_words=True``. For every position
    reported by ``word_ids()``:

    * positions with no source word (``None``) get ``-100``;
    * the first position of each word gets that word's id from
      ``example["ner_tags"]``;
    * further positions of the same word get ``-100``.

    Args:
        example: Mapping with ``"tokens"`` (list of words) and ``"ner_tags"``
            (tag ids, indexed by word position).

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    encoding = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    last_word = None
    for current_word in encoding.word_ids():
        starts_new_word = current_word is not None and current_word != last_word
        if starts_new_word:
            aligned.append(example["ner_tags"][current_word])
        else:
            # No source word, or a continuation piece of the previous word.
            aligned.append(-100)
        last_word = current_word

    encoding["labels"] = aligned
    return encoding


# Run the alignment function one example at a time (batched=False) over both splits;
# each mapped example gains the tokenizer outputs and a "labels" entry.
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=False)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=False)

In [None]:
from transformers import AutoModelForTokenClassification

# Load the base checkpoint for token classification with one label per tag,
# passing the tag <-> id mappings along with it.
model = AutoModelForTokenClassification.from_pretrained(
    model_ckpt, num_labels=len(tag2id), id2label=id2tag, label2id=tag2id
)

In [None]:
from transformers import DataCollatorForTokenClassification

# Batch collator for token classification built around the tokenizer
# (presumably pads inputs and labels per batch — confirm in the transformers docs).
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
from transformers import TrainingArguments, Trainer
import evaluate

# Load the "seqeval" metric through the evaluate library; compute_metrics calls its .compute().
seqeval = evaluate.load("seqeval")


def compute_metrics(p):
    """Score token-classification predictions with seqeval and return flat metrics.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` must support
            ``argmax(axis=-1)`` (class scores on the last axis); ``labels``
            holds tag ids, with ``-100`` marking positions to ignore.

    Returns:
        A flat ``dict`` of metrics. Scalar entries returned by
        ``seqeval.compute`` keep their original keys; nested per-entity dicts
        are unpacked into ``"<entity>_<metric>"`` keys so that every value is
        a scalar that metric loggers can record.
    """
    predictions, labels = p
    predicted_ids = predictions.argmax(axis=-1)

    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(predicted_ids, labels):
        # Keep only positions that carry a real label (-100 marks ignored ones).
        kept = [
            (pred_id, label_id)
            for pred_id, label_id in zip(pred_row, label_row)
            if label_id != -100
        ]
        true_predictions.append([id2tag[pred_id] for pred_id, _ in kept])
        true_labels.append([id2tag[label_id] for _, label_id in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    # Unpack nested per-entity dictionaries; non-scalar metric values are not
    # usable by scalar loggers such as TensorBoard.
    flat_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            for sub_key, sub_value in value.items():
                flat_results[f"{key}_{sub_key}"] = sub_value
        else:
            flat_results[key] = value
    return flat_results


# All TrainingArguments options gathered in one mapping, grouped by concern.
training_options = dict(
    output_dir="./ner-persian-model",
    # Optimisation
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save once per epoch, then reload the best checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Reporting and publishing
    report_to="tensorboard",
    push_to_hub=True,
    hub_model_id="shekar-ai/albert-base-v2-persian-ner",
)
training_args = TrainingArguments(**training_options)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# NOTE(review): confirm that the weights left on the Hub after training are the
# best checkpoint; an explicit trainer.push_to_hub() afterwards may be needed.
trainer.train()

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Reload tokenizer and model by repo id (the same id the training cell uses as
# hub_model_id). Note: this rebinds the `tokenizer` and `model` names used during training.
model_path = "shekar-ai/albert-base-v2-persian-ner"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# "ner" pipeline with aggregation_strategy="simple"; the demo cell reads
# 'word', 'entity_group' and 'score' from each result.
ner_pipeline = pipeline(
    "ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple"
)

In [None]:
# Sample Persian sentences for a quick qualitative check of the pipeline.
examples = [
    "علی‌رضا در تهران به دنیا آمد.",
    "شرکت دیجی‌کالا در سال ۱۳۸۵ تأسیس شد.",
    "مریم در دانشگاه شریف درس می‌خواند.",
    "ولادیمیر پوتین رئیس‌جمهور روسیه است.",
    "گوگل در کالیفرنیا قرار دارد.",
]

for sentence in examples:
    print(f"\n📌 Input: {sentence}")
    entities = ner_pipeline(sentence)
    for ent in entities:
        # Pull out the three fields shown for each detected entity.
        word, label, score = ent["word"], ent["entity_group"], ent["score"]
        print(f"→ Entity: {word} | Label: {label} | Score: {score:.2f}")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Repo id of the fine-tuned model; swap in your own Hugging Face repo id if you pushed elsewhere.
model_path = "shekar-ai/albert-base-v2-persian-ner"
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model.eval()

# Sample sentence, tokenized to PyTorch tensors, used as the example input for the export.
text = "آقای علی‌رضا رضایی در تهران به دنیا می‌آمد. "
inputs = tokenizer(text, return_tensors="pt")

onnx_path = "ner_persian_model.onnx"
input_names = ["input_ids", "attention_mask"]
output_names = ["logits"]
# Axes 0 and 1 of every named tensor are declared dynamic (batch and sequence length).
dynamic_axes = {
    name: {0: "batch_size", 1: "seq_length"} for name in input_names + output_names
}

torch.onnx.export(
    model,
    tuple(inputs[name] for name in input_names),
    onnx_path,
    input_names=input_names,
    output_names=output_names,
    dynamic_axes=dynamic_axes,
    opset_version=14,
)

print(f"✅ NER model exported to {onnx_path}")

In [None]:
from onnxruntime.quantization import quantize_dynamic, QuantType

# Input is the file name written by the ONNX export cell; output is the quantized copy.
input_onnx = "ner_persian_model.onnx"
output_onnx = "ner_persian_model_quantized.onnx"

# Dynamic quantization with QInt8 as the weight type.
quantize_dynamic(
    model_input=input_onnx, model_output=output_onnx, weight_type=QuantType.QInt8
)

print("✅ Quantized model saved as:", output_onnx)