In [None]:
from datasets import load_dataset
import json

# --- Load CSV (expects a column named "persian") ---
ds = load_dataset("csv", data_files="dara/test.csv", split="train")

# Work on a small sample: the first N_SAMPLE_ROWS rows, or every row when the
# file holds fewer than that.
N_SAMPLE_ROWS = 50
ds_sample = ds.select(range(min(N_SAMPLE_ROWS, len(ds))))

# --- NER pipeline (token classification) ---




In [19]:
from transformers import pipeline

# The model and its tokenizer are loaded from the same checkpoint id.
_ner_checkpoint = "HooshvareLab/bert-fa-base-uncased-ner-peyma"

# aggregation_strategy="simple" asks the pipeline to return grouped entity
# spans rather than one prediction per token.
ner_fa = pipeline(
    task="token-classification",
    model=_ner_checkpoint,
    tokenizer=_ner_checkpoint,
    aggregation_strategy="simple",
)

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased-ner-peyma were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [20]:
# --- Run NER in batches and JSON-serialize safely (only label+text) ---
# Canonical spelling for each label variant handled here. Labels that are not
# listed pass through unchanged (upper-cased, prefix removed).
_LABEL_ALIASES = {
    "PER": "PERSON", "PERS": "PERSON", "PERSON": "PERSON",
    "ORG": "ORG", "ORGANIZATION": "ORG",
    "LOC": "LOCATION", "LOCATION": "LOCATION", "GPE": "LOCATION",
    "FAC": "FACILITY", "FACILITY": "FACILITY",
    "DAT": "DATE", "DATE": "DATE",
    "TIM": "TIME", "TIME": "TIME",
    "MON": "MONEY", "MONEY": "MONEY",
    "PCT": "PERCENT", "PERCENT": "PERCENT",
    "QUANTITY": "QUANTITY", "CARDINAL": "CARDINAL", "ORDINAL": "ORDINAL",
    "MISC": "MISC", "EVENT": "EVENT", "PRODUCT": "PRODUCT",
    "WORK_OF_ART": "WORK_OF_ART", "LAW": "LAW", "LANGUAGE": "LANGUAGE",
    "NORP": "NORP",
}

# Two-character tag prefixes removed from a label before alias lookup.
_TAG_PREFIXES = ("B_", "I_", "B-", "I-")


def _norm_label(lbl) -> str:
    """Return the canonical upper-case label for ``lbl`` ("" for a falsy label)."""
    if not lbl:
        return ""
    s = str(lbl).strip()
    if s[:2] in _TAG_PREFIXES:
        s = s[2:]
    s = s.upper()
    return _LABEL_ALIASES.get(s, s)


def _to_minimal(ents):
    """Reduce entity dicts to ``{"label", "text"}``, skipping incomplete ones.

    The label is read from ``entity_group`` (falling back to ``entity``) and
    the text from ``word`` (falling back to ``text``). An entity lacking
    either is dropped. Everything else in the dict is discarded, so only
    plain strings reach ``json.dumps``.
    """
    return [
        {"label": _norm_label(e.get("entity_group") or e.get("entity") or ""),
         "text":  (e.get("word") or e.get("text") or "").strip()}
        for e in (ents or [])
        if (e.get("entity_group") or e.get("entity")) and (e.get("word") or e.get("text"))
    ]


def run_ner(batch):
    """Add a ``fa_ner`` column of JSON-encoded entities to a batch.

    Args:
        batch: Mapping with a ``"persian"`` key holding a list of texts, as
            supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The same ``batch`` object with ``batch["fa_ner"]`` set to one JSON
        string per input row. Each string encodes a list of
        ``{"label": ..., "text": ...}`` objects. Rows whose text is missing
        (``None``), not a string, or blank get ``"[]"`` and are never sent to
        the model.

    Uses the module-level ``ner_fa`` pipeline and the ``json`` module.
    """
    texts = batch["persian"]                  # list[str | None]

    # Only non-blank strings are worth tagging. A missing CSV cell arrives as
    # None, which the pipeline cannot tokenise.
    keep = [i for i, t in enumerate(texts) if isinstance(t, str) and t.strip()]
    per_row = [[] for _ in texts]

    if keep:
        raw = ner_fa([texts[i] for i in keep])
        # NOTE(review): older transformers releases are believed to return a
        # lone input unwrapped, as a flat list of entity dicts (or an empty
        # list), rather than a list of lists. Re-wrap so the zip below pairs
        # rows with results correctly.
        if len(keep) == 1 and (not raw or isinstance(raw[0], dict)):
            raw = [raw]
        for i, ents in zip(keep, raw):
            per_row[i] = ents

    batch["fa_ner"] = [json.dumps(_to_minimal(ents), ensure_ascii=False) for ents in per_row]
    return batch

# Tag the sample 32 rows at a time; run_ner receives each batch as a dict of
# column lists and adds the "fa_ner" column to it.
ds_out = ds_sample.map(
    run_ner,
    batch_size=32,
    batched=True,
)

# --- Save ---
_out_csv = "fa_ner.csv"
ds_out.to_csv(_out_csv)
print("Saved fa_ner.csv with column 'fa_ner' containing only label+text per entity.")


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved fa_ner.csv with column 'fa_ner' containing only label+text per entity.
