# NER for Arabic

# model 1: CAMeLBERT MSA NER (camel_tools)
https://huggingface.co/CAMeL-Lab/bert-base-arabic-camelbert-msa-ner

Load the model.

In [1]:
#!pip install camel_tools
# camel_tools provides the NER wrapper and the word tokenizer used in the cells below.
from camel_tools.ner import NERecognizer
from camel_tools.tokenizers.word import simple_word_tokenize
# Build the recognizer from the CAMeL-Lab MSA NER checkpoint, identified by its Hugging Face model id.
ner = NERecognizer('CAMeL-Lab/bert-base-arabic-camelbert-msa-ner')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-msa-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identica

Run the model on a sample passage.

In [2]:
# Sample Arabic passage reused as the input for every model in this notebook.
# It contains no punctuation between its sentences.
text = """سافر ليلى إلى مدينة فاس لزيارة أختها سارة تجولت ليلى في شوارع المدينة القديمة وقد أعجبت بجمالها وتاريخها بعد ذلك سافرت إلى مدينة مراكش لمشاهدة السوق المشهور والقصر الجميل"""

# The recognizer is given a list of word tokens rather than the raw string.
sentence = simple_word_tokenize(text)
# Despite its name, `doc` holds the predicted tags: it is zipped against
# `sentence` in the next cell to print one tag per token.
doc = ner.predict_sentence(sentence)


In [3]:

# Show every token next to its predicted tag, with the token padded to 20 columns.
for word, tag in zip(sentence, doc):
    print(f"word : {word:20} , tag : ", tag)

word : سافر                 , tag :  O
word : ليلى                 , tag :  B-PERS
word : إلى                  , tag :  O
word : مدينة                , tag :  O
word : فاس                  , tag :  B-LOC
word : لزيارة               , tag :  O
word : أختها                , tag :  O
word : سارة                 , tag :  B-PERS
word : تجولت                , tag :  O
word : ليلى                 , tag :  B-PERS
word : في                   , tag :  O
word : شوارع                , tag :  O
word : المدينة              , tag :  O
word : القديمة              , tag :  O
word : وقد                  , tag :  O
word : أعجبت                , tag :  O
word : بجمالها              , tag :  O
word : وتاريخها             , tag :  O
word : بعد                  , tag :  O
word : ذلك                  , tag :  O
word : سافرت                , tag :  O
word : إلى                  , tag :  O
word : مدينة                , tag :  O
word : مراكش                , tag :  B-LOC
word : لمشاهدة              , tag :  O
wo

# model 2: stanza

https://stanfordnlp.github.io/stanza/

In [4]:
#!pip install stanza
import stanza

# Fetch the default Arabic model package; the recorded log shows the archive
# is reused when it already exists locally.
stanza.download("ar")
# Default Arabic pipeline.
# NOTE(review): the load log lists tokenize, mwt, pos, lemma, depparse and ner,
# while the cells visible here only read `doc.entities`. If nothing else needs
# the other annotations, consider restricting `processors` to speed up loading
# and inference on CPU — confirm against the rest of the notebook first.
nlp = stanza.Pipeline("ar")



Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: ar (Arabic) ...
INFO:stanza:File exists: /root/stanza_resources/ar/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| pos       | padt_charlm   |
| lemma     | padt_nocharlm |
| depparse  | padt_charlm   |
| ner       | aqmar_charlm  |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


In [5]:
# Annotate the sample passage with the stanza pipeline. This rebinds `doc`,
# which previously held the camel_tools tag list.
doc = nlp(text)

In [6]:
# Each item is an entity span (text, type, character offsets), not a single word.
for entity in doc.entities:
    print(entity)

{
  "text": "فاس",
  "type": "LOC",
  "start_char": 20,
  "end_char": 23
}
{
  "text": "مراكش",
  "type": "LOC",
  "start_char": 129,
  "end_char": 134
}


# model 3: arabic-ner

https://huggingface.co/hatmimoha/arabic-ner

In [7]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Token-classification pipeline backed by the hatmimoha/arabic-ner checkpoint.
# NOTE(review): `pipe` is not called in the cells visible here, and the same
# checkpoint is loaded again directly in the following cell — confirm this
# object is still needed before keeping the extra load.
pipe = pipeline("token-classification", model="hatmimoha/arabic-ner")

In [8]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Tokenizer and token-classification model are both loaded from the same
# checkpoint id, so the ids produced by one can be fed to the other.
tokenizer = AutoTokenizer.from_pretrained("hatmimoha/arabic-ner")
model = AutoModelForTokenClassification.from_pretrained("hatmimoha/arabic-ner")

In [9]:
import torch

# Encode the passage once, without [CLS]/[SEP], so that every position in the
# model output corresponds to a real sub-word of the text.
inputs = tokenizer(text, add_special_tokens=False, return_tensors="pt")

# Recover the sub-word strings from the very ids that are fed to the model.
# The previous version tokenized the text a second time (and passed a
# meaningless `return_tensors` to `tokenize`), so the printed tokens only
# lined up with the predictions if both calls happened to agree.
words = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())

# Inference only: skip gradient tracking.
with torch.no_grad():
    logits = model(**inputs).logits

# Highest-scoring class id per token; the leading batch dimension is kept.
predicted_token_class_ids = logits.argmax(-1)

In [10]:
# Translate the class ids of the single sequence in the batch into label strings.
predicted_tokens_classes = [
    model.config.id2label[class_id]
    for class_id in predicted_token_class_ids[0].tolist()
]

# One line per sub-word token: the token padded to 20 columns, then its label.
for word, tag in zip(words, predicted_tokens_classes):
    print(f"word : {word:20} , tag : ", tag)

word : سافر                 , tag :  O
word : ليلى                 , tag :  B-PERSON
word : الى                  , tag :  O
word : مدينة                , tag :  O
word : فاس                  , tag :  B-LOCATION
word : لزيارة               , tag :  O
word : اخته                 , tag :  O
word : ##ا                  , tag :  O
word : سارة                 , tag :  B-PERSON
word : تج                   , tag :  O
word : ##ولت                , tag :  O
word : ليلى                 , tag :  B-PERSON
word : في                   , tag :  O
word : شوارع                , tag :  O
word : المدينة              , tag :  O
word : القديمة              , tag :  O
word : وقد                  , tag :  O
word : اعجب                 , tag :  O
word : ##ت                  , tag :  O
word : بجمال                , tag :  O
word : ##ها                 , tag :  O
word : وتاريخ               , tag :  O
word : ##ها                 , tag :  O
word : بعد                  , tag :  O
word : ذلك                  , tag 