<a href="https://colab.research.google.com/github/saragamilmohamed/Arabic-Part-of-speech/blob/main/Arabic_part_of_speech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Transformers installation
! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git



# Token classification

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load dataset

In [None]:
pip install pyconll



In [None]:
import pyconll
data= pyconll.load_from_file("/content/Arabic_POS.conllu")

Collect the word forms and UPOS tags of every sentence, then take a look at an example:

In [None]:
# Parallel lists: sentences[i] holds the word forms of sentence i and
# labels[i] holds the UPOS tag of each of those words.
sentences, labels = [], []

for conll_sentence in data:
    # Keep only tokens that carry both a surface form and a UPOS tag.
    tagged_words = [
        (tok.form, tok.upos)
        for tok in conll_sentence
        if tok.form and tok.upos
    ]
    sentences.append([form for form, _ in tagged_words])
    labels.append([upos for _, upos in tagged_words])

In [None]:
sentences[0]

['برلين',
 'ترفض',
 'حصول',
 'شركة',
 'اميركية',
 'على',
 'رخصة',
 'تصنيع',
 'دبابة',
 '"',
 'ليوبارد',
 '"',
 'الالمانية']

In [None]:
labels[0]

['X',
 'VERB',
 'NOUN',
 'NOUN',
 'ADJ',
 'ADP',
 'NOUN',
 'NOUN',
 'NOUN',
 'PUNCT',
 'X',
 'PUNCT',
 'ADJ']

In [None]:
len(sentences),len(labels)

(6075, 6075)

## Splitting data

In [None]:
from sklearn.model_selection import train_test_split

# First cut: hold out 20% of the sentences; the rest is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=42,
)
# Second cut: 30% of the held-out part becomes the test set, the remainder
# is the validation set.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.3,
    random_state=42,
)


In [None]:
X_train[0]

['كشفت',
 'الأوراق',
 'و',
 'المستندات',
 'أن',
 'المستشفى',
 'تهرب',
 'من',
 'طلبات',
 'المستشفيات',
 'الحكومية',
 'من',
 'الدم',
 'ب',
 'اعتبار',
 'ها',
 'تأخذ',
 'الدم',
 'مجاناً',
 'و',
 'كان',
 'يتم',
 'إبلاغ',
 'غرفة',
 'الطوارئ',
 'ب',
 'وزارة',
 'الصحة',
 'ب',
 'كميات',
 'دم',
 'أقل',
 'من',
 'ما',
 'هو',
 'مدون',
 'ب',
 'الأوراق',
 'الرسمية',
 'ل',
 'عدم',
 'الاستعانة',
 'ب',
 'كميات',
 'الدم',
 'في',
 'الحوادث',
 'و',
 'حالات',
 'النزيف',
 'الحاد',
 '.']

In [None]:
len(X_train), len(X_val), len(X_test)

(4860, 850, 365)

In [None]:
# Tag inventory is taken from the training split only, in alphabetical order.
unique_labels = sorted({tag for sent in y_train for tag in sent})
# Tag string -> integer id, and the inverse mapping.
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(enumerate(unique_labels))

In [None]:
id2label

{0: 'ADJ',
 1: 'ADP',
 2: 'ADV',
 3: 'AUX',
 4: 'CCONJ',
 5: 'DET',
 6: 'INTJ',
 7: 'NOUN',
 8: 'NUM',
 9: 'PART',
 10: 'PRON',
 11: 'PROPN',
 12: 'PUNCT',
 13: 'SCONJ',
 14: 'SYM',
 15: 'VERB',
 16: 'X'}

The next step is to load the Arabic BERT tokenizer (`asafaya/bert-base-arabic`) to preprocess the `tokens` field:

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/334k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
example = sentences[0]
tokenized_input = tokenizer(example, is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'برلين',
 'ترفض',
 'حصول',
 'شركة',
 'اميركية',
 'على',
 'رخصة',
 'تصنيع',
 'دب',
 '##ابة',
 '"',
 'ليو',
 '##بار',
 '##د',
 '"',
 'الالمانية',
 '[SEP]']

In [None]:
def tokenize_and_align_labels(examples, max_length=512):
    """Tokenize pre-split sentences and align word-level tags with sub-word tokens.

    Args:
        examples: A batch with a ``"tokens"`` column (lists of words) and a
            ``"ner_tags"`` column (the tag string of every word).
        max_length: Maximum number of sub-word tokens kept per sentence.
            512 is the usual BERT position limit -- TODO confirm against the
            checkpoint's config.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry: one list of
        label ids per sentence, the same length as its ``input_ids``.
        Special tokens, non-first sub-tokens of a word, and tags missing from
        ``label2id`` are set to -100, the value ``align_predictions`` skips.
    """
    # An explicit max_length is required for truncation to take effect: the
    # recorded run warned that, without one, both truncation and
    # padding="max_length" were silently disabled. Padding is left to the
    # data collator, which pads each batch dynamically.
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        max_length=max_length,
        is_split_into_words=True,
    )

    all_label_ids = []
    for i, word_tags in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a continuation piece of the previous word.
                label_ids.append(-100)
            else:
                # First sub-token of a new word carries the word's tag.
                label_ids.append(label2id.get(word_tags[word_idx], -100))
            previous_word_idx = word_idx
        all_label_ids.append(label_ids)

    tokenized_inputs["labels"] = all_label_ids
    return tokenized_inputs

To apply the preprocessing function over the entire dataset, use 🤗 Datasets [map](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map) function. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once:

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd


def _pair_frame(token_lists, tag_lists):
    """Build a two-column frame pairing each sentence's tokens with its tags."""
    return pd.DataFrame({'tokens': token_lists, 'ner_tags': tag_lists})


# One frame per split; the column names are the ones tokenize_and_align_labels reads.
train_df = _pair_frame(X_train, y_train)
val_df = _pair_frame(X_val, y_val)
test_df = _pair_frame(X_test, y_test)

# Convert every frame into a datasets.Dataset.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

# Bundle the splits, then tokenize and align labels batch-wise.
tokenized_data = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset,
})
tokenized_data = tokenized_data.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/4860 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/850 [00:00<?, ? examples/s]

Map:   0%|          | 0/365 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

In [None]:
data_collator

DataCollatorForTokenClassification(tokenizer=BertTokenizerFast(name_or_path='asafaya/bert-base-arabic', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), padding=True, max_length=None, pad_to_multiple_of=None

## Evaluate

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=00d419a08f14ca8b575b6366fd581dd99e3566077020118901a60e677f2b90c4
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
import evaluate

seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

Get the POS labels first, and then create a function that passes your true predictions and true labels to [compute](https://huggingface.co/docs/evaluate/main/en/package_reference/main_classes#evaluate.EvaluationModule.compute) to calculate the scores:

In [None]:
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

def align_predictions(predictions, label_ids, label_map=None):
    """Turn logits and padded label ids into per-sentence tag-string lists.

    Args:
        predictions: Array indexed as (sentence, position, class) holding the
            class scores; the arg-max over the last axis is the predicted id.
        label_ids: Array-like indexed as (sentence, position) holding the gold
            label ids, with -100 at positions that must be ignored.
        label_map: Optional mapping from label id to tag string. Defaults to
            the notebook-level ``id2label``.

    Returns:
        ``(true_predictions, true_labels)``: two lists with one inner list per
        sentence, containing the predicted and gold tag strings at every
        position whose gold label is not -100.
    """
    # Imported locally because the notebook's top-level numpy import lives in
    # a later cell than this definition.
    import numpy as np

    if label_map is None:
        label_map = id2label

    label_ids = np.asarray(label_ids)
    preds = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(preds, label_ids):
        keep = label_row != -100  # mask out ignored positions
        true_labels.append([label_map[int(t)] for t in label_row[keep]])
        true_predictions.append([label_map[int(p)] for p in pred_row[keep]])

    return true_predictions, true_labels


With the alignment helper in place, define the `compute_metrics` function; you'll return to it when you set up your training.

In [None]:
def compute_metrics(p):
    """Compute token-level accuracy, F1, precision and recall for POS tagging.

    Args:
        p: A ``(predictions, labels)`` pair as accepted by
            ``align_predictions``.

    Returns:
        Dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``. Precision, recall and F1 are macro-averaged over tags.
    """
    # seqeval's precision/recall/F1 score IOB-style entity chunks, while the
    # labels here are plain UPOS tags (see id2label) with no B-/I- prefixes.
    # Score individual tokens instead, matching the per-token report produced
    # in the final evaluation cell.
    from sklearn.metrics import precision_recall_fscore_support

    predictions, labels = p
    preds, refs = align_predictions(predictions, labels)

    flat_refs = [tag for sent in refs for tag in sent]
    flat_preds = [tag for sent in preds for tag in sent]

    precision, recall, f1, _ = precision_recall_fscore_support(
        flat_refs, flat_preds, average="macro", zero_division=0
    )

    return {
        "accuracy": accuracy_score(refs, preds),
        "f1": float(f1),
        "precision": float(precision),
        "recall": float(recall),
    }


## Train

In [None]:
from transformers import create_optimizer

batch_size = 16
num_train_epochs = 3
num_train_steps = (len(tokenized_data["train"]) // batch_size) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

In [None]:
from transformers import TFAutoModelForTokenClassification

# Size the classification head from the label map instead of a hard-coded
# count, so it always matches the tags found in the training split.
model = TFAutoModelForTokenClassification.from_pretrained(
    "asafaya/bert-base-arabic",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Reuse `batch_size` from the optimizer cell: `num_train_steps` was computed
# from it, so a separate hard-coded value here could drift out of sync with
# the learning-rate schedule.
tf_train_set = model.prepare_tf_dataset(
    tokenized_data["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_data["validation"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [None]:
import tensorflow as tf

model.compile(optimizer=optimizer)  # No loss argument!

In [None]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [None]:
from transformers.keras_callbacks import PushToHubCallback
from huggingface_hub import HfFolder, notebook_login, login


# Prompt for Hugging Face credentials (the recorded output shows a login widget).
login()


# Read the token back from HfFolder -- presumably the one stored by the
# login step above; verify.
token = HfFolder.get_token()

# Callback that pushes the model and tokenizer to the Hub. The recorded run
# cloned https://huggingface.co/saraaaaaaaaaaaaaa/first_pos_project into
# this output directory.
push_to_hub_callback = PushToHubCallback(
    output_dir="saraaaaaaaaaaaaaa/first_pos_project",
    tokenizer=tokenizer,
    hub_token=token,
)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/saraaaaaaaaaaaaaa/first_pos_project into local empty directory.


Download file tf_model.h5:   0%|          | 6.86k/420M [00:00<?, ?B/s]

Clean file tf_model.h5:   0%|          | 1.00k/420M [00:00<?, ?B/s]

Then bundle your callbacks together:

In [None]:
callbacks = [metric_callback, push_to_hub_callback]

In [None]:
!git config --global user.email "sgamil677@gmail.com"
!git config --global user.name "saraaaaaaaaaaaaaa"

In [None]:
import numpy as np

In [None]:
# Train for `num_train_epochs`, the same value used to size the learning-rate
# schedule in the optimizer cell.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_train_epochs, callbacks=callbacks)

Epoch 1/3

Great, now that you've finetuned a model, you can use it for inference!

Grab some text you'd like to run inference on:

In [None]:
text = "هنا الدُّنيا وهُناك .. هناك الجنَّة حيثُ لَا شَيء يَسلُبه الزَّمَانُ مِنك"

In [None]:
from transformers import pipeline

# Load the fine-tuned checkpoint pushed during training. The base
# "asafaya/bert-base-arabic" checkpoint has no trained token-classification
# head, so its tag predictions would be meaningless.
classifier = pipeline("ner", model="saraaaaaaaaaaaaaa/first_pos_project")
classifier(text)

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
inputs = tokenizer(text, return_tensors="tf")

Pass your inputs to the model and return the `logits`:

In [None]:
from transformers import TFAutoModelForTokenClassification

# Reload the fine-tuned weights from the Hub repo written by the push callback.
# Loading the base checkpoint here would replace the trained `model` with one
# whose classification head is freshly initialised, and the evaluation cell
# below would then score that untrained head.
model = TFAutoModelForTokenClassification.from_pretrained("saraaaaaaaaaaaaaa/first_pos_project")
logits = model(**inputs).logits

In [None]:
predicted_token_class_ids = tf.math.argmax(logits, axis=-1)
predicted_token_class = [model.config.id2label[t] for t in predicted_token_class_ids[0].numpy().tolist()]
predicted_token_class

## Evaluate Model

In [None]:
import numpy as np
from tqdm import tqdm
from sklearn.metrics import classification_report

# Prepare test data
tf_test_set = model.prepare_tf_dataset(
    tokenized_data["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

# Per-sentence predicted / gold tag strings, accumulated batch by batch.
# Aligning inside the loop avoids concatenating batches whose padded sequence
# lengths differ, which np.concatenate cannot do.
y_pred = []
y_true = []

for features, label_ids in tqdm(tf_test_set, desc="Processing batches"):
    # Each batch is a (features, labels) pair; only the features go to the model.
    logits = model(features, training=False).logits.numpy()
    batch_pred, batch_true = align_predictions(logits, label_ids.numpy())
    y_pred.extend(batch_pred)
    y_true.extend(batch_true)

# sklearn's classification_report expects flat label sequences, not one list
# per sentence.
flat_true = [tag for sent in y_true for tag in sent]
flat_pred = [tag for sent in y_pred for tag in sent]

print(classification_report(flat_true, flat_pred, zero_division=0))
