In [6]:
import torch
from transformers import AutoTokenizer
from arabert.preprocess import ArabertPreprocessor
# for text classification
from transformers import AutoModelForSequenceClassification

## araBERT Preprocessor

In [4]:
# Instantiate the araBERT text preprocessor for the "bert-base-arabert" model.
model_name="bert-base-arabert"
arabert_prep = ArabertPreprocessor(model_name=model_name)



100%|██████████| 241M/241M [04:11<00:00, 959kiB/s]  




## Initializing Model

In [52]:
# Load the tokenizer and a sequence-classification model from the same checkpoint id.
checkpoint = "aubmindlab/bert-base-arabert"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    return_dict=True,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
def classify(text):
    """Return class probabilities for a single piece of text.

    The text is run through the notebook-level ``arabert_prep`` preprocessor,
    tokenized with ``tokenizer`` and scored by ``model``.

    Args:
        text: Raw input string.

    Returns:
        Tensor of softmax probabilities over the model's classes, with one
        row for the input text.
    """
    text = arabert_prep.preprocess(text)
    # Truncate to the same 512-token limit used when tokenizing the training
    # data so that long inputs cannot overflow the encoder.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: no need to build an autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.logits.softmax(dim=-1)

## Inference

In [None]:
def classify(text):
    """Return class probabilities for a single piece of text.

    The text is run through the notebook-level ``arabert_prep`` preprocessor,
    tokenized with ``tokenizer`` and scored by ``model``.

    Args:
        text: Raw input string.

    Returns:
        Tensor of softmax probabilities over the model's classes, with one
        row for the input text.
    """
    text = arabert_prep.preprocess(text)
    # Truncate to the same 512-token limit used when tokenizing the training
    # data so that long inputs cannot overflow the encoder.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: no need to build an autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.logits.softmax(dim=-1)

In [11]:
# Try the classifier on one Arabic sentence and print the class probabilities.
text = "ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري"
probs = classify(text)
print(probs)

tensor([[0.4394, 0.5606]], grad_fn=<SoftmaxBackward0>)


# Training Setup

## Data Preprocessing

In [13]:
from datasets import load_dataset

# Load the "metrec" dataset; the result is used below as a dict of splits.
dataset = load_dataset("metrec")

Downloading data:   0%|          | 0.00/3.38M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/598k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [14]:
# Show the available splits with their columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 47124
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 8316
    })
})


In [15]:
# Display the train split on its own.
dataset["train"]

Dataset({
    features: ['text', 'label'],
    num_rows: 47124
})

In [16]:
# Look at the first training example (a text and its integer label).
dataset["train"][0]

{'text': 'يا ضيفَ طيفٍ ما هَداهُ لمَضجَعي # إلا لهيبٌ في الحشى يتوقّدُ',
 'label': 1}

### Applying araBERT Preprocessor

In [31]:
def preprocess_function(examples):
    """Preprocess and tokenize one batch of examples for ``dataset.map(batched=True)``.

    Args:
        examples: Batch dict mapping each column name to a list of values;
            ``examples["text"]`` holds the raw strings.

    Returns:
        The tokenizer output, with every original column repeated so that it
        lines up with the tokenized chunks.
    """
    # ArabertPreprocessor.preprocess takes a single string (as in classify),
    # so apply it to each text in the batch instead of passing the whole list.
    arabic_prep = [arabert_prep.preprocess(text) for text in examples["text"]]
    result = tokenizer(
        arabic_prep,
        truncation=True,
        max_length=512,
        return_overflowing_tokens=True,
    )

    # A text longer than max_length produces several chunks; copy the source
    # row's columns (including the label) onto each chunk it produced.
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result

In [32]:
# Apply preprocess_function to every split; batched=True passes lists of rows per call.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/47124 [00:00<?, ? examples/s]

Map:   0%|          | 0/8316 [00:00<?, ? examples/s]

In [33]:
from transformers import DataCollatorWithPadding

# Collator that pads examples at batch time using this tokenizer
# (preprocess_function does not pad).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Setup Evaluation

In [34]:
import evaluate

# Accuracy metric object used by calculate_metrics below.
accuracy = evaluate.load("accuracy")

In [35]:
from sklearn.metrics import f1_score, recall_score, accuracy_score
import numpy as np
def calculate_metrics(eval_pred):
    """Compute accuracy, weighted F1 and weighted recall for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds
            per-class scores and is arg-maxed along axis 1 to get class ids.

    Returns:
        Dict with numeric ``"accuracy"``, ``"f1"`` and ``"recall"`` entries.
    """
    scores, labels = eval_pred
    predictions = np.argmax(scores, axis=1)

    # compute() returns a dict ({"accuracy": value}); unwrap it so the metric
    # is a plain number like f1 and recall rather than a nested dict.
    acc = accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = f1_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')

    return {"accuracy": acc, "f1": f1, "recall": recall}

## Training

In [40]:
# Read the class names from the label feature's schema.
label_names = dataset["train"].features["label"].names

# Derive the class count from the same schema instead of counting the label
# values seen in the train split: id2label is built from label_names, so the
# two must always agree even if some class has no training rows.
num_labels = len(label_names)

print(num_labels)
print(label_names)

14
['saree', 'kamel', 'mutakareb', 'mutadarak', 'munsareh', 'madeed', 'mujtath', 'ramal', 'baseet', 'khafeef', 'taweel', 'wafer', 'hazaj', 'rajaz']


In [41]:
# Two-way lookup between class indices and class names, passed to the model config.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [42]:
# Sanity-check both label mappings.
print(id2label)
print(label2id)

{0: 'saree', 1: 'kamel', 2: 'mutakareb', 3: 'mutadarak', 4: 'munsareh', 5: 'madeed', 6: 'mujtath', 7: 'ramal', 8: 'baseet', 9: 'khafeef', 10: 'taweel', 11: 'wafer', 12: 'hazaj', 13: 'rajaz'}
{'saree': 0, 'kamel': 1, 'mutakareb': 2, 'mutadarak': 3, 'munsareh': 4, 'madeed': 5, 'mujtath': 6, 'ramal': 7, 'baseet': 8, 'khafeef': 9, 'taweel': 10, 'wafer': 11, 'hazaj': 12, 'rajaz': 13}


In [59]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Re-create the classifier with num_labels outputs and the label-name
# mappings stored in its config.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=num_labels, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Save the model and the tokenizer into the same "dummy_model" directory so
# both can be reloaded from one path.
# NOTE(review): this cell sits before the training cells and trainer.train()
# is commented out, so the saved weights look untrained — confirm intent.
model.save_pretrained("dummy_model")
tokenizer.save_pretrained("dummy_model")


In [45]:
# Experiment configuration: batch size, experiment tag and Trainer arguments.
batch_size = 1
exp = "1"
training_args = TrainingArguments(
    output_dir=f"trained/araBERT-base_exp{exp}",
    # NOTE(review): 2e-4 is higher than the learning rates usually used to
    # fine-tune BERT-style models — confirm this value is intended.
    learning_rate=2e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    save_total_limit=3,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same schedule, which
    # load_best_model_at_end relies on.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    # Mixed precision needs a CUDA device; fall back to full precision when
    # none is available so this cell also runs on CPU-only machines.
    fp16=torch.cuda.is_available(),
    logging_dir=f"logs/araBERT-base_exp{exp}",
    logging_strategy="epoch",
    logging_steps=1,
)

# Wire the model, its arguments, the tokenized splits and the metric
# callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=calculate_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

In [46]:
# trainer.train()

## Final Inference

In [56]:
from transformers import AutoTokenizer
# Reload the tokenizer from the directory written by save_pretrained.
model_path = "./dummy_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Apply the same araBERT preprocessing used in classify() and
# preprocess_function so the inference input matches the training input.
text = arabert_prep.preprocess(dataset["train"][0]["text"])

# Truncate to the same 512-token limit used when tokenizing the dataset.
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

In [57]:
from transformers import AutoModelForSequenceClassification

# Reload the saved classifier and score the tokenized example without
# tracking gradients.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
with torch.no_grad():
    model_output = model(**inputs)
    logits = model_output.logits

In [58]:
# Pick the index of the largest logit and look up its name in the model config.
predicted_class_id = torch.argmax(logits).item()
model.config.id2label[predicted_class_id]

'LABEL_1'