In [1]:
import torch
from transformers import AutoTokenizer
from arabert.preprocess import ArabertPreprocessor
# for text classification
from transformers import AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


## araBERT Preprocessor

In [2]:
model_name="bert-base-arabert"
arabert_prep = ArabertPreprocessor(model_name=model_name)



## Initializing Model

In [3]:
checkpoint = "aubmindlab/bert-base-arabert"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, return_dict=True)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def classify(text):
    text = arabert_prep.preprocess(text)
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    probs = outputs.logits.softmax(dim=1)
    return probs

## Inference

In [5]:
def classify(text):
    text = arabert_prep.preprocess(text)
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    probs = outputs.logits.softmax(dim=1)
    return probs

In [6]:
text = "ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري"
probs = classify(text)
print(probs)

tensor([[0.6441, 0.3559]], grad_fn=<SoftmaxBackward0>)


# Training Setup

## Dataset Prep

### AI

In [46]:
import pandas as pd
import numpy as np

ai_df = pd.read_csv('../Tweets/AIArabicTweets.csv')
ai_df.head()

Unnamed: 0,Column1,Column2
0,Article text,Label
1,?? الإبداع لا يعترف بالحدود! اطلقوا خيالكم وتح...,0
2,?? لنكن بيئيين! قراراتنا اليوم تبني مستقبلنا، ...,0
3,?? القراءة تغذي العقل وتفتح آفاقاً جديدة. شارك...,0
4,?? فن الحياة: اكتشفوا جمال الفنون وشاركونا أعم...,0


In [47]:
# change the column names
ai_df.columns = ['text', 'label']
ai_df.head()

Unnamed: 0,text,label
0,Article text,Label
1,?? الإبداع لا يعترف بالحدود! اطلقوا خيالكم وتح...,0
2,?? لنكن بيئيين! قراراتنا اليوم تبني مستقبلنا، ...,0
3,?? القراءة تغذي العقل وتفتح آفاقاً جديدة. شارك...,0
4,?? فن الحياة: اكتشفوا جمال الفنون وشاركونا أعم...,0


In [48]:
# drop first row
ai_df = ai_df.drop(0)
ai_df.head()

Unnamed: 0,text,label
1,?? الإبداع لا يعترف بالحدود! اطلقوا خيالكم وتح...,0
2,?? لنكن بيئيين! قراراتنا اليوم تبني مستقبلنا، ...,0
3,?? القراءة تغذي العقل وتفتح آفاقاً جديدة. شارك...,0
4,?? فن الحياة: اكتشفوا جمال الفنون وشاركونا أعم...,0
5,??‍?? السلام الداخلي يبدأ بالتفكير الإيجابي وا...,0


In [49]:
# save the dataframe to a csv file
ai_df.to_csv('AIArabicTweets.csv', index=False)

### Human

In [50]:
human_df = pd.read_csv('../Tweets/HumanArabicTweets.csv')
human_df.head()

Unnamed: 0,Column1,Column5
0,'''صلي على النبي ?',
1,'''صلي على النبي ?'',
2,'''صلي على النبي ?''',
3,'' ماعندي الا قلبببّ واحد ومسروق,
4,"والسِارق الي سارقه مطيريٍ جايزٍ لي """"""",


In [51]:
human_df.columns = ['text', 'label']
human_df.head()

Unnamed: 0,text,label
0,'''صلي على النبي ?',
1,'''صلي على النبي ?'',
2,'''صلي على النبي ?''',
3,'' ماعندي الا قلبببّ واحد ومسروق,
4,"والسِارق الي سارقه مطيريٍ جايزٍ لي """"""",


In [52]:
# set all the labels to 1
human_df['label'] = 1
human_df.head()

Unnamed: 0,text,label
0,'''صلي على النبي ?',1
1,'''صلي على النبي ?'',1
2,'''صلي على النبي ?''',1
3,'' ماعندي الا قلبببّ واحد ومسروق,1
4,"والسِارق الي سارقه مطيريٍ جايزٍ لي """"""",1


In [54]:
# merge the two dataframes
df = pd.concat([ai_df, human_df], ignore_index=True)
#shuffle the dataframe
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,text,label
0,اسمه lolita caffe,1
1,الحمد لله على هذا الفوز ...,1
2,الآن الآن,1
3,لا شيء يُقارن بتجربة اكتشاف أماكن جديدة وتذوق ...,0
4,اَلسَلامُ عَلَيْكُم وَرَحْمَةُ اَللهِ وَبَرَكا...,1


In [55]:
# drop nan values
df = df.dropna()
df.head()

Unnamed: 0,text,label
0,اسمه lolita caffe,1
1,الحمد لله على هذا الفوز ...,1
2,الآن الآن,1
3,لا شيء يُقارن بتجربة اكتشاف أماكن جديدة وتذوق ...,0
4,اَلسَلامُ عَلَيْكُم وَرَحْمَةُ اَللهِ وَبَرَكا...,1


In [56]:
# save the dataframe to a csv file
df.to_csv('ArabicTweets.csv', index=False)

In [58]:
arabic_df = pd.read_csv('ArabicTweets.csv')
arabic_df.head()

Unnamed: 0,text,label
0,اسمه lolita caffe,1
1,الحمد لله على هذا الفوز ...,1
2,الآن الآن,1
3,لا شيء يُقارن بتجربة اكتشاف أماكن جديدة وتذوق ...,0
4,اَلسَلامُ عَلَيْكُم وَرَحْمَةُ اَللهِ وَبَرَكا...,1


## Data Preprocessing

In [59]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files="ArabicTweets.csv", split="train")

dataset = dataset.train_test_split(test_size=0.2)

Generating train split: 170488 examples [00:00, 230577.66 examples/s]


In [60]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 136390
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 34098
    })
})


In [61]:
dataset["train"]

Dataset({
    features: ['text', 'label'],
    num_rows: 136390
})

In [62]:
dataset["train"][0]

{'text': 'ادعموا أهل #جزيره_الوراق تخيل نفسك مكانهم بتطرد من بيتك ولو رفضت بتضرب وتتحبس تكلموا عنهم ليل نهار لعل الظلم يتوقف',
 'label': 1}

### Applying araBERT Preprocessor

In [63]:
def preprocess_function(examples):
    arabic_prep = arabert_prep.preprocess(examples["text"])
    result = tokenizer(arabic_prep,truncation=True,   
                       max_length=512, return_overflowing_tokens=True)

    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result
    # return tokenizer(arabic_prep, truncation=True, max_length=512, padding="max_length")

In [64]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 136390/136390 [04:28<00:00, 507.34 examples/s]
Map: 100%|██████████| 34098/34098 [01:47<00:00, 317.18 examples/s]


In [65]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Setup Evaluation

In [66]:
import evaluate

accuracy = evaluate.load("accuracy")

In [67]:
from sklearn.metrics import f1_score, recall_score, accuracy_score
import numpy as np
def calculate_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    acc = accuracy.compute(predictions=predictions, references=labels)
    f1 = f1_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')

    return {"accuracy": acc, "f1": f1, "recall": recall}

## Training

In [68]:
# find number of labels
num_labels = len(set(dataset["train"]["label"]))
print(num_labels)

# get label names
label_names = ["AI", "Human"]
print(label_names)

2
['AI', 'Human']


In [69]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {label: i for i, label in id2label.items()}

In [70]:
print(id2label)
print(label2id)

{0: 'AI', 1: 'Human'}
{'AI': 0, 'Human': 1}


In [71]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=num_labels, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [75]:
# save the model
model.save_pretrained("dummy_model")
tokenizer.save_pretrained("dummy_model")


('dummy_model\\tokenizer_config.json',
 'dummy_model\\special_tokens_map.json',
 'dummy_model\\vocab.txt',
 'dummy_model\\added_tokens.json',
 'dummy_model\\tokenizer.json')

In [73]:
batch_size = 1
exp = "1"
training_args = TrainingArguments(
    output_dir="trained/araBERT-base"+"_exp"+exp,
    learning_rate=2e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    save_total_limit=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    fp16=False,
    logging_dir="logs/araBERT-base"+"_exp"+exp,
    logging_strategy = "epoch",
    logging_steps = 1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=calculate_metrics,
)

In [74]:
trainer.train()

  0%|          | 1/106200 [00:05<174:49:45,  5.93s/it]

KeyboardInterrupt: 

## Final Inference

In [76]:
from transformers import AutoTokenizer
model_path = "./dummy_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)

text = dataset["train"][0]["text"]

inputs = tokenizer(text, return_tensors="pt")

In [77]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_path)
with torch.no_grad():
    logits = model(**inputs).logits

In [78]:
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'AI'