In [5]:
import torch
from transformers import AutoTokenizer
from arabert.preprocess import ArabertPreprocessor
# for text classification
from transformers import AutoModelForSequenceClassification

## araBERT Preprocessor

In [16]:
model_name="bert-base-arabert"
arabert_prep = ArabertPreprocessor(model_name=model_name)



100%|██████████| 241M/241M [03:26<00:00, 1.17MiB/s] 




## Initializing Model

In [None]:
checkpoint = "aubmindlab/bert-base-arabert"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, return_dict=True)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def classify(text):
    text = arabert_prep.preprocess(text)
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    probs = outputs.logits.softmax(dim=1)
    return probs

## Inference

In [None]:
def classify(text):
    text = arabert_prep.preprocess(text)
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    probs = outputs.logits.softmax(dim=1)
    return probs

In [None]:
text = "ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري"
probs = classify(text)
print(probs)

# Training Setup

## Dataset Prep

### AI

In [None]:
# import pandas as pd
# import numpy as np

# ai_df = pd.read_csv('../Tweets/AIArabicTweets.csv')
# ai_df.columns = ['text', 'label']
# # set all the labels to 0
# ai_df['label'] = 0
# ai_df = ai_df.drop(0)

# human_df = pd.read_csv('../Tweets/HumanArabicTweets.csv')
# human_df.columns = ['text', 'label']
# # set all the labels to 1
# human_df['label'] = 1

# # merge the two dataframes
# df = pd.concat([ai_df, human_df], ignore_index=True)
# df = df.sample(frac=1).reset_index(drop=True)
# # drop nan values
# df = df.dropna()

# # save the dataframe to a csv file
# df.to_csv('ArabicTweets.csv', index=False)


## Data Preprocessing

In [1]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files="ArabicTweets.csv", split="train")

dataset = dataset.train_test_split(test_size=0.2)

In [2]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 136390
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 34098
    })
})


In [3]:
dataset["train"]

Dataset({
    features: ['text', 'label'],
    num_rows: 136390
})

In [4]:
dataset["train"][0]

{'text': 'تحسب أنها فزّة قلب، وتطلع رهاب إجتماعي', 'label': 1}

### Applying araBERT Preprocessor

In [13]:
def preprocess_function(examples):
    arabic_prep = arabert_prep.preprocess(examples["text"])
    result = tokenizer(arabic_prep,truncation=True,   
                       max_length=512, return_overflowing_tokens=True)

    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result
    # return tokenizer(arabic_prep, truncation=True, max_length=512, padding="max_length")

In [17]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/136390 [00:01<?, ? examples/s]


NameError: name 'tokenizer' is not defined

In [12]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

NameError: name 'tokenizer' is not defined

## Setup Evaluation

In [18]:
import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 4.41MB/s]


ImportError: To be able to use evaluate-metric/accuracy, you need to install the following dependencies['scikit-learn'] using 'pip install sklearn' for instance'

In [None]:
from sklearn.metrics import f1_score, recall_score, accuracy_score
import numpy as np
def calculate_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    acc = accuracy.compute(predictions=predictions, references=labels)
    f1 = f1_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')

    return {"accuracy": acc, "f1": f1, "recall": recall}

## Training

In [19]:
# find number of labels
num_labels = len(set(dataset["train"]["label"]))
print(num_labels)

# get label names
label_names = ["AI", "Human"]
print(label_names)

2
['AI', 'Human']


In [20]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {label: i for i, label in id2label.items()}

In [21]:
print(id2label)
print(label2id)

{0: 'AI', 1: 'Human'}
{'AI': 0, 'Human': 1}


In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=num_labels, id2label=id2label, label2id=label2id
)

In [None]:
batch_size = 1
exp = "1"
training_args = TrainingArguments(
    output_dir="trained/araBERT-base"+"_exp"+exp,
    learning_rate=2e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    save_total_limit=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    fp16=False,
    logging_dir="logs/araBERT-base"+"_exp"+exp,
    logging_strategy = "epoch",
    logging_steps = 1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=calculate_metrics,
)

In [None]:
# trainer.train()

## Final Inference

In [6]:
from transformers import AutoTokenizer
model_path = "trained/araBERT-base_exp4/checkpoint-1000"
tokenizer = AutoTokenizer.from_pretrained(model_path)

text = dataset["train"][0]["text"]

inputs = tokenizer(text, return_tensors="pt")

In [9]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs.to(device)

with torch.no_grad():
    logits = model(**inputs).logits

In [10]:
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'Human'

### Evaluation

In [14]:
from tqdm.notebook import tqdm

def evaluate_test_set(model, tokenizer, test_dataset, batch_size=32):
    if torch.cuda.is_available():
        device = "cuda"
    else :
        device = "cpu"
    print(f"Device: {device}")
    model.to(device)
    model.eval()
    predictions = []
    labels = []
    accuracy = 0
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)
    progress = tqdm(enumerate(test_dataloader), total=len(test_dataloader))
    for idx, batch in enumerate(test_dataloader):
        inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True, max_length=512)
        labels.extend(batch["label"])
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        scores = outputs.logits
        probs = scores.softmax(dim=1)
        predicted_class_ids = probs.argmax(dim=1).tolist()

        batch_accuracy = sum([1 if label == prediction else 0 for label, prediction in zip(batch["label"], predicted_class_ids)])
        batch_accuracy = batch_accuracy / len(batch["label"])

        accuracy += batch_accuracy

        predictions.extend(predicted_class_ids)

        progress.update(1)
        progress.set_description(f"Batch {idx+1}/{len(test_dataloader)}: {batch_accuracy*100:.2f}%")
        # print(f"Batch {idx+1}/{len(test_dataloader)}: {batch_accuracy}")


    accuracy = accuracy / len(test_dataloader)
    return accuracy, labels, predictions



In [15]:
for example in dataset["test"]:
    print(example["text"])
    print(example["label"])
    break

اللهُم إنشراح ، لا يعقبِه ضيق .
1


In [16]:
accuracy, labels, predictions =  evaluate_test_set(model, tokenizer, dataset["test"], batch_size=256)

Device: cuda


  0%|          | 0/134 [00:00<?, ?it/s]

In [55]:
accuracy, labels, predictions =  evaluate_test_set(model, tokenizer, dataset["test"], batch_size=256)

Device: cuda
Batch 1/134: 0.9296875
Batch 2/134: 0.9609375
Batch 3/134: 0.94140625
Batch 4/134: 0.94140625
Batch 5/134: 0.9609375
Batch 6/134: 0.9140625
Batch 7/134: 0.9296875
Batch 8/134: 0.94140625
Batch 9/134: 0.91015625
Batch 10/134: 0.9609375
Batch 11/134: 0.9609375
Batch 12/134: 0.92578125
Batch 13/134: 0.94140625
Batch 14/134: 0.953125
Batch 15/134: 0.91796875
Batch 16/134: 0.921875
Batch 17/134: 0.92578125
Batch 18/134: 0.9453125
Batch 19/134: 0.95703125
Batch 20/134: 0.953125
Batch 21/134: 0.92578125
Batch 22/134: 0.94140625
Batch 23/134: 0.9453125
Batch 24/134: 0.9140625
Batch 25/134: 0.95703125
Batch 26/134: 0.94921875
Batch 27/134: 0.93359375
Batch 28/134: 0.953125
Batch 29/134: 0.94921875
Batch 30/134: 0.9375
Batch 31/134: 0.9453125
Batch 32/134: 0.9453125
Batch 33/134: 0.9453125
Batch 34/134: 0.97265625
Batch 35/134: 0.96484375
Batch 36/134: 0.953125
Batch 37/134: 0.9296875
Batch 38/134: 0.94140625
Batch 39/134: 0.93359375
Batch 40/134: 0.953125
Batch 41/134: 0.90625
Batc

In [56]:
print(f"Final accuracy: {accuracy*100}%")

Final accuracy: 94.15706623134328%


In [None]:
from tqdm.notebook import tqdm

# create a progress bar
progress = tqdm(total=len(dataset["test"]))
# set the model to evaluation mode
