In [None]:
import torch
from transformers import AutoTokenizer
from arabert.preprocess import ArabertPreprocessor
# for text classification
from transformers import AutoModelForSequenceClassification

## araBERT Preprocessor

In [None]:
model_name="bert-base-arabert"
arabert_prep = ArabertPreprocessor(model_name=model_name)

## Initializing Model

In [None]:
checkpoint = "aubmindlab/bert-base-arabertv02-twitter"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, return_dict=True)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

## Inference

In [None]:
def classify(text):
    text = arabert_prep.preprocess(text)
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    probs = outputs.logits.softmax(dim=1)
    return probs

In [None]:
text = "ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري"
probs = classify(text)
print(probs)

# Training Setup

## Dataset Prep

In [None]:
# import pandas as pd
# import numpy as np

# ai_df = pd.read_csv('../Tweets/AIArabicTweets.csv')
# ai_df.columns = ['text', 'label']
# # set all the labels to 0
# ai_df['label'] = 0
# ai_df = ai_df.drop(0)

# human_df = pd.read_csv('../Tweets/HumanArabicTweets.csv')
# human_df.columns = ['text', 'label']
# # set all the labels to 1
# human_df['label'] = 1

# # merge the two dataframes
# df = pd.concat([ai_df, human_df], ignore_index=True)
# df = df.sample(frac=1).reset_index(drop=True)
# # drop nan values
# df = df.dropna()

# # save the dataframe to a csv file
# df.to_csv('ArabicTweets.csv', index=False)


## Data Preprocessing

In [None]:
from datasets import load_dataset
from datasets import concatenate_datasets

dataset = load_dataset("csv", data_files="ArabicTweets.csv", split="train", encoding="windows-1256")
# find number of examples for each label
human = sum([1 for label in dataset["label"] if label == 1])
ai = sum([1 for label in dataset["label"] if label == 0])

print("human: ", human,"ai: ", ai)
# balance the dataset

dataset = dataset.shuffle()

human_dataset = dataset.filter(lambda example: example["label"] == 1)
ai_dataset = dataset.filter(lambda example: example["label"] == 0)
filtered_human_dataset = human_dataset.select(range(ai))

balanced_dataset = concatenate_datasets([filtered_human_dataset, ai_dataset])
dataset = balanced_dataset.train_test_split(test_size=0.2)

In [None]:
human = sum([1 for label in balanced_dataset["label"] if label == 1])
ai = sum([1 for label in balanced_dataset["label"] if label == 0])

print("human: ", human,"ai: ", ai)

In [None]:
print(dataset)

In [None]:
dataset["train"]

In [None]:
dataset["train"][0]

In [None]:
# check average length of the tweets
import numpy as np
lengths = [len(tweet) for tweet in dataset["train"]["text"]]
print(np.mean(lengths))
print(np.max(lengths))
print(np.min(lengths))
# find number of tweets having 
num = sum([1 for tweet in dataset["train"]["text"] if len(tweet) > 100])
print(num)

### Applying araBERT Preprocessor

In [None]:
def arabic_preprocess(examples):
    return {"text":arabert_prep.preprocess(examples["text"]), "label":examples["label"]}

In [None]:
dataset["train"][0:4]

In [None]:
dataset = dataset.map(arabic_preprocess)

In [None]:
dataset["train"][0:4]

In [None]:
dataset["train"][1]

In [None]:
import matplotlib.pyplot as plt
print("Training Sentence Lengths: ")
plt.hist([ len(tokenizer.tokenize(sentence)) for sentence in dataset["train"]["text"]],bins=range(0,128,2))
plt.show()

print("Testing Sentence Lengths: ")
plt.hist([ len(tokenizer.tokenize(sentence)) for sentence in dataset["test"]["text"]],bins=range(0,128,2))
plt.show()

In [None]:
def preprocess_function(examples):
    result = tokenizer(examples["text"],truncation=True,   
                       max_length=128, return_overflowing_tokens=True)

    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result
    # return tokenizer(arabic_prep, truncation=True, max_length=512, padding="max_length")

In [None]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Model Instantiation

In [None]:
# find number of labels
num_labels = len(set(dataset["train"]["label"]))
print(num_labels)

# get label names
label_names = ["AI", "Human"]
print(label_names)

In [None]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {label: i for i, label in id2label.items()}

In [None]:
print(id2label)
print(label2id)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=num_labels, id2label=id2label, label2id=label2id
)

In [None]:
model.config.hidden_dropout = 0.2

In [None]:
print(model.config)

## Custom Training Loop

### Hyper parameters

In [None]:
LEARNING_RATE = 5e-2
BATCH_SIZE = 16

### Data Prep

In [None]:
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")
tokenized_dataset["train"].column_names

In [None]:
batch_size = BATCH_SIZE
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_dataset["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_dataset["test"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
)

In [None]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

In [None]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

### Training

In [None]:
from tqdm.auto import tqdm
# from transformers import AdamW
from transformers import get_scheduler
from torch.utils.tensorboard import SummaryWriter
import evaluate
from torch.optim import AdamW
exp = "5"

writer = SummaryWriter(log_dir="./logs/araBERT-base_exp"+exp)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
metric = evaluate.load("glue", "mrpc", device=device)

# optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)


# lr_scheduler
num_epochs = 20
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))
model.to(device)
for epoch in range(num_epochs):
    # training epoch
    train_loss = 0
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        scores = outputs.logits
        probs = scores.softmax(dim=1)

        predictions = torch.argmax(probs, dim=-1)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        metric.add_batch(predictions=predictions, references=batch["labels"])
        train_loss += loss.item()
        progress_bar.update(1)
        progress_bar.set_postfix(train_loss=loss.item())

    train_out = metric.compute()
    train_accuracy = train_out["accuracy"]
    train_f1 = train_out["f1"]
    train_loss = train_loss / len(train_dataloader)
    print(f"Epoch {epoch} - Train loss: {train_loss:.4f}, accuracy: {train_accuracy:.4f}, f1: {train_f1:.4f}")
    
    # Write to tensorboard
    writer.add_scalar("train/loss", train_loss, epoch)
    writer.add_scalar("train/accuracy", train_accuracy, epoch)
    writer.add_scalar("train/f1", train_f1, epoch)

    # evaluating epoch
    model.eval()
    eval_loss = 0
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        scores = outputs.logits
        probs = scores.softmax(dim=1)
        predictions = torch.argmax(probs, dim=-1)
        eval_loss += loss.item()
        metric.add_batch(predictions=predictions, references=batch["labels"])

    eval_loss = eval_loss / len(eval_dataloader)
    eval_out = metric.compute()
    eval_accuracy = eval_out["accuracy"]
    eval_f1 = eval_out["f1"]
    # eval_accuracy, eval_f1, recall, precision, eval_loss =  evaluate_test_set(model, tokenizer, dataset["test"], batch_size=256)
    print(f"Epoch {epoch} - Eval loss: {eval_loss:.4f}, accuracy: {eval_accuracy:.4f}, f1: {eval_f1:.4f}")

    # Write to tensorboard
    writer.add_scalar("eval/loss", eval_loss, epoch)
    writer.add_scalar("eval/accuracy", eval_accuracy, epoch)
    writer.add_scalar("eval/f1", eval_f1, epoch)

    # progress_bar.update(len(train_dataloader))
    # add text to the progress bar
    progress_bar.set_postfix(epochs = epoch,
        train_loss=train_loss, eval_loss=eval_loss, train_acc=train_accuracy, eval_acc=eval_accuracy
    )

    model.save_pretrained("./trained/araBERT-base_exp"+exp+"/checkpoint-"+str((epoch+1)*len(train_dataloader)))
    tokenizer.save_pretrained("./trained/araBERT-base_exp"+exp+"/checkpoint-"+str((epoch+1)*len(train_dataloader)))


# close the tensorboard writer
writer.close()

# save the model
# model.save_pretrained("./trained/araBERT-base_exp"+exp)
# tokenizer.save_pretrained("./trained/araBERT-base_exp"+exp)

## Hugging face trainer

In [None]:
import evaluate
import numpy as np
from sklearn.metrics import f1_score

metric = evaluate.load("glue", "sst2", device="cuda")
def calculate_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    f1 = f1_score(labels, predictions, average="weighted")
    metric_computed = metric.compute(predictions=predictions, references=labels)

    return {"accuracy": metric_computed["accuracy"], "f1":f1}

In [None]:
batch_size = 32
epochs = 20
exp = "2"
step = int(len(tokenized_dataset["train"])/(batch_size))
training_args = TrainingArguments(
    output_dir="./trained/araBERT-base"+"_exp"+exp,
    learning_rate=5e-3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=2*batch_size,
    num_train_epochs=epochs,
    save_total_limit=3,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=step,
    save_strategy="steps",
    save_steps=step,
    load_best_model_at_end=True,
    push_to_hub=False,
    fp16=True,
    logging_dir="./logs/araBERT-base"+"_exp"+exp,
    logging_strategy = "steps",
    logging_steps = step
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=calculate_metrics,
)

In [None]:
trainer.train()

## Final Inference

In [7]:
from datasets import load_dataset
import torch

dataset = load_dataset("csv", data_files="ArabicTweets.csv", split="train[:5%]", encoding="windows-1256")

dataset = dataset.train_test_split(test_size=0.2)

In [8]:
from transformers import AutoTokenizer
model_path = "trained/araBERT-base_exp4/checkpoint-21600"
tokenizer = AutoTokenizer.from_pretrained(model_path)

text = dataset["train"][0]["text"]

inputs = tokenizer(text, return_tensors="pt")

In [9]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs.to(device)

with torch.no_grad():
    logits = model(**inputs).logits

In [4]:
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'Human'

### Evaluation

In [12]:
model_name = "bert-base-arabert"
from arabert.preprocess import ArabertPreprocessor
arabic_prep = ArabertPreprocessor(model_name=model_name)
def arabic_preprocess(examples):
    return {"text":arabert_prep.preprocess(examples["text"]), "label":examples["label"]}



In [13]:
dataset = dataset.map(arabic_preprocess)

Map:   0%|          | 0/6688 [00:00<?, ? examples/s]

Map:   0%|          | 0/1673 [00:00<?, ? examples/s]

In [14]:
from tqdm.notebook import tqdm
import evaluate
import numpy as np
from sklearn.metrics import recall_score, precision_score, f1_score
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
metric = evaluate.load("glue", "sst2", device=device)
# import precision_recall_fscore_support
def evaluate_test_set(model, tokenizer, test_dataset, batch_size=32, pbar = False):
    if torch.cuda.is_available():
        device = "cuda"
    else :
        device = "cpu"
    print(f"Device: {device}")
    model.to(device)
    model.eval()
    acc = 0
    f1 = 0
    recall = 0
    precision = 0

    predictions = []
    labels = []

    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)
    if pbar:
        progress = tqdm(enumerate(test_dataloader), total=len(test_dataloader))
    for idx, batch in enumerate(test_dataloader):
        inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        scores = outputs.logits
        probs = scores.softmax(dim=1)
        predicted_class_ids = probs.argmax(dim=1).tolist()

        predictions.extend(predicted_class_ids)
        batch_labels = batch["label"]
        labels.extend([x.item() for x in batch_labels])
        unique_labels = np.unique(predicted_class_ids)

        # batch_accuracy = sum([1 if label == prediction else 0 for label, prediction in zip(batch_labels, predicted_class_ids)])
        # batch_accuracy = batch_accuracy / len(batch_labels)

        batch_metric = metric.compute(predictions=predicted_class_ids, references=batch_labels)
        batch_acc = batch_metric["accuracy"]
        batch_f1 = f1_score(batch_labels, predicted_class_ids, average='weighted', labels=unique_labels)
        batch_recall = recall_score(batch_labels, predicted_class_ids, average='weighted', labels=unique_labels)
        batch_precision = precision_score(batch_labels, predicted_class_ids, average='weighted', labels=unique_labels)
        # print(batch_accuracy, batch_f1, batch_recall, batch_precision)
    
        acc += batch_metric["accuracy"]
        f1 += batch_f1
        recall += batch_recall
        precision += batch_precision

        if pbar:
            progress.update(1)
            progress.set_description(f"Batch {idx+1}/{len(test_dataloader)}: {batch_acc*100:.2f}%")
        # print(f"Batch {idx+1}/{len(test_dataloader)}: {batch_accuracy}")


    acc = acc / len(test_dataloader)
    f1 = f1 / len(test_dataloader)
    recall = recall / len(test_dataloader)
    precision = precision / len(test_dataloader)
    return acc, f1, recall, precision, predictions, labels


In [15]:
for example in dataset["test"]:
    print(example["text"])
    print(example["label"])
    break

* مع مهروس ال+ ميراميه ال+ خضراء خارجي +ا ل+ علاج عقص ال+ حشر +ات
1


In [16]:
accuracy, f1, recall, precision, predictions, labels =  evaluate_test_set(model, tokenizer, dataset["test"], batch_size=128, pbar=True)

Device: cuda


  0%|          | 0/14 [00:00<?, ?it/s]

In [17]:
print(f"Accuracy: {accuracy*100:.2f}%, F1: {f1*100:.2f}%, Recall: {recall*100:.2f}%, Precision: {precision*100:.2f}%")

Accuracy: 94.75%, F1: 97.29%, Recall: 100.00%, Precision: 94.75%


In [18]:
len(predictions), len(labels)
print(set(predictions))
print(set(labels))

{1}
{0, 1}


In [19]:
from sklearn.metrics import classification_report
cr = classification_report(labels, predictions, labels=[0, 1], target_names=["AI", "Human"])
print(cr)

              precision    recall  f1-score   support

          AI       0.00      0.00      0.00        94
       Human       0.94      1.00      0.97      1579

    accuracy                           0.94      1673
   macro avg       0.47      0.50      0.49      1673
weighted avg       0.89      0.94      0.92      1673



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
