#### Load Dependencies

In [1]:
!pip -qqq install wandb
!pip -qqq install arabert
!pip -qqq install pyarabic
!pip -qqq install evaluate
!pip -qqq install arabic-reshaper
!pip -qqq install datasets==2.18.0
!pip -qqq install --upgrade transformers
!pip -qqq install peft bitsandbytes accelerate trl

#### Import Libraries

In [2]:
import gc
import os

import numpy as np
import pandas as pd

import torch
import wandb
import torch.nn as nn
from datasets import load_dataset
from trl import ORPOConfig, ORPOTrainer, setup_chat_format

from transformers import BitsAndBytesConfig
from arabert.preprocess import ArabertPreprocessor
from peft import LoraConfig, TaskType, get_peft_model
from transformers.data.processors.utils import InputFeatures
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertTokenizer,
    AutoConfig,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    Trainer
)

from sklearn.manifold import TSNE
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_score, f1_score, confusion_matrix

import wandb
import evaluate
import arabic_reshaper
from datasets import Dataset
from peft import LoraConfig, TaskType

# Pick the compute device once: 'cuda' when torch can see a GPU, otherwise 'cpu'.
# Later cells use this string to place the model.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

#### Get the data and split it:

In [17]:
# Directory holding the cleaned CSVs. Set the QADI_DATA_DIR environment variable to
# point somewhere else; the default is the path used on the original author's machine.
main_dir = os.environ.get(
    'QADI_DATA_DIR',
    'D:/NLP Project/Arabic-Dialect-Identification-using-LLAMA-3/data/',
)
# os.path.join copes with the directory being given with or without a trailing slash.
train = pd.read_csv(os.path.join(main_dir, "train_cleaned.csv"))
test = pd.read_csv(os.path.join(main_dir, "test_cleaned.csv"))

In [19]:
# Country name -> integer class id (18 dialect classes, ids 0..17).
map_dict = {
    "United Arab Emirates":0,
    "Bahrain":1,
    "Algeria":2,
    "Egypt":3,
    "Iraq":4,
    "Jordan":5,
    "Kuwait":6,
    "Lebanon":7,
    "Libya":8,
    "Morocco":9,
    "Oman":10,
    "Palestine":11,
    "Qatar":12,
    "Saudi Arabia":13,
    "Sudan":14,
    "Syria":15,
    "Tunisia":16,
    "Yemen":17,
}
# Inverse lookup: integer class id -> country name.
dict_label_map = { v: k for k, v in map_dict.items() }
num_labels = len(map_dict)


def idx_mapper(x):
    """Return the integer class id for country name ``x`` (raises KeyError if unknown)."""
    return map_dict[x]


# NOTE: this overwrites the string labels of `train` in place, so the cell must be run
# once per freshly loaded CSV; running it a second time raises KeyError on the ints.
train['label'] = train['label'].apply(idx_mapper)

# "Modern Standard Arabic" has no entry in map_dict, so those rows are excluded
# before the mapping function is applied.
test_filtered = test[test['label'] != "Modern Standard Arabic"].copy()
test_filtered['label'] = test_filtered['label'].apply(idx_mapper)

# Assign the mapped values back to the original DataFrame
test.loc[test['label'] != "Modern Standard Arabic", 'label'] = test_filtered['label']

# Create splits (Train, Validation, Test)
train_df, val_df = train_test_split(train, test_size=0.2, random_state=1, shuffle=True)
test_df = test[test.label != 'Modern Standard Arabic']

# Convert the label column to int on every split. `astype` returns new frames, so
# nothing is written through a filtered slice (no SettingWithCopy warning) and the
# resulting column really has an integer dtype.
train_df = train_df.astype({'label': int})
val_df = val_df.astype({'label': int})
test_df = test_df.astype({'label': int})

#### Marbert Model:

In [20]:
# Credentials are taken from the environment so that no secret is stored in the notebook.
# Export HF_TOKEN (and optionally WANDB_API_KEY) before starting the kernel.
if not os.environ.get('HF_TOKEN'):
    raise RuntimeError(
        "HF_TOKEN is not set; export your Hugging Face access token before running this notebook."
    )
# With key=None wandb is expected to fall back to its own credential lookup
# (saved login / interactive prompt) - see the wandb.login reference.
wandb.login(key=os.environ.get('WANDB_API_KEY'))

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /teamspace/studios/this_studio/.netrc


True

##### 1. Classification Dataset Class:

In [21]:
class ClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        text: Indexable collection of raw texts; items are read as ``text[item]``.
        target: Indexable collection of labels, read as ``target[item]``.
        model_name: Name or path passed to ``AutoTokenizer.from_pretrained``.
        max_len: Length every example is padded / truncated to.
        label_map: Label mapping; stored on the instance but not used by this class.
    """

    def __init__(self, text, target, model_name, max_len, label_map):
        # Zero-argument super() dispatches to the parent initialiser. The previous
        # one-argument form, super(ClassificationDataset), only creates an unbound
        # proxy object, so the parent __init__ was never actually called.
        super().__init__()
        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        """Number of examples, taken from the text collection."""
        return len(self.text)

    def __getitem__(self,item):
        """Tokenize example ``item`` and return it, with its label, as ``InputFeatures``."""
        text = str(self.text[item])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())
        # NOTE(review): with return_tensors='pt' on a single string the encoded fields
        # presumably carry a leading batch dimension of 1 - confirm the collator expects that.
        inputs = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return InputFeatures(**inputs,label= self.target[item])

##### 2. Evaluation Metrics:

In [22]:
def compute_metrics(p): #p should be of type EvalPrediction
    """Turn raw model outputs into macro-averaged classification metrics.

    Args:
        p: Object exposing ``predictions`` (per-class scores, one row per example)
            and ``label_ids`` (the reference class ids).

    Returns:
        dict with the keys ``macro_f1``, ``accuracy``, ``recall`` and ``precision``;
        F1, recall and precision use ``average='macro'``.
    """
    # The predicted class is the column with the highest score in each row.
    y_pred = np.argmax(p.predictions, axis=1)
    y_true = p.label_ids
    assert len(y_pred) == len(y_true)

    return {
        'macro_f1': f1_score(y_true, y_pred, average='macro'),
        'accuracy': accuracy_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred, average='macro'),
        'precision': precision_score(y_true, y_pred, average='macro'),
    }

##### 3. Set model hyperparameters:

In [23]:
# Checkpoint that is loaded for fine-tuning; the same id is used as the Hub repository.
# (The base checkpoint would be "UBC-NLP/MARBERT".)
hub_model_id = "MohamedAtta-AI/MARBERT-QADI"
model_name = hub_model_id

# Optimisation hyperparameters
epochs, lr, weight_decay = 10, 4e-6, 1e-4

# Batching, sequence length and logging cadence
train_batch_size = eval_batch_size = 128
max_length = 200
logging_steps = 100

# LoRA settings (rank, alpha scaling, dropout)
rank, lora_scaling, lora_dropout = 8, 32, 0.1

In [24]:
def tokenize_data(batch):
    """Tokenize the ``text_cleaned`` column of a batch; meant for ``Dataset.map``.

    Every text is padded / truncated to ``max_length`` tokens using the module-level
    ``tokenizer``.

    Args:
        batch: Mapping with a ``"text_cleaned"`` entry holding the texts to encode.

    Returns:
        The tokenizer output for the batch, as plain Python lists. No torch tensors
        are built and nothing is moved to the GPU here: ``Dataset.map`` stores the
        result back in the dataset, so a device transfer at this point is wasted work.
    """
    return tokenizer(
        batch["text_cleaned"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# LoRA adapter settings for sequence classification. They only take effect if the
# commented-out get_peft_model line further down is re-enabled.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=rank,
    lora_alpha=lora_scaling,
    lora_dropout=lora_dropout,
)

# 4-bit NF4 quantisation settings; defined here but not passed to from_pretrained below.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the classifier with one output per dialect label, then place it on the device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)
model = model.to(device)
# model = get_peft_model(model, peft_config)

# The tokenizer comes from the base MARBERT repository, not from `model_name`.
tokenizer = AutoTokenizer.from_pretrained("UBC-NLP/MARBERT")



config.json:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/651M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

##### 4. Tokenize the data:

In [25]:
# Wrap each pandas split in a datasets.Dataset and tokenize it in batches.
train_dataset = Dataset.from_pandas(train_df).map(tokenize_data, batched=True)
val_dataset = Dataset.from_pandas(val_df).map(tokenize_data, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize_data, batched=True)

Map:   0%|          | 0/144000 [00:00<?, ? examples/s]

Map:   0%|          | 0/36000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3597 [00:00<?, ? examples/s]

##### 5. Start training:

In [26]:
# Training configuration, grouped by concern (same values as before, only reordered).
training_args = TrainingArguments(
    # --- where checkpoints and logs are written ---
    output_dir=f"tuned/{model_name}hlr",
    logging_dir=f"./logs/{model_name}lhr",
    logging_steps=logging_steps,
    # --- optimisation ---
    do_train=True,
    num_train_epochs=epochs,
    learning_rate=lr,
    weight_decay=weight_decay,
    fp16=True,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    # --- evaluate and checkpoint once per epoch, reload the best checkpoint at the end ---
    # No metric_for_best_model is given, so the Trainer's default criterion decides "best".
    # NOTE(review): newer transformers releases may expect `eval_strategy` instead of
    # `evaluation_strategy` - confirm against the installed version.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # --- Hugging Face Hub upload ---
    push_to_hub=True,
    hub_model_id=hub_model_id,
)

# Train on the train split, validate on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mm-tareksaad[0m ([33mattamohamedai[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Macro F1,Accuracy,Recall,Precision
1,0.6543,0.78097,0.764141,0.764194,0.764147,0.767719
2,0.5631,0.792314,0.76529,0.76525,0.764941,0.767225
3,0.4926,0.810204,0.766125,0.766028,0.76566,0.767286
4,0.4121,0.841121,0.763945,0.764167,0.763841,0.765299
5,0.3652,0.878053,0.760943,0.762,0.761723,0.762059
6,0.3307,0.891931,0.761302,0.761889,0.76153,0.762187
7,0.2834,0.91372,0.762621,0.763083,0.762653,0.763252
8,0.2608,0.924688,0.764151,0.764361,0.76407,0.765018
9,0.2389,0.936109,0.762537,0.762861,0.762477,0.763123
10,0.2265,0.942971,0.762592,0.763056,0.762726,0.763001


TrainOutput(global_step=11250, training_loss=0.3779276727464464, metrics={'train_runtime': 7454.3495, 'train_samples_per_second': 193.176, 'train_steps_per_second': 1.509, 'total_flos': 1.48021229952e+17, 'train_loss': 0.3779276727464464, 'epoch': 10.0})

##### 6. Evaluate on the test set:

In [32]:
# Run the Trainer's evaluation on the tokenized test split; the metrics dict is kept
# in `test_eval` and shown as the cell output by the bare name on the last line.
test_eval = trainer.evaluate(eval_dataset=test_dataset)
test_eval

{'eval_loss': 2.176445245742798,
'eval_macro_f1': 0.68112888316656492,
'eval_accuracy': 0.68209341117598,
'eval_recall': 0.6789670050761421,
'eval_precision': 0.682521469091565,
'eval_runtime': 6.0049,
'eval_samples_per_second': 599.006,
'eval_steps_per_second':4.829,
'epoch': 10.0}


##### 7. Save the model:

In [31]:
# trainer.save_model("/content/drive/MyDrive/Marbert-Arabic-Dialects")