In [None]:
!pip install transformers -U
!pip install accelerate -U
!pip install datasets

# Imports

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

import torch

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import EvalPrediction

# Data Importing

In [4]:
!gdown https://drive.google.com/uc?id=1VRvUGMz_Y0BRpzB5kUM8pR_HfuonJOjC

Downloading...
From: https://drive.google.com/uc?id=1VRvUGMz_Y0BRpzB5kUM8pR_HfuonJOjC
To: /content/Data_cleaned.csv
100% 18.3M/18.3M [00:00<00:00, 35.5MB/s]


In [5]:
data = pd.read_csv('/content/Data_cleaned.csv')
data

Unnamed: 0,text,dialect,label
0,قليلين ادب ومنافقين اختهم او قريبتهم تتعاكس تق...,LY,2
1,الليبيين متقلبين بالنسبه ليا انا ميليشياوي زما...,LY,2
2,تانيه شاب ليبي بيرتاح لبنت مختلفه ويلاحظ انها ...,LY,2
3,رانيا عقليتك متخلفه اولا الانسان يلي يحتاج اهل...,LY,2
4,شكلك متعقده علشان الراجل تحبيه ازوج بنت يتيمه ...,LY,2
...,...,...,...
147720,الناس دي بتنفخ قربه مقدوده بالدارجي كده البلد ...,SD,4
147721,انت عايش وين بره السودان شنو ماشايف البحصل دا,SD,4
147722,مااحرم نفسي ميسي حريف ولعاب برضو مدريدي وافتخر,SD,4
147723,ياخي ديل ماخلو للشيطان وابليس شي يروحو وين ربن...,SD,4


# Data preparation

In [6]:
# splitting the data into train and test sets
train_df, val_df = train_test_split(data[['text','label']], test_size=0.2)
train_df

Unnamed: 0,text,label
36111,احلام بنحببك ليبيا,2
4792,انا عشواءي اليوم معش عرفت رمايه وين,2
26881,معلومه لا تهمك عن حوشنا انا البنت الكبيره شي ا...,2
6747,حاسه كساد العيد توا,2
74397,انا شاركت في ثوره يناير وفخور بكل لحظه ومش ندم...,0
...,...,...
125589,ليكي شحار انا ايام,1
82064,وانا مالي طيب انت اللي مش مركزه,0
56212,الله اكبرر ده,0
73609,ده حبيبي امبارح وحبيب دلوقتي,0


In [7]:
# defining a pytorch dataset
class ArabicDialectDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [9]:
# tokenizer intialization
model_name = "asafaya/bert-base-arabic"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/334k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [10]:
BATCH_SIZE = 128
MAX_LEN = 128

# Create instances of the dataset
train_dataset = ArabicDialectDataset(texts=train_df.text.to_numpy(),
                                     labels=train_df.label.to_numpy(),
                                     tokenizer=tokenizer,
                                     max_len=MAX_LEN)

val_dataset = ArabicDialectDataset(texts=val_df.text.to_numpy(),
                                   labels=val_df.label.to_numpy(),
                                   tokenizer=tokenizer,
                                   max_len=MAX_LEN)

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [15]:
# Define the model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at asafaya/bert-base-arabic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Function to compute metrics
def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [26]:
# Training Setup
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    evaluation_strategy ='epoch',
    logging_strategy ='epoch',
    save_strategy ='epoch',
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',

)

In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)


In [28]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.581,0.489574,0.829853,0.826731,0.831633,0.829853
2,0.3579,0.472287,0.840278,0.837621,0.842852,0.840278
3,0.1966,0.5491,0.843087,0.841955,0.841932,0.843087


TrainOutput(global_step=5541, training_loss=0.3785100942741467, metrics={'train_runtime': 7091.2104, 'train_samples_per_second': 49.997, 'train_steps_per_second': 0.781, 'total_flos': 2.332147655729664e+16, 'train_loss': 0.3785100942741467, 'epoch': 3.0})

In [35]:
metrics = trainer.evaluate()
metrics

{'eval_loss': 0.5491003394126892,
 'eval_accuracy': 0.8430868167202572,
 'eval_f1': 0.8419545191637657,
 'eval_precision': 0.8419321036698405,
 'eval_recall': 0.8430868167202572,
 'eval_runtime': 194.0642,
 'eval_samples_per_second': 152.243,
 'eval_steps_per_second': 2.381,
 'epoch': 3.0}