### Install Dependencies

In [None]:
# Install `transformers` directly from its GitHub repository (development version)
!pip install git+https://github.com/huggingface/transformers
# Print the installed versions of the transformers / tokenizers packages
!pip list | grep -E 'transformers|tokenizers'
# transformers version at notebook update --- 2.11.0
# tokenizers version at notebook update --- 0.8.0rc1
# NOTE(review): the install above is not pinned, so a re-run may pull different versions than those listed.

### Load Data

In [None]:
import pandas as pd


# Location of the CSV file that the rest of the notebook works on.
DATA_URL = 'https://raw.githubusercontent.com/reemalfarwan/nadeem/main/Datasets/ar_poem_metres.csv'
df = pd.read_csv(DATA_URL)


In [None]:
# Summarise the freshly loaded frame (columns, dtypes, non-null counts)
df.info()

In [None]:
# Shuffle the dataframe.
# A fixed random_state keeps the row order identical on every run, so the
# positional train/dev/test split taken from this frame is reproducible.
SEED = 42

df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)


In [None]:
# Re-inspect the frame after shuffling and resetting the index
df.info()

In [None]:
# Preview the first rows of the shuffled frame
df.head()

### Split Data

In [None]:
# Row positions at which the train partition ends and the dev partition ends.
TRAIN_END = 7898
DEV_END = 9026

# Positional slices: [0, TRAIN_END) -> train, [TRAIN_END, DEV_END) -> dev, rest -> test.
df_train = df.iloc[:TRAIN_END]
df_dev = df.iloc[TRAIN_END:DEV_END]
df_test = df.iloc[DEV_END:]

In [None]:
# Training inputs are read from the 'meter' column and targets from 'label'.
# NOTE(review): confirm that 'meter' is the column holding the text to classify.
train_texts = df_train['meter']
train_labels = df_train['label']

In [None]:
# Dev inputs are read from the 'meter' column and targets from 'label'.
dev_texts = df_dev['meter']
dev_labels = df_dev['label']

In [None]:
# Test inputs are read from the 'meter' column and targets from 'label'.
test_texts = df_test['meter']
test_labels = df_test['label']

In [None]:
# Turn the training Series into plain Python lists and show the resulting types.
train_texts, train_labels = train_texts.tolist(), train_labels.tolist()
print('Train Text:',type(train_texts))
print('Train Labels:',type(train_labels))

# Same conversion for the dev split; the lists are stored under the `val_*` names.
val_texts, val_labels = dev_texts.tolist(), dev_labels.tolist()
print('Dev Text:',type(val_texts))
print('Dev Labels:',type(val_labels))
 

In [None]:
# Turn the test Series into plain Python lists and show the resulting types.
test_texts, test_labels = test_texts.to_list(), test_labels.to_list()
print(type(test_texts))
print(type(test_labels))


In [None]:
import numpy as np

# Print the distinct label ids that occur in the test split.
x = np.asarray(test_labels)
unique_test_labels = np.unique(x)
print(unique_test_labels)
    

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13]


In [None]:
#Load the tokenizer based on the pretrained language model
from transformers import AutoTokenizer, AutoModel
from transformers import RobertaTokenizerFast


# `model_max_length` replaces the deprecated `max_len` keyword; it sets the
# length (512 tokens) that `truncation=True` cuts sequences down to.
tokenizer = RobertaTokenizerFast.from_pretrained('reemalyami/AraRoBERTa_Poem', model_max_length=512)


In [None]:
#tokenize the text 

# Tokenize the three splits with identical settings (truncation and padding enabled).
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts, test_texts)
)

In [None]:
import torch

class DailectDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: mapping from field name to a sequence indexed by sample
            position (anything exposing `.items()`).
        labels: sequence of labels, one per sample; its length defines the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        # Added last, so it overrides any 'labels' field present in the encodings.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the encodings and labels of each split in a DailectDataset.
train_dataset, val_dataset, test_dataset = (
    DailectDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [None]:
from transformers import  TrainingArguments

# Training configuration handed to the Trainer further down.
training_args = TrainingArguments(
    output_dir= 'Model/meter_classification',  # output directory; change this path to suit your environment before running
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size per device during evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_steps=10,                # log every 10 steps
)

In [None]:
#Load the pretrained language model 

from transformers import AutoModelForSequenceClassification,RobertaForSequenceClassification

# Load the pretrained checkpoint with a sequence-classification head sized for 16 labels.
# NOTE(review): the test split printed earlier contains label ids 0-13 only; confirm that
# 16 matches the full label set of the dataset.
model = RobertaForSequenceClassification.from_pretrained('reemalyami/AraRoBERTa_Poem', num_labels=16)#.to("cuda")


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

def compute_metrics(pred):
    """Build the metrics dictionary for a set of predictions.

    Args:
        pred: object exposing `label_ids` (gold labels) and `predictions`
            (scores whose argmax over the last axis is the predicted class).

    Returns:
        dict with the keys 'Accuracy', 'F1', 'Precision', 'Recall',
        'Confusion Matrix' and 'Pred'. Precision, recall and F1 are the values
        returned by `precision_recall_fscore_support` called without an
        `average` argument; 'Pred' holds the predicted class ids.
    """
    gold_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)

    scores = {}
    scores['Accuracy'] = accuracy_score(gold_ids, predicted_ids)

    # The fourth returned element is not reported.
    precision, recall, f1, _unused = precision_recall_fscore_support(gold_ids, predicted_ids)
    scores['F1'] = f1
    scores['Precision'] = precision
    scores['Recall'] = recall

    scores['Confusion Matrix'] = confusion_matrix(gold_ids, predicted_ids)
    scores['Pred'] = predicted_ids
    return scores

In [None]:
# Load the trainer class 
from transformers import Trainer


# Assemble the Trainer from the model, the training arguments, the datasets and the metric function.
trainer = Trainer(
    model=model,                         # sequence-classification model to fine-tune
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation (dev) dataset
    compute_metrics=compute_metrics,     # metric function applied to evaluation predictions
)



In [None]:
# Start fine-tuning the model with the Trainer configured above
trainer.train()

In [None]:
# Evaluate on the dataset passed as eval_dataset (the dev split), using compute_metrics
trainer.evaluate()

In [None]:

# Run prediction on the test split; the returned object is shown as the cell output
trainer.predict(test_dataset) 

In [None]:
# Save the fine-tuned model together with its tokenizer, so that the directory
# can be reloaded on its own with `from_pretrained(model_path)`.

model_path = "meter_classification_model"


model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
