In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/MyDrive/TOUNSI

/content/drive/MyDrive/TOUNSI


## Imports and data loading

In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [None]:
# Load the T-HSAB sheet. It is read without a header row, so the two columns
# are named explicitly afterwards.
t_hsab = pd.read_excel("Data/T-HSAB/T-HSAB.xlsx", header=None)
t_hsab.columns = ['Text', 'Label']

In [None]:
# Load the second corpus; the first CSV column is used as the index.
# Columns are renamed to: original text, label, Arabic-script version.
translated = pd.read_csv("Data/transalted.csv", index_col=0)
translated.columns = ['Text', 'Label','Text_arabic']

In [None]:
t_hsab

Unnamed: 0,Text,Label
0,اسغي ياشعب تونس تدعوا بالاسلام كفار الحمدلله ن...,hate
1,قطع يد السارق توفرت الشروط شرط الحد الأدنى قيم...,normal
2,تلوموش لطفي لعبدلي شرف,normal
3,مستغرب شعب يسمع تفاهة شانو لى الدرجة الشعب تاف...,normal
4,هههخ غزلتني مافهمتش شمدخلها الموضوع تتنطر وحده...,normal
...,...,...
6019,رحماك رب رحماك رب التوانسة ولات تناقش القرأن ت...,hate
6020,إنسان تافه وكلام فارغ تفوووو كلب,abusive
6021,مريم معجبييك مي تحتي عيني قناة عادة مكروها ونط...,normal
6022,نكره امها,hate


In [None]:
print(t_hsab.isnull().sum())

Text     0
Label    0
dtype: int64


In [None]:
t_hsab.value_counts('Label')

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
normal,3820
abusive,1126
hate,1078


In [None]:
translated

Unnamed: 0,Text,Label,Text_arabic
0,Fuck La Tunisie Mouch blédi nikha b Fasma Sous...,abusive,فاك ال تونسية مش بليدي نكها ب فاصمة سوسة مش عا...
1,ken hedha mouch rap enti fech tbi3 lzebi :p,abusive,كن هذا مش راب إنت فش تبيع لزبي :p
2,fuck you chnawa teswa 7a9 plazma :p,abusive,فاك يو شنوا تسوى حق بلازما :p
3,chouf chanteur yghani l'amour lweld l9ahba fik...,abusive,
4,3omrek matjib 7jar l3arka fiha kartouch ( chko...,normal,عمرك ماتجيب حجر لعركة فيها كرطوش ( شكون حافتها )
...,...,...,...
1095,ija rdha3li papichi 5ali l'omek rachfa tassma3...,abusive,اجة رذعلي ببيش خلي لأمك رشفة تسمع ل نكتك إتكرز...
1096,ايجا ارضعلي papishi خلي لامك رشفة تسمع الي نكت...,abusive,ايجا ارضعلي ببيش خلي لامك رشفة تسمع الي نكتك ت...
1097,امك نعشقها مطرشقها شحال لعبت بالعصفور سقصي على...,hate,امك نعشقها مطرشقها شحال لعبت بالعصفور سقصي على...
1098,se7liiiiiiiiiii\nbellehi bara nik ommmmeekkkkkk,hate,se7liiiiiiiiiii\nbellehi برى نيك أمميككك


In [None]:
print(translated.isnull().sum())

Text           0
Label          0
Text_arabic    0
dtype: int64


In [None]:
translated.value_counts('Label')

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
normal,870
abusive,133
hate,97


## Preprocessing

In [None]:
import re
import nltk
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')
arabic_stopwords = set(stopwords.words('arabic'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Step 1: Normalize Arabic Text
def normalize_arabic(text):
    """Map common Arabic letter variants onto one canonical letter each.

    Substitutions, applied in this order:
      * alef variants (إ أ آ ا) -> ا
      * alef maqsura ى -> ي
      * ta marbuta ة -> ه
      * waw with hamza ؤ -> و
      * ya with hamza ئ -> ي

    Args:
        text: String to normalize.

    Returns:
        The normalized string; characters outside the table are unchanged.
    """
    substitutions = (
        (r'[إأآا]', 'ا'),
        (r'ى', 'ي'),
        (r'ة', 'ه'),
        (r'ؤ', 'و'),
        (r'ئ', 'ي'),
    )
    for pattern, canonical in substitutions:
        text = re.sub(pattern, canonical, text)
    return text

In [None]:
# Step 2: Remove Diacritics
def remove_diacritics(text):
    """Delete Arabic diacritic marks from ``text``.

    Every code point in the ranges U+0617-U+061A and U+064B-U+0652 is
    removed; all other characters are returned untouched.
    """
    return re.sub(r'[\u0617-\u061A\u064B-\u0652]', '', text)

In [None]:
# Step 3: Remove Stop Words
def remove_stop_words(text):
    """Drop whitespace-separated tokens found in ``arabic_stopwords``.

    The text is split on whitespace, tokens that are members of the
    module-level ``arabic_stopwords`` set are discarded, and the survivors
    are re-joined with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in arabic_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
# Step 4: Clean Text
def clean_text(text):
    """Strip punctuation/symbols, digits and ASCII Latin letters from ``text``.

    Three removals are applied in order. Whitespace is never touched, so
    removed tokens can leave consecutive or trailing spaces behind, and the
    underscore survives because it counts as a word character.
    """
    removal_patterns = (
        r'[^\w\s]',   # anything that is neither a word character nor whitespace
        r'\d+',       # runs of digits
        r'[A-Za-z]',  # ASCII Latin letters
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text

In [None]:
# Full Preprocessing Pipeline
def preprocess_text(text):
    """Run the full cleaning pipeline on one string.

    Steps, in order: letter normalization, diacritic removal, stop-word
    removal, then punctuation/digit/Latin-letter removal.

    NOTE(review): stop words are matched after letter normalization and
    before punctuation is stripped, so a stop word written with a hamza form
    or attached to punctuation may not match the stop-word list - confirm
    whether this ordering is intended.
    """
    pipeline_steps = (
        normalize_arabic,
        remove_diacritics,
        remove_stop_words,
        clean_text,
    )
    for step in pipeline_steps:
        text = step(text)
    return text

In [None]:
translated['Preprocessed_Text'] = translated['Text_arabic'].apply(preprocess_text)

In [None]:
translated

Unnamed: 0,Text,Label,Text_arabic,Preprocessed_Text
0,Fuck La Tunisie Mouch blédi nikha b Fasma Sous...,abusive,فاك ال تونسية مش بليدي نكها ب فاصمة سوسة مش عا...,فاك ال تونسيه مش بليدي نكها فاصمه سوسه مش عاصم...
1,ken hedha mouch rap enti fech tbi3 lzebi :p,abusive,كن هذا مش راب إنت فش تبيع لزبي :p,مش راب انت فش تبيع لزبي
2,fuck you chnawa teswa 7a9 plazma :p,abusive,فاك يو شنوا تسوى حق بلازما :p,فاك يو شنوا تسوي حق بلازما
3,chouf chanteur yghani l'amour lweld l9ahba fik...,abusive,,
4,3omrek matjib 7jar l3arka fiha kartouch ( chko...,normal,عمرك ماتجيب حجر لعركة فيها كرطوش ( شكون حافتها ),عمرك ماتجيب حجر لعركه كرطوش شكون حافتها
...,...,...,...,...
1095,ija rdha3li papichi 5ali l'omek rachfa tassma3...,abusive,اجة رذعلي ببيش خلي لأمك رشفة تسمع ل نكتك إتكرز...,اجه رذعلي ببيش خلي لامك رشفه تسمع نكتك اتكرز ف...
1096,ايجا ارضعلي papishi خلي لامك رشفة تسمع الي نكت...,abusive,ايجا ارضعلي ببيش خلي لامك رشفة تسمع الي نكتك ت...,ايجا ارضعلي ببيش خلي لامك رشفه تسمع الي نكتك ت...
1097,امك نعشقها مطرشقها شحال لعبت بالعصفور سقصي على...,hate,امك نعشقها مطرشقها شحال لعبت بالعصفور سقصي على...,امك نعشقها مطرشقها شحال لعبت بالعصفور سقصي علي...
1098,se7liiiiiiiiiii\nbellehi bara nik ommmmeekkkkkk,hate,se7liiiiiiiiiii\nbellehi برى نيك أمميككك,بري نيك امميككك


## Combining data

In [None]:
# Keep only the cleaned text and its label, and expose the cleaned text under
# the column name 'Text'.
translated_copy = translated[['Preprocessed_Text', 'Label']].rename(
    columns={'Preprocessed_Text': 'Text'}
)

In [None]:
t_hsab_copy = t_hsab.copy()

In [None]:
# Build a class-balanced dataset by under-sampling the 'normal' class.
# All 'abusive' and 'hate' rows from both corpora are kept; 'normal' rows are
# capped at the size of the larger of those two classes.

# Separate classes in the datasets (using the copies)
translated_normal = translated_copy[translated_copy['Label'] == 'normal']
translated_abusive = translated_copy[translated_copy['Label'] == 'abusive']
translated_hate = translated_copy[translated_copy['Label'] == 'hate']

t_hsab_normal = t_hsab_copy[t_hsab_copy['Label'] == 'normal']
t_hsab_abusive = t_hsab_copy[t_hsab_copy['Label'] == 'abusive']
t_hsab_hate = t_hsab_copy[t_hsab_copy['Label'] == 'hate']

# Counts for balance
num_abusive = len(translated_abusive) + len(t_hsab_abusive)
num_hate = len(translated_hate) + len(t_hsab_hate)
target_normal_count = max(num_abusive, num_hate)

# Translated 'normal' rows are used first. If there are already more of them
# than the target, down-sample them instead of asking for a negative sample.
normal_from_translated = translated_normal
if len(normal_from_translated) > target_normal_count:
    normal_from_translated = normal_from_translated.sample(target_normal_count, random_state=42)

# Top up from T-HSAB, never requesting more rows than it actually holds
# (DataFrame.sample raises when n exceeds the population without replacement).
num_t_hsab_normal_needed = min(
    target_normal_count - len(normal_from_translated),
    len(t_hsab_normal),
)
combined_normal = pd.concat([
    normal_from_translated,
    t_hsab_normal.sample(num_t_hsab_normal_needed, random_state=42),
])

# Combine all classes for the balanced dataset
balanced_dataset = pd.concat([combined_normal, translated_abusive, t_hsab_abusive, translated_hate, t_hsab_hate])

# Shuffle the dataset to mix the classes
balanced_dataset = balanced_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
print(balanced_dataset['Label'].value_counts())

Label
normal     1259
abusive    1259
hate       1175
Name: count, dtype: int64


In [None]:
# Save or display the balanced dataset
# balanced_dataset.to_csv('Data/balanced_dataset.csv', index=False)

## Modeling

In [None]:
from transformers import pipeline

# Builds a text-classification pipeline from the TunBERT checkpoint, running
# the checkpoint's own remote code (trust_remote_code=True).
# NOTE(review): `pipe` is not referenced by any later cell shown here, and no
# `device` is passed, so the recorded output reports the model is placed on
# CPU - confirm whether this cell is still needed.
pipe = pipeline("text-classification", model="tunis-ai/TunBERT", trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

config_tunbert.py:   0%|          | 0.00/1.94k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tunis-ai/TunBERT:
- config_tunbert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_tunbert.py:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tunis-ai/TunBERT:
- modeling_tunbert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The tokenizer loaded here is the one used for length statistics and for the
# training/validation datasets further down.
tokenizer = AutoTokenizer.from_pretrained("tunis-ai/TunBERT", trust_remote_code=True)
# NOTE(review): `model` is rebound to a BERTClassifier in a later cell before
# this instance is used anywhere in the cells shown - confirm whether this
# load is still needed.
model = AutoModelForSequenceClassification.from_pretrained("tunis-ai/TunBERT", trust_remote_code=True)

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [None]:
from tqdm import tqdm

In [None]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', max_length=...,
            padding='max_length', truncation=True)`` and returning a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length each example is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sample_text = self.texts[idx]
        sample_label = self.labels[idx]
        encoded = self.tokenizer(
            sample_text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # The tokenizer output is flattened to 1-D so the DataLoader can
        # stack items into a batch.
        item = {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
        }
        item['label'] = torch.tensor(sample_label)
        return item

In [None]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        bert_model_name: Checkpoint name or path passed to
            ``BertModel.from_pretrained``.
        num_classes: Number of output logits.

    NOTE(review): when this class is instantiated with "tunis-ai/TunBERT" in
    this notebook, the recorded output warns that the BertModel weights
    (embeddings and encoder layers) were newly initialized rather than taken
    from the checkpoint. Confirm that the pretrained weights are really being
    loaded; otherwise the encoder is trained from random initialization.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of token ids and masks."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Classify from the encoder's pooled output.
        return self.fc(self.dropout(encoder_out.pooler_output))

In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning logits.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Mean cross-entropy loss over the batches of this epoch as a float,
        or NaN if ``data_loader`` yielded no batches. Callers that ignore
        the return value are unaffected.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()  # built once instead of once per batch
    running_loss = 0.0
    num_batches = 0
    for batch in tqdm(data_loader, desc="Training", unit="batch"):
        optimizer.zero_grad()  # Reset the gradients
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)
        loss.backward()  # Compute gradients
        optimizer.step()  # Update parameters
        scheduler.step()  # Update learning rate schedule
        running_loss += loss.item()
        num_batches += 1
    if num_batches == 0:
        return float('nan')
    return running_loss / num_batches


In [None]:
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader``.

    The model is put in eval mode and run without gradient tracking; the
    predicted class of each example is the index of its largest logit.

    Returns:
        A tuple ``(accuracy, report)`` from ``accuracy_score`` and
        ``classification_report`` over all examples seen.
    """
    model.eval()
    y_pred, y_true = [], []
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)
            logits = model(input_ids=ids, attention_mask=mask)
            batch_pred = torch.max(logits, dim=1).indices
            y_pred += batch_pred.cpu().tolist()
            y_true += targets.cpu().tolist()
    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    return accuracy, report

In [None]:
def load_data_labeled(df):
    """Split a labelled frame into parallel lists of texts and class ids.

    Labels are encoded as 'hate' -> 1 and 'abusive' -> 2; every other value
    (including 'normal') becomes 0.

    Args:
        df: DataFrame with 'Text' and 'Label' columns.

    Returns:
        ``(texts, labels)`` as two lists of equal length.
    """
    class_ids = {'hate': 1, 'abusive': 2}
    texts = df['Text'].tolist()
    labels = [class_ids.get(name, 0) for name in df['Label'].tolist()]
    return texts, labels


In [None]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Classify a single text as "hate", "abusive" or "normal".

    The text is tokenized as given (no preprocessing is applied here),
    padded/truncated to ``max_length``, and passed through ``model`` in eval
    mode without gradient tracking.

    Returns:
        "hate" for predicted class 1, "abusive" for class 2, otherwise
        "normal".
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask)
        predicted = torch.max(logits, dim=1).indices

    class_id = predicted.item()
    if class_id == 1:
        return "hate"
    if class_id == 2:
        return "abusive"
    return "normal"


In [None]:
texts,labels = load_data_labeled(balanced_dataset)

In [None]:
print(len(texts))
print(len(labels))

3693
3693


In [None]:
import numpy as np

# Calculate token lengths for each text
text_lengths = [len(tokenizer.encode(text)) for text in texts]

# Summary statistics of the token lengths. The observed maximum gets its own
# name so that (re-)running this cell can never overwrite the `max_length`
# padding/truncation hyperparameter.
longest_text_length = max(text_lengths)
mean_length = np.mean(text_lengths)
median_length = np.median(text_lengths)
std_length = np.std(text_lengths)

print(f"Max length: {longest_text_length}")
print(f"Mean length: {mean_length}")
print(f"Median length: {median_length}")
print(f"Standard Deviation: {std_length}")

Max length: 1137
Mean length: 50.83482263742215
Median length: 30.0
Standard Deviation: 65.38448273613784


In [None]:
# Training configuration.
num_classes = 3        # class ids 0/1/2 produced by load_data_labeled
max_length = 256       # tokens per example after padding/truncation
batch_size = 32        # examples per DataLoader batch
num_epochs = 20        # full passes over the training split
learning_rate = 1e-5   # learning rate handed to the optimizer

In [None]:
# Hold out 20% of the examples for validation, with a fixed random_state.
# NOTE(review): the split is not stratified, so class proportions may differ
# between the two parts - consider passing `stratify=labels`.
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [None]:
# Wrap both splits in tokenizing datasets. Only the training loader
# shuffles; the validation loader keeps the default order.
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [None]:
# Select the device here: this cell runs before the cell that originally
# defined `device`, so on a fresh kernel the name would not exist yet.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): the output recorded for this cell warns that the BertModel
# weights were newly initialized instead of loaded from the checkpoint -
# confirm the pretrained TunBERT weights are actually being used.
model = BERTClassifier(bert_model_name="tunis-ai/TunBERT", num_classes=num_classes).to(device) # move model to device

Some weights of BertModel were not initialized from the model checkpoint at tunis-ai/TunBERT and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
device

device(type='cuda')

In [None]:
# Use PyTorch's AdamW: the AdamW class exported by transformers is deprecated
# in favour of torch.optim.AdamW. eps=1e-6 keeps the epsilon that the
# transformers implementation used by default; weight decay is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.01)



In [None]:
# One optimizer/scheduler step per batch, for every epoch.
total_steps = len(train_dataloader) * num_epochs

In [None]:
# Linear learning-rate schedule with warmup over the first 10% of the
# training steps (the scheduler is stepped once per batch in `train`).
warmup_steps = int(0.1 * total_steps)  # 10% of total steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,  # Set warmup steps
    num_training_steps=total_steps  # Total training steps
)

In [None]:
# Train for num_epochs epochs and print validation accuracy plus the
# per-class report after each one.
# NOTE(review): no checkpoint is kept for the best-scoring epoch, so whatever
# is saved afterwards is the state after the final epoch.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/20


Training: 100%|██████████| 93/93 [02:00<00:00,  1.29s/batch]


Validation Accuracy: 0.4777
              precision    recall  f1-score   support

           0       0.66      0.60      0.63       277
           1       0.38      0.89      0.53       207
           2       1.00      0.02      0.03       255

    accuracy                           0.48       739
   macro avg       0.68      0.50      0.40       739
weighted avg       0.70      0.48      0.39       739

Epoch 2/20


Training: 100%|██████████| 93/93 [01:59<00:00,  1.28s/batch]


Validation Accuracy: 0.5643
              precision    recall  f1-score   support

           0       0.58      0.80      0.67       277
           1       0.50      0.55      0.52       207
           2       0.65      0.32      0.43       255

    accuracy                           0.56       739
   macro avg       0.58      0.56      0.54       739
weighted avg       0.58      0.56      0.55       739

Epoch 3/20


Training: 100%|██████████| 93/93 [01:59<00:00,  1.28s/batch]


Validation Accuracy: 0.6089
              precision    recall  f1-score   support

           0       0.78      0.60      0.68       277
           1       0.50      0.51      0.50       207
           2       0.56      0.70      0.62       255

    accuracy                           0.61       739
   macro avg       0.62      0.60      0.60       739
weighted avg       0.63      0.61      0.61       739

Epoch 4/20


Training: 100%|██████████| 93/93 [01:59<00:00,  1.28s/batch]


Validation Accuracy: 0.5765
              precision    recall  f1-score   support

           0       0.76      0.60      0.67       277
           1       0.43      0.83      0.57       207
           2       0.72      0.34      0.46       255

    accuracy                           0.58       739
   macro avg       0.64      0.59      0.57       739
weighted avg       0.65      0.58      0.57       739

Epoch 5/20


Training: 100%|██████████| 93/93 [01:59<00:00,  1.28s/batch]


Validation Accuracy: 0.6157
              precision    recall  f1-score   support

           0       0.81      0.64      0.72       277
           1       0.46      0.78      0.58       207
           2       0.70      0.45      0.55       255

    accuracy                           0.62       739
   macro avg       0.66      0.63      0.61       739
weighted avg       0.67      0.62      0.62       739

Epoch 6/20


Training: 100%|██████████| 93/93 [01:59<00:00,  1.29s/batch]


Validation Accuracy: 0.6184
              precision    recall  f1-score   support

           0       0.67      0.74      0.70       277
           1       0.53      0.55      0.54       207
           2       0.63      0.54      0.58       255

    accuracy                           0.62       739
   macro avg       0.61      0.61      0.61       739
weighted avg       0.62      0.62      0.62       739

Epoch 7/20


Training: 100%|██████████| 93/93 [01:59<00:00,  1.28s/batch]


Validation Accuracy: 0.6346
              precision    recall  f1-score   support

           0       0.78      0.69      0.73       277
           1       0.54      0.48      0.51       207
           2       0.58      0.70      0.64       255

    accuracy                           0.63       739
   macro avg       0.63      0.62      0.62       739
weighted avg       0.64      0.63      0.63       739

Epoch 8/20


Training: 100%|██████████| 93/93 [01:59<00:00,  1.28s/batch]


Validation Accuracy: 0.6373
              precision    recall  f1-score   support

           0       0.77      0.69      0.73       277
           1       0.65      0.34      0.45       207
           2       0.55      0.82      0.66       255

    accuracy                           0.64       739
   macro avg       0.66      0.62      0.61       739
weighted avg       0.66      0.64      0.63       739

Epoch 9/20


Training: 100%|██████████| 93/93 [01:59<00:00,  1.28s/batch]


Validation Accuracy: 0.6292
              precision    recall  f1-score   support

           0       0.71      0.72      0.72       277
           1       0.53      0.58      0.56       207
           2       0.62      0.56      0.59       255

    accuracy                           0.63       739
   macro avg       0.62      0.62      0.62       739
weighted avg       0.63      0.63      0.63       739

Epoch 10/20


Training: 100%|██████████| 93/93 [01:59<00:00,  1.28s/batch]


Validation Accuracy: 0.6319
              precision    recall  f1-score   support

           0       0.77      0.69      0.72       277
           1       0.65      0.35      0.46       207
           2       0.54      0.80      0.64       255

    accuracy                           0.63       739
   macro avg       0.65      0.61      0.61       739
weighted avg       0.65      0.63      0.62       739

Epoch 11/20


Training: 100%|██████████| 93/93 [01:59<00:00,  1.28s/batch]


Validation Accuracy: 0.6509
              precision    recall  f1-score   support

           0       0.82      0.65      0.73       277
           1       0.53      0.66      0.59       207
           2       0.63      0.65      0.64       255

    accuracy                           0.65       739
   macro avg       0.66      0.65      0.65       739
weighted avg       0.67      0.65      0.66       739

Epoch 12/20


Training: 100%|██████████| 93/93 [01:59<00:00,  1.28s/batch]


Validation Accuracy: 0.6428
              precision    recall  f1-score   support

           0       0.84      0.61      0.71       277
           1       0.54      0.59      0.56       207
           2       0.59      0.72      0.65       255

    accuracy                           0.64       739
   macro avg       0.66      0.64      0.64       739
weighted avg       0.67      0.64      0.65       739

Epoch 13/20


Training: 100%|██████████| 93/93 [01:59<00:00,  1.28s/batch]


Validation Accuracy: 0.6441
              precision    recall  f1-score   support

           0       0.78      0.68      0.73       277
           1       0.54      0.54      0.54       207
           2       0.60      0.69      0.64       255

    accuracy                           0.64       739
   macro avg       0.64      0.64      0.64       739
weighted avg       0.65      0.64      0.65       739

Epoch 14/20


Training: 100%|██████████| 93/93 [01:59<00:00,  1.28s/batch]


Validation Accuracy: 0.6428
              precision    recall  f1-score   support

           0       0.76      0.70      0.73       277
           1       0.54      0.54      0.54       207
           2       0.61      0.66      0.64       255

    accuracy                           0.64       739
   macro avg       0.64      0.63      0.64       739
weighted avg       0.65      0.64      0.64       739

Epoch 15/20


Training: 100%|██████████| 93/93 [01:59<00:00,  1.28s/batch]


Validation Accuracy: 0.6631
              precision    recall  f1-score   support

           0       0.80      0.69      0.74       277
           1       0.56      0.61      0.58       207
           2       0.63      0.67      0.65       255

    accuracy                           0.66       739
   macro avg       0.66      0.66      0.66       739
weighted avg       0.67      0.66      0.67       739

Epoch 16/20


Training: 100%|██████████| 93/93 [01:58<00:00,  1.28s/batch]


Validation Accuracy: 0.6441
              precision    recall  f1-score   support

           0       0.80      0.68      0.73       277
           1       0.51      0.75      0.61       207
           2       0.66      0.53      0.59       255

    accuracy                           0.64       739
   macro avg       0.66      0.65      0.64       739
weighted avg       0.67      0.64      0.65       739

Epoch 17/20


Training: 100%|██████████| 93/93 [01:59<00:00,  1.28s/batch]


Validation Accuracy: 0.6536
              precision    recall  f1-score   support

           0       0.83      0.64      0.72       277
           1       0.54      0.63      0.58       207
           2       0.62      0.69      0.66       255

    accuracy                           0.65       739
   macro avg       0.66      0.65      0.65       739
weighted avg       0.68      0.65      0.66       739

Epoch 18/20


Training: 100%|██████████| 93/93 [01:59<00:00,  1.28s/batch]


Validation Accuracy: 0.6401
              precision    recall  f1-score   support

           0       0.83      0.61      0.70       277
           1       0.54      0.58      0.56       207
           2       0.59      0.72      0.65       255

    accuracy                           0.64       739
   macro avg       0.65      0.64      0.64       739
weighted avg       0.66      0.64      0.64       739

Epoch 19/20


Training: 100%|██████████| 93/93 [01:59<00:00,  1.28s/batch]


Validation Accuracy: 0.6482
              precision    recall  f1-score   support

           0       0.81      0.65      0.72       277
           1       0.52      0.69      0.59       207
           2       0.64      0.62      0.63       255

    accuracy                           0.65       739
   macro avg       0.66      0.65      0.65       739
weighted avg       0.67      0.65      0.65       739

Epoch 20/20


Training: 100%|██████████| 93/93 [01:59<00:00,  1.28s/batch]


Validation Accuracy: 0.6549
              precision    recall  f1-score   support

           0       0.80      0.67      0.73       277
           1       0.54      0.63      0.58       207
           2       0.63      0.66      0.64       255

    accuracy                           0.65       739
   macro avg       0.66      0.65      0.65       739
weighted avg       0.67      0.65      0.66       739



In [None]:
ls

[0m[01;34mCode[0m/  convert_best  [01;34mData[0m/  Links.gdoc  [01;34mModels[0m/  [01;34mPapers[0m/


In [None]:
# torch.save(model.state_dict(), "Models/TunBert_2/bert_model.pth")

In [None]:
# tokenizer.save_pretrained("Models/TunBert_2/")

('Models/TunBert_2/tokenizer_config.json',
 'Models/TunBert_2/special_tokens_map.json',
 'Models/TunBert_2/vocab.txt',
 'Models/TunBert_2/added_tokens.json',
 'Models/TunBert_2/tokenizer.json')

In [None]:
# model = BERTClassifier(bert_model_name="tunis-ai/TunBERT", num_classes=3)
# model.load_state_dict(torch.load("Models/bert_model.pth"))
# model.to(device)