In [1]:
import numpy as np
import random
import torch
from torch import nn
import transformers
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils.class_weight import compute_class_weight
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from rich import print
import pandas as pd
from datasets import Dataset, Value, ClassLabel, Features, DatasetDict, load_dataset, load_from_disk

2024-08-11 01:23:04.941824: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-11 01:23:04.941915: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-11 01:23:04.943333: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-11 01:23:04.952369: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data_path = '../../data/result/final_suggestions_augmented_5_examples_hf_balanced1/'
data = load_from_disk(data_path)

In [3]:
print(f"Training Example:\n{data['train']['text'][1]}")

In [4]:
test_path = '../../data/test_gold_labels/test_with_suggestions_df_majority_vote_gpt4o_preferred.csv'

In [5]:
test_df = pd.read_csv(test_path)

In [6]:
test_df.head()

Unnamed: 0,text,updated_labels
0,اقتراحاتي للإضافة: everything is spectacular a...,none
1,اقتراحاتي للإضافة: thank you for everything,none
2,اقتراحاتي للإضافة: أتمنى ان يتم تطبيقها بشكل فعال,needs to be added
3,اقتراحاتي للإضافة: ان يتم طليق العملي بشكل مكثف,needs to be added
4,اقتراحاتي للإضافة: change the contents all,needs to be added


In [7]:
test_df = test_df.rename(columns={'updated_labels': 'labels'})

In [8]:
test_df['labels'].unique()

array(['none', 'needs to be added', 'needs to be removed',
       'needs enhancements'], dtype=object)

In [9]:
test_df.isna().sum()

text      0
labels    0
dtype: int64

In [10]:
test_df = test_df.fillna('none')

In [11]:
test_df['labels'].unique()

array(['none', 'needs to be added', 'needs to be removed',
       'needs enhancements'], dtype=object)

In [12]:
test_df['labels'].value_counts()

needs to be added      110
needs enhancements      55
none                    40
needs to be removed     14
Name: labels, dtype: int64

In [13]:
train_df = data['train'].to_pandas()

In [14]:
train_df['labels'].apply(data['train'].features['labels'].int2str).value_counts()

none                   174
needs enhancements     174
needs to be added      174
needs to be removed     84
Name: labels, dtype: int64

In [15]:
label_names = sorted(data['train'].features['labels'].names)

features = Features(
    {
        'id': Value(dtype='string'),
        'text': Value(dtype='string'),
        'labels': ClassLabel(names=label_names, num_classes=len(label_names), id=None)
    }
)
#################################################################################
test_hf = Dataset.from_dict(
        {
            'id': list(test_df.index.astype(str)),
            'text': list(test_df['text']),
            'labels': list(test_df['labels'])
        },
        features=features
    )
#################################################################################
random_state = 42
temp_data = test_hf.train_test_split(test_size=0.5, seed=random_state, stratify_by_column='labels')

In [16]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'labels'],
        num_rows: 606
    })
})

In [17]:
print('Train Data Length:', len(data['train']))

In [18]:
data['validation'] = temp_data['train']
data['test'] = temp_data['test']

In [19]:
print(data)

In [20]:
print(data['train'].features['labels'].int2str(list(range(len(label_names)))))
print(data['test'].features['labels'].int2str(list(range(len(label_names)))))
print(data['validation'].features['labels'].int2str(list(range(len(label_names)))))

In [21]:
model_checkpoint = 'FacebookAI/xlm-roberta-large'

In [22]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [23]:
label_feature = data['train'].features['labels']
label_names = label_feature.names
print(label_names)

In [24]:
label2id = {label: label_feature.str2int(label) for label in label_names}
id2label = {v: k for k, v in label2id.items()}

In [25]:
print(label2id)
print(id2label)

In [26]:
def tokenize_function(example):
    return tokenizer(example['text'], truncation=True)

In [27]:
tokenized_dataset = data.map(tokenize_function, batched=True, remove_columns=['text'])

Map:   0%|          | 0/109 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

In [28]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 606
    })
    validation: Dataset({
        features: ['id', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 109
    })
    test: Dataset({
        features: ['id', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 110
    })
})

In [29]:
tokenized_train = tokenized_dataset['train']
tokenized_validation = tokenized_dataset['validation']
tokenized_test = tokenized_dataset['test']

In [30]:
def model_init():
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        id2label=id2label,
        label2id=label2id,
    )

    return model

In [31]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    result = classification_report(labels, predictions, output_dict=True, zero_division=0)
    # print(result)
    metrics = {f'{id2label[int(k)]} f1-score': v['f1-score'] for k, v in result.items() if k.isdigit()}
    metrics['Accuracy'] = result['accuracy']
    metrics['Macro f1-score'] = result['macro avg']['f1-score']
    return metrics

In [32]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [33]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # compute custom loss
        y = self.train_dataset['labels']
        class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=np.array(y))
        class_weights = torch.tensor(class_weights).float().to(device)
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [34]:
batch_size = 8
num_epochs = 5

In [35]:
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
torch.cuda.manual_seed(random_state)
transformers.set_seed(random_state)


torch.cuda.manual_seed_all(random_state)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [36]:
args = TrainingArguments(
    f'../../models/model_suggestions',
    overwrite_output_dir=True,
    num_train_epochs=num_epochs,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    seed=random_state,
    data_seed=random_state,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=1e-5,
    weight_decay=0.001,
    load_best_model_at_end=True,
    save_total_limit=1,
    report_to='none'
)

trainer = CustomTrainer(
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    data_collator=data_collator,
    tokenizer=tokenizer,
    model_init=model_init,
    compute_metrics=compute_metrics
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer.train()

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Needs enhancements f1-score,Needs to be added f1-score,Needs to be removed f1-score,None f1-score,Accuracy,Macro f1-score
1,No log,1.304071,0.457143,0.534884,0.0,0.0,0.431193,0.248007
2,No log,0.958935,0.701754,0.714286,0.173913,0.75,0.66055,0.584988
3,No log,0.731573,0.714286,0.819672,0.25,0.75,0.761468,0.633489
4,No log,0.668232,0.727273,0.828829,0.307692,0.871795,0.779817,0.683897
5,No log,0.652301,0.68,0.842975,0.4,0.864865,0.788991,0.69696


In [None]:
predictions = trainer.predict(tokenized_test)

In [None]:
true_labels, preds = predictions.label_ids, predictions.predictions
pred_labels = np.argmax(preds, axis=-1)

print(classification_report(true_labels, pred_labels, target_names=label_names))

In [None]:
cm = confusion_matrix(true_labels, pred_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
disp.plot(cmap="Blues", colorbar=True, xticks_rotation='vertical')

In [None]:
# xlm roberta large, 1e-5, decay 0.001, no warm up, duplicate data=False, custom trainer

{
    'test_loss': 0.5525250434875488,
    'test_needs enhancements f1-score': 0.75,
    'test_needs to be added f1-score': 0.8571428571428571,
    'test_needs to be removed f1-score': 0.7692307692307692,
    'test_none f1-score': 0.8717948717948718,
    'test_Accuracy': 0.8272727272727273,
    'test_Macro f1-score': 0.8120421245421245,
    'test_runtime': 0.7333,
    'test_samples_per_second': 150.011,
    'test_steps_per_second': 19.092
}


# xlm roberta base, 2e-5, decay 0.0, no warm up, duplicate data=true, custom trainer
{
    'test_loss': 0.6510756015777588,
    'test_needs enhancements f1-score': 0.75,
    'test_needs to be added f1-score': 0.8135593220338982,
    'test_needs to be removed f1-score': 0.7692307692307692,
    'test_none f1-score': 0.7272727272727274,
    'test_Accuracy': 0.7818181818181819,
    'test_Macro f1-score': 0.7650157046343488,
    'test_runtime': 0.3443,
    'test_samples_per_second': 319.471,
    'test_steps_per_second': 40.66
}

print()

In [None]:
{
    'test_loss': 1.2701101303100586,
    'test_needs enhancements f1-score': 0.4675324675324675,
    'test_needs to be added f1-score': 0.4897959183673469,
    'test_needs to be removed f1-score': 0.0,
    'test_none f1-score': 0.6842105263157894,
    'test_Accuracy': 0.5,
    'test_Macro f1-score': 0.41038472805390097,
    'test_runtime': 0.4405,
    'test_samples_per_second': 249.709,
    'test_steps_per_second': 31.781
}

# performance on test data annotated with llms, train data with augmentation {target_length: 86}, with custom trainer

{
    'test_loss': 0.7613580822944641,
    'test_needs enhancements f1-score': 0.7333333333333333,
    'test_needs to be added f1-score': 0.8571428571428572,
    'test_needs to be removed f1-score': 0.5,
    'test_none f1-score': 0.6666666666666667,
    'test_Accuracy': 0.7818181818181819,
    'test_Macro f1-score': 0.6892857142857143,
    'test_runtime': 0.4128,
    'test_samples_per_second': 266.481,
    'test_steps_per_second': 33.916
}

# performance on test data annotated with llms, train data with augmentation {target_length: 86}, with trainer

{
    'test_loss': 0.6633874177932739,
    'test_needs enhancements f1-score': 0.7213114754098361,
    'test_needs to be added f1-score': 0.8666666666666667,
    'test_needs to be removed f1-score': 0.25,
    'test_none f1-score': 0.7741935483870968,
    'test_Accuracy': 0.7909090909090909,
    'test_Macro f1-score': 0.6530429226158998,
    'test_runtime': 0.3959,
    'test_samples_per_second': 277.836,
    'test_steps_per_second': 35.361
}


# performance on test data annotated with llms, train data with augmentation {target_length: 200}, with custom trainer

{
    'test_loss': 0.7026541233062744,
    'test_needs enhancements f1-score': 0.7586206896551724,
    'test_needs to be added f1-score': 0.8888888888888888,
    'test_needs to be removed f1-score': 0.4444444444444444,
    'test_none f1-score': 0.8333333333333333,
    'test_Accuracy': 0.8272727272727273,
    'test_Macro f1-score': 0.7313218390804597,
    'test_runtime': 0.4192,
    'test_samples_per_second': 262.416,
    'test_steps_per_second': 33.398
}

# performance on test data annotated with llms, train data with augmentation {target_length: 200}, with trainer

{
    'test_loss': 0.7282879948616028,
    'test_needs enhancements f1-score': 0.6909090909090909,
    'test_needs to be added f1-score': 0.8503937007874016,
    'test_needs to be removed f1-score': 0.2222222222222222,
    'test_none f1-score': 0.6896551724137931,
    'test_Accuracy': 0.7636363636363637,
    'test_Macro f1-score': 0.613295046583127,
    'test_runtime': 0.3994,
    'test_samples_per_second': 275.388,
    'test_steps_per_second': 35.049
}


# performance on test data annotated with llms, train data with augmentation {target_length: 174, duplicate: True}, with trainer or custom trainer
{
    'test_loss': 0.6391478180885315,
    'test_needs enhancements f1-score': 0.7272727272727273,
    'test_needs to be added f1-score': 0.8695652173913043,
    'test_needs to be removed f1-score': 0.5,
    'test_none f1-score': 0.7894736842105262,
    'test_Accuracy': 0.8,
    'test_Macro f1-score': 0.7215779072186395,
    'test_runtime': 0.4117,
    'test_samples_per_second': 267.168,
    'test_steps_per_second': 34.003
}


# performance on test data annotated with llms, train data with augmentation {target_length: 300, duplicate: True}, with custom trainer
{
    'test_loss': 0.6742064952850342,
    'test_needs enhancements f1-score': 0.7169811320754716,
    'test_needs to be added f1-score': 0.8717948717948718,
    'test_needs to be removed f1-score': 0.6153846153846153,
    'test_none f1-score': 0.7567567567567567,
    'test_Accuracy': 0.8,
    'test_Macro f1-score': 0.7402293440029288,
    'test_runtime': 0.4105,
    'test_samples_per_second': 267.978,
    'test_steps_per_second': 34.106
}


# performance on test data annotated with llms and human, train data with augmentation {target_length: 300, duplicate: True}, with custom trainer

{
    'test_loss': 0.6602676510810852,
    'test_needs enhancements f1-score': 0.7272727272727273,
    'test_needs to be added f1-score': 0.8108108108108109,
    'test_needs to be removed f1-score': 0.5714285714285714,
    'test_none f1-score': 0.8000000000000002,
    'test_Accuracy': 0.7727272727272727,
    'test_Macro f1-score': 0.7273780273780274,
    'test_runtime': 0.399,
    'test_samples_per_second': 275.711,
    'test_steps_per_second': 35.09
}

# performance on test data annotated with llms and human, train data with augmentation {target_length: 300, duplicate: True}, with custom trainer
# FacebookAI/xlm-roberta-base
{
    'test_loss': 0.5830352306365967,
    'test_needs enhancements f1-score': 0.7169811320754718,
    'test_needs to be added f1-score': 0.8141592920353982,
    'test_needs to be removed f1-score': 0.6666666666666666,
    'test_none f1-score': 0.8717948717948718,
    'test_Accuracy': 0.7909090909090909,
    'test_Macro f1-score': 0.7674004906431021,
    'test_runtime': 0.3282,
    'test_samples_per_second': 335.137,
    'test_steps_per_second': 42.654
}

print()