# Bert experimenty

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import tensorflow as tf
from torch.utils.data import Dataset
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score
from sklearn.preprocessing import LabelEncoder
from typing import Dict, List
import transformers
from transformers import AutoModel, BertTokenizerFast,BertTokenizer, DistilBertTokenizerFast
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, AutoModelForMaskedLM

In [None]:
# !pip install transformers[torch]
# !pip install accelerate -U
# !pip install transformers==4.30

Collecting accelerate>=0.21.0 (from transformers[torch])
  Using cached accelerate-0.29.3-py3-none-any.whl (297 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->transformers[t

## Načítanie a príprava dát

In [None]:
dataset = pd.read_csv('../Data/final_dataset_2_balanced.csv')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(dataset['processed_text'], dataset['author_id'], test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [None]:
label_encoder = LabelEncoder()

y_train = label_encoder.fit_transform(y_train)
y_valid = label_encoder.transform(y_valid)
y_test = label_encoder.transform(y_test)

## Bert

In [None]:
class CustomDataset(Dataset):

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='macro')
    precision = precision_score(y_true=labels, y_pred=pred, average='macro')
    f1 = f1_score(y_true=labels, y_pred=pred, average='macro')

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [None]:
x_train = list(X_train)
x_val = list(X_valid)
x_test = list(X_test)

y_train = list(y_train)
y_val = list(y_valid)
y_test = list(y_test)

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(x_train, truncation=True, padding=True)
val_encodings = tokenizer(x_val,truncation=True, padding=True)
test_encodings = tokenizer(x_test, truncation=True, padding=True)

In [None]:
train_dataset = CustomDataset(train_encodings, y_train)
val_dataset = CustomDataset(val_encodings, y_valid)
test_dataset = CustomDataset(test_encodings, y_test)

In [None]:
labels_count = len(dataset['author_id'].unique())
print(labels_count)

20


## Experiment 1 - learning_rate = 5e-5

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=12,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=300,
    weight_decay=0.01,
    learning_rate=5e-5,             #default
    logging_dir='./logs',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    fp16=True,
)

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=labels_count)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,2.971659,0.083333,0.07072,0.125758,0.064096
2,No log,2.829183,0.104167,0.017833,0.098864,0.027722
3,No log,2.382121,0.427083,0.393494,0.447392,0.385307
4,No log,2.07313,0.479167,0.521919,0.539291,0.437111
5,No log,1.472418,0.645833,0.739881,0.689767,0.673403
6,No log,1.12688,0.677083,0.728214,0.735184,0.693419
7,No log,0.84335,0.78125,0.8171,0.791147,0.778077
8,No log,0.826915,0.791667,0.811905,0.809481,0.784644
9,No log,0.956209,0.760417,0.77583,0.785671,0.764191
10,1.486100,0.884092,0.760417,0.743647,0.776981,0.743997


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=648, training_loss=1.1537413957678242, metrics={'train_runtime': 484.4315, 'train_samples_per_second': 21.353, 'train_steps_per_second': 1.338, 'total_flos': 2722060610666496.0, 'train_loss': 1.1537413957678242, 'epoch': 12.0})

In [None]:
trainer.predict(test_dataset)

PredictionOutput(predictions=array([[-0.7402344 ,  0.5058594 ,  6.9765625 , ...,  0.13659668,
         0.49658203, -0.4663086 ],
       [ 0.7348633 ,  0.47436523, -1.5703125 , ..., -1.4023438 ,
        -0.9394531 , -0.4873047 ],
       [-1.1621094 , -0.6645508 ,  0.8310547 , ..., -0.85595703,
         0.6196289 , -0.6557617 ],
       ...,
       [ 4.5820312 ,  0.171875  , -0.47680664, ..., -1.1650391 ,
        -0.8925781 , -0.66308594],
       [-0.5620117 ,  1.6005859 ,  6.9570312 , ...,  0.31713867,
         0.0333252 , -0.7294922 ],
       [-0.06744385, -0.6953125 , -1.1855469 , ..., -1.1894531 ,
        -0.6308594 ,  0.1104126 ]], dtype=float32), label_ids=array([ 2,  5, 16,  3,  4,  1, 19, 17, 13, 11,  1,  4,  4, 11,  9, 10, 12,
        8,  1,  3, 13,  7,  6, 19, 16,  9,  6,  0, 19,  7, 13, 14,  6,  8,
       13,  0,  3, 16, 19,  7, 17, 16,  5, 19, 11, 19, 19,  4,  4,  9,  8,
       14, 13, 10, 11, 19,  5,  5, 18, 13, 19, 10, 15, 16, 10,  8,  4,  3,
        0, 12,  3, 10,  2,  4, 1

## Experiment 4 - learning_rate = 8e-5

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=12,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=300,
    weight_decay=0.01,
    learning_rate=8e-5,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    fp16=True,
)

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=labels_count)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,2.974833,0.0625,0.060498,0.071688,0.030519
2,No log,2.741608,0.25,0.267004,0.304643,0.224802
3,No log,2.038671,0.4375,0.51471,0.482186,0.377606
4,No log,1.573408,0.552083,0.698564,0.609508,0.549903
5,No log,1.098896,0.65625,0.731484,0.685801,0.67336
6,No log,0.859665,0.75,0.801349,0.776245,0.753384
7,No log,0.785474,0.770833,0.771742,0.772457,0.758418
8,No log,0.834776,0.78125,0.784919,0.794085,0.77406
9,No log,0.815626,0.822917,0.824167,0.829048,0.809945
10,1.286400,0.819784,0.84375,0.8725,0.841645,0.841146


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=648, training_loss=0.995923587569484, metrics={'train_runtime': 490.951, 'train_samples_per_second': 21.069, 'train_steps_per_second': 1.32, 'total_flos': 2722060610666496.0, 'train_loss': 0.995923587569484, 'epoch': 12.0})

In [None]:
trainer.predict(test_dataset)

PredictionOutput(predictions=array([[ 0.09423828,  3.1914062 ,  7.046875  , ...,  0.01707458,
        -0.9189453 , -1.0449219 ],
       [-0.2253418 ,  0.41088867, -1.8085938 , ..., -1.4345703 ,
         0.47558594, -0.31274414],
       [ 0.42822266, -1.8330078 ,  0.93603516, ..., -0.54248047,
         0.7788086 , -1.0654297 ],
       ...,
       [ 4.5976562 ,  0.2397461 , -1.2568359 , ..., -1.2539062 ,
         0.9848633 , -0.96777344],
       [ 0.90234375,  7.1875    ,  3.0742188 , ...,  0.05117798,
        -1.5283203 , -1.2285156 ],
       [-0.19030762, -0.2565918 , -0.8857422 , ..., -0.25976562,
         0.23034668, -0.16296387]], dtype=float32), label_ids=array([ 2,  5, 16,  3,  4,  1, 19, 17, 13, 11,  1,  4,  4, 11,  9, 10, 12,
        8,  1,  3, 13,  7,  6, 19, 16,  9,  6,  0, 19,  7, 13, 14,  6,  8,
       13,  0,  3, 16, 19,  7, 17, 16,  5, 19, 11, 19, 19,  4,  4,  9,  8,
       14, 13, 10, 11, 19,  5,  5, 18, 13, 19, 10, 15, 16, 10,  8,  4,  3,
        0, 12,  3, 10,  2,  4, 1