In [None]:
pip install transformers==4.3.0



## 1. Data loader 

In [None]:
import pandas as pd
import numpy as np

DATA = 'drive/MyDrive/CODE/SentProd/dataset/Data_sent.xlsx'

data = pd.read_excel(DATA)

X = data['Cmt'].values
y = data['sentiment'].values

# 2.Multilingual transformers

In [None]:
# XLM-R
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, recall_score, precision_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

import torch

class BuildDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

results = []
results2 = []
results3 = []
results4 = []

confuses = []
fold = 0

model = AutoModelForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels = 6)
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base",use_fast=False)

kfold = StratifiedShuffleSplit(n_splits=5, test_size=0.2)

for train, test in kfold.split(X, y):
    X_train_fold = X[train]
    y_train_fold = y[train]

    # Keep the test set ratio
    X_test_fold = X[test]
    y_test_fold = y[test]

    train_encodings = tokenizer(X_train_fold.tolist(), truncation=True, padding=True, max_length=50)
    test_encodings = tokenizer(X_test_fold.tolist(), truncation=True, padding=True, max_length=50)

    train_dataset = BuildDataset(train_encodings, y_train_fold.tolist())
    test_dataset = BuildDataset(test_encodings, y_test_fold.tolist())

    training_args = TrainingArguments(
        output_dir="drive/MyDrive/CODE/SentProd/model/xlm-r-fold{}/".format(fold),          
        num_train_epochs=3,              
        per_device_train_batch_size=16,  
        per_device_eval_batch_size=16,   
        warmup_steps=500,                
        weight_decay=0.01,
        no_cuda=False,
        do_eval=False
    )

    trainer = Trainer(
        model=model,                         
        args=training_args,                  
        train_dataset=train_dataset,                  
    )

    trainer.train()
    trainer.save_model("drive/MyDrive/CODE/SentProd/model/xlm-r-fold{}/".format(fold))

    y_pred_classify = trainer.predict(test_dataset)

    y_pred = np.argmax(y_pred_classify.predictions, axis=-1)
    y_true = y_test_fold

    cf = confusion_matrix(y_true, y_pred)
    evaluation = f1_score(y_true, y_pred, average='macro')
    evaluation2 = accuracy_score(y_true, y_pred)
    evaluation3 = precision_score(y_true, y_pred, average='macro')
    evaluation4 = recall_score(y_true, y_pred, average='macro')

    print('===============================================')
    print("FOLD {}: F1 macro - {}, Accuracy - {}, Precision - {}, Recall  - {}".format(fold, evaluation, evaluation2, evaluation3, evaluation4))
    print(cf)
    confuses.append(cf)
    results.append(evaluation)
    results2.append(evaluation2)
    results3.append(evaluation3)
    results4.append(evaluation4)
    print('===============================================')


    fold = fold + 1

print("average F1-macro: {}".format(str(np.mean(results))))
print("average Accuracy: {}".format(str(np.mean(results2))))
print("average Precision: {}".format(str(np.mean(results3))))
print("average Recall: {}".format(str(np.mean(results4))))

np.set_printoptions(suppress=True)
print("average conf mat: {}".format(np.mean(confuses, axis=-3)))

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

Step,Training Loss
500,1.1466


FOLD 0: F1 macro - 0.8394670940162466, Accuracy - 0.8443514644351464, Precision - 0.8409645931894726, Recall  - 0.8390626663332963
[[197  28   6   0   2]
 [ 36 175  11   3   0]
 [  7  14 190   5   5]
 [  1   0  21 170  21]
 [  3   0   5  18 277]]


Step,Training Loss
500,0.2605


FOLD 1: F1 macro - 0.8481878410378432, Accuracy - 0.8527196652719665, Precision - 0.8509036759957287, Recall  - 0.846939731174564
[[199  26   4   1   3]
 [ 37 174   7   6   1]
 [  9  13 182  11   6]
 [  2   0   7 181  23]
 [  2   0   2  16 283]]


Step,Training Loss
500,0.1459


FOLD 2: F1 macro - 0.8607320457191137, Accuracy - 0.8652719665271966, Precision - 0.8625232972935025, Recall  - 0.8595692875146399
[[194  27   9   1   2]
 [ 34 180   7   3   1]
 [  6  10 195   6   4]
 [  1   1  11 178  22]
 [  1   0   3  12 287]]


Step,Training Loss
500,0.1242


FOLD 3: F1 macro - 0.8504385265709951, Accuracy - 0.8543933054393306, Precision - 0.8521737160901248, Recall  - 0.84970977294447
[[198  22   5   4   4]
 [ 36 176   8   5   0]
 [  7   9 188  11   6]
 [  1   2   7 180  23]
 [  1   0   2  21 279]]


Step,Training Loss
500,0.0948


FOLD 4: F1 macro - 0.8654590247945568, Accuracy - 0.8686192468619247, Precision - 0.8657644017317493, Recall  - 0.8662819091510237
[[196  28   6   1   2]
 [ 25 190   4   6   0]
 [  5  13 187  10   6]
 [  1   1   9 190  12]
 [  1   0   2  25 275]]
average F1-macro: 0.8528569064277512
average Accuracy: 0.857071129707113
average Precision: 0.8544659368601156
average Recall: 0.8523126734235987
average conf mat: [[196.8  26.2   6.    1.4   2.6]
 [ 33.6 179.    7.4   4.6   0.4]
 [  6.8  11.8 188.4   8.6   5.4]
 [  1.2   0.8  11.  179.8  20.2]
 [  1.6   0.    2.8  18.4 280.2]]


In [None]:
# distilBERT
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, recall_score, precision_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

import torch

class BuildDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

results = []
results2 = []
results3 = []
results4 = []

confuses = []
fold = 0

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels = 6)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased",use_fast=False)

kfold = StratifiedShuffleSplit(n_splits=5, test_size=0.2)

for train, test in kfold.split(X, y):
    X_train_fold = [t.lower() for t in X[train]]
    y_train_fold = y[train]

    # Keep the test set ratio
    X_test_fold = [t.lower() for t in X[test]]
    y_test_fold = y[test]

    train_encodings = tokenizer(X_train_fold, truncation=True, padding=True, max_length=50)
    test_encodings = tokenizer(X_test_fold, truncation=True, padding=True, max_length=50)

    train_dataset = BuildDataset(train_encodings, y_train_fold.tolist())
    test_dataset = BuildDataset(test_encodings, y_test_fold.tolist())

    training_args = TrainingArguments(
        output_dir="drive/MyDrive/CODE/SentProd/model/xlm-r-fold{}/".format(fold),          
        num_train_epochs=3,              
        per_device_train_batch_size=16,  
        per_device_eval_batch_size=16,   
        warmup_steps=500,                
        weight_decay=0.01,
        no_cuda=False,
        do_eval=False
    )

    trainer = Trainer(
        model=model,                         
        args=training_args,                  
        train_dataset=train_dataset,                  
    )

    trainer.train()
    trainer.save_model("drive/MyDrive/CODE/SentProd/model/xlm-r-fold{}/".format(fold))

    y_pred_classify = trainer.predict(test_dataset)

    y_pred = np.argmax(y_pred_classify.predictions, axis=-1)
    y_true = y_test_fold

    cf = confusion_matrix(y_true, y_pred)
    evaluation = f1_score(y_true, y_pred, average='macro')
    evaluation2 = accuracy_score(y_true, y_pred)
    evaluation3 = precision_score(y_true, y_pred, average='macro')
    evaluation4 = recall_score(y_true, y_pred, average='macro')

    print('===============================================')
    print("FOLD {}: F1 macro - {}, Accuracy - {}, Precision - {}, Recall  - {}".format(fold, evaluation, evaluation2, evaluation3, evaluation4))
    print(cf)
    confuses.append(cf)
    results.append(evaluation)
    results2.append(evaluation2)
    results3.append(evaluation3)
    results4.append(evaluation4)
    print('===============================================')


    fold = fold + 1

print("average F1-macro: {}".format(str(np.mean(results))))
print("average Accuracy: {}".format(str(np.mean(results2))))
print("average Precision: {}".format(str(np.mean(results3))))
print("average Recall: {}".format(str(np.mean(results4))))

np.set_printoptions(suppress=True)
print("average conf mat: {}".format(np.mean(confuses, axis=-3)))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Step,Training Loss
500,1.1991


FOLD 0: F1 macro - 0.7932667314219419, Accuracy - 0.796652719665272, Precision - 0.7951100807113731, Recall  - 0.7933364417662433
[[173  38  14   1   7]
 [ 28 179  14   3   1]
 [ 13  19 184   2   3]
 [  2   1  16 160  34]
 [  6   2  13  26 256]]


Step,Training Loss
500,0.3849


FOLD 1: F1 macro - 0.851918084675869, Accuracy - 0.8543933054393306, Precision - 0.851788853111643, Recall  - 0.8522759942452943
[[191  33   6   0   3]
 [ 25 189   5   4   2]
 [  7  12 192   7   3]
 [  0   1  11 180  21]
 [  6   0   4  24 269]]


Step,Training Loss
500,0.1365


FOLD 2: F1 macro - 0.8579827869228778, Accuracy - 0.8602510460251046, Precision - 0.8579831620848444, Recall  - 0.8583430500834417
[[190  31   6   1   5]
 [ 31 184   4   3   3]
 [  7  11 192   8   3]
 [  0   0   7 190  16]
 [  6   1   3  21 272]]


Step,Training Loss
500,0.0722


FOLD 3: F1 macro - 0.8419793029344363, Accuracy - 0.8460251046025105, Precision - 0.8419020883524787, Recall  - 0.8421360501136794
[[188  30  10   3   2]
 [ 32 183   4   4   2]
 [  6  15 189   8   3]
 [  0   2  15 177  19]
 [  5   1   1  22 274]]


Step,Training Loss
500,0.0652


FOLD 4: F1 macro - 0.84597391790385, Accuracy - 0.8493723849372385, Precision - 0.8467618203787127, Recall  - 0.8455624309636587
[[191  29  10   0   3]
 [ 32 180   5   5   3]
 [  5  10 196   5   5]
 [  0   2  14 174  23]
 [  5   2   4  18 274]]
average F1-macro: 0.8382241647717951
average Accuracy: 0.8413389121338911
average Precision: 0.8387092009278104
average Recall: 0.8383307934344636
average conf mat: [[186.6  32.2   9.2   1.    4. ]
 [ 29.6 183.    6.4   3.8   2.2]
 [  7.6  13.4 190.6   6.    3.4]
 [  0.4   1.2  12.6 176.2  22.6]
 [  5.6   1.2   5.   22.2 269. ]]


# 3. Monolingual transformers

In [None]:
# PhoBERT
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, recall_score, precision_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

import torch

class BuildDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

results = []
results2 = []
results3 = []
results4 = []

confuses = []
fold = 0

model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels = 6)
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base",use_fast=False)

kfold = StratifiedShuffleSplit(n_splits=5, test_size=0.2)

for train, test in kfold.split(X, y):
    X_train_fold = [t.lower() for t in X[train]]
    y_train_fold = y[train]

    # Keep the test set ratio
    X_test_fold = [t.lower() for t in X[test]]
    y_test_fold = y[test]

    train_encodings = tokenizer(X_train_fold, truncation=True, padding=True, max_length=50)
    test_encodings = tokenizer(X_test_fold, truncation=True, padding=True, max_length=50)

    train_dataset = BuildDataset(train_encodings, y_train_fold.tolist())
    test_dataset = BuildDataset(test_encodings, y_test_fold.tolist())

    training_args = TrainingArguments(
        output_dir="drive/MyDrive/CODE/SentProd/model/phobert-fold{}/".format(fold),          
        num_train_epochs=3,              
        per_device_train_batch_size=16,  
        per_device_eval_batch_size=16,   
        warmup_steps=500,                
        weight_decay=0.01,
        no_cuda=False,
        do_eval=False
    )

    trainer = Trainer(
        model=model,                         
        args=training_args,                  
        train_dataset=train_dataset,                  
    )

    trainer.train()
    trainer.save_model("drive/MyDrive/CODE/SentProd/model/phobert-fold{}/".format(fold))

    y_pred_classify = trainer.predict(test_dataset)

    y_pred = np.argmax(y_pred_classify.predictions, axis=-1)
    y_true = y_test_fold

    cf = confusion_matrix(y_true, y_pred)
    evaluation = f1_score(y_true, y_pred, average='macro')
    evaluation2 = accuracy_score(y_true, y_pred)
    evaluation3 = precision_score(y_true, y_pred, average='macro')
    evaluation4 = recall_score(y_true, y_pred, average='macro')

    print('===============================================')
    print("FOLD {}: F1 macro - {}, Accuracy - {}, Precision - {}, Recall  - {}".format(fold, evaluation, evaluation2, evaluation3, evaluation4))
    print(cf)
    confuses.append(cf)
    results.append(evaluation)
    results2.append(evaluation2)
    results3.append(evaluation3)
    results4.append(evaluation4)
    print('===============================================')


    fold = fold + 1

print("average F1-macro: {}".format(str(np.mean(results))))
print("average Accuracy: {}".format(str(np.mean(results2))))
print("average Precision: {}".format(str(np.mean(results3))))
print("average Recall: {}".format(str(np.mean(results4))))

np.set_printoptions(suppress=True)
print("average conf mat: {}".format(np.mean(confuses, axis=-3)))

Downloading:   0%|          | 0.00/557 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['

Downloading:   0%|          | 0.00/895k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


Step,Training Loss
500,1.1379


FOLD 0: F1 macro - 0.8679621994341866, Accuracy - 0.8711297071129707, Precision - 0.8684403620940108, Recall  - 0.868105172701984
[[191  33   7   1   1]
 [ 20 191  10   3   1]
 [  2   5 202   9   3]
 [  1   4   7 179  22]
 [  5   0   4  16 278]]


Step,Training Loss
500,0.2663


FOLD 1: F1 macro - 0.9176276782055712, Accuracy - 0.9196652719665271, Precision - 0.9173812603459328, Recall  - 0.9184165432804481
[[207  22   4   0   0]
 [ 14 203   6   2   0]
 [  2   4 211   0   4]
 [  0   2  10 193   8]
 [  0   0   2  16 285]]


Step,Training Loss
500,0.0981


FOLD 2: F1 macro - 0.902792150304275, Accuracy - 0.9054393305439331, Precision - 0.9026626262737928, Recall  - 0.9032138312931595
[[210  17   5   1   0]
 [ 23 194   4   4   0]
 [  3   5 203   6   4]
 [  0   0  10 191  12]
 [  0   0   0  19 284]]


Step,Training Loss
500,0.0744


FOLD 3: F1 macro - 0.905005821276075, Accuracy - 0.907112970711297, Precision - 0.9048668294785795, Recall  - 0.905335019751248
[[209  19   5   0   0]
 [ 19 197   5   2   2]
 [  2   7 202   7   3]
 [  0   1   6 193  13]
 [  0   0   0  20 283]]


Step,Training Loss
500,0.0686


FOLD 4: F1 macro - 0.8923331616210467, Accuracy - 0.895397489539749, Precision - 0.8923810536861165, Recall  - 0.8927757903990786
[[211  17   5   0   0]
 [ 25 189   7   4   0]
 [  2   6 200  10   3]
 [  0   1   9 188  15]
 [  0   0   0  21 282]]
average F1-macro: 0.8971442021682309
average Accuracy: 0.8997489539748955
average Precision: 0.8971464263756864
average Recall: 0.8975692714851837
average conf mat: [[205.6  21.6   5.2   0.4   0.2]
 [ 20.2 194.8   6.4   3.    0.6]
 [  2.2   5.4 203.6   6.4   3.4]
 [  0.2   1.6   8.4 188.8  14. ]
 [  1.    0.    1.2  18.4 282.4]]


In [None]:
# ViBERT4News
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, recall_score, precision_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

import torch

class BuildDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

results = []
results2 = []
results3 = []
results4 = []

confuses = []
fold = 0

model = AutoModelForSequenceClassification.from_pretrained("NlpHUST/vibert4news-base-cased", num_labels = 6)
tokenizer = AutoTokenizer.from_pretrained("NlpHUST/vibert4news-base-cased",use_fast=False)

kfold = StratifiedShuffleSplit(n_splits=5, test_size=0.2)

for train, test in kfold.split(X, y):
    X_train_fold = [t.lower() for t in X[train]]
    y_train_fold = y[train]

    # Keep the test set ratio
    X_test_fold = [t.lower() for t in X[test]]
    y_test_fold = y[test]

    train_encodings = tokenizer(X_train_fold, truncation=True, padding=True, max_length=50)
    test_encodings = tokenizer(X_test_fold, truncation=True, padding=True, max_length=50)

    train_dataset = BuildDataset(train_encodings, y_train_fold.tolist())
    test_dataset = BuildDataset(test_encodings, y_test_fold.tolist())

    training_args = TrainingArguments(
        output_dir="drive/MyDrive/CODE/SentProd/model/vibert4news-fold{}/".format(fold),          
        num_train_epochs=3,              
        per_device_train_batch_size=16,  
        per_device_eval_batch_size=16,   
        warmup_steps=500,                
        weight_decay=0.01,
        no_cuda=False,
        do_eval=False
    )

    trainer = Trainer(
        model=model,                         
        args=training_args,                  
        train_dataset=train_dataset,                  
    )

    trainer.train()
    trainer.save_model("drive/MyDrive/CODE/SentProd/model/vibert4news-fold{}/".format(fold))

    y_pred_classify = trainer.predict(test_dataset)

    y_pred = np.argmax(y_pred_classify.predictions, axis=-1)
    y_true = y_test_fold

    cf = confusion_matrix(y_true, y_pred)
    evaluation = f1_score(y_true, y_pred, average='macro')
    evaluation2 = accuracy_score(y_true, y_pred)
    evaluation3 = precision_score(y_true, y_pred, average='macro')
    evaluation4 = recall_score(y_true, y_pred, average='macro')

    print('===============================================')
    print("FOLD {}: F1 macro - {}, Accuracy - {}, Precision - {}, Recall  - {}".format(fold, evaluation, evaluation2, evaluation3, evaluation4))
    print(cf)
    confuses.append(cf)
    results.append(evaluation)
    results2.append(evaluation2)
    results3.append(evaluation3)
    results4.append(evaluation4)
    print('===============================================')


    fold = fold + 1

print("average F1-macro: {}".format(str(np.mean(results))))
print("average Accuracy: {}".format(str(np.mean(results2))))
print("average Precision: {}".format(str(np.mean(results3))))
print("average Recall: {}".format(str(np.mean(results4))))

np.set_printoptions(suppress=True)
print("average conf mat: {}".format(np.mean(confuses, axis=-3)))

Downloading:   0%|          | 0.00/551 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/537M [00:00<?, ?B/s]

Some weights of the model checkpoint at NlpHUST/vibert4news-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

Downloading:   0%|          | 0.00/411k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Step,Training Loss
500,0.8428


FOLD 0: F1 macro - 0.8544969437208542, Accuracy - 0.8577405857740585, Precision - 0.854170245639177, Recall  - 0.8549471783956702
[[200  29   3   1   0]
 [ 31 183   6   4   1]
 [  6   9 195   7   4]
 [  1   3  13 176  20]
 [  2   1   4  25 271]]


Step,Training Loss
500,0.126


FOLD 1: F1 macro - 0.8552938535986747, Accuracy - 0.8594142259414226, Precision - 0.857755573153194, Recall  - 0.8540856281978693
[[205  24   3   1   0]
 [ 39 176   8   1   1]
 [  3  10 192   8   8]
 [  1   2  12 173  25]
 [  3   0   2  17 281]]


Step,Training Loss
500,0.0661


FOLD 2: F1 macro - 0.8555225380050622, Accuracy - 0.8594142259414226, Precision - 0.8578818398872686, Recall  - 0.854167841912836
[[205  23   4   1   0]
 [ 33 184   6   2   0]
 [  4  14 186   9   8]
 [  2   3  10 172  26]
 [  3   1   2  17 280]]


Step,Training Loss
500,0.0599


FOLD 3: F1 macro - 0.8533189570163617, Accuracy - 0.8577405857740585, Precision - 0.8546583275319959, Recall  - 0.8526587524630855
[[201  25   6   1   0]
 [ 35 179   9   2   0]
 [  6  10 192   8   5]
 [  1   3  14 173  22]
 [  3   4   1  15 280]]


Step,Training Loss
500,0.0623


FOLD 4: F1 macro - 0.8528314857367401, Accuracy - 0.8560669456066946, Precision - 0.8542370141770295, Recall  - 0.8519499709266529
[[202  24   7   0   0]
 [ 38 177   5   4   1]
 [  4  12 191   4  10]
 [  0   6   8 177  22]
 [  2   2   3  20 276]]
average F1-macro: 0.8542927556155385
average Accuracy: 0.8580753138075312
average Precision: 0.855740600077733
average Recall: 0.8535618743792229
average conf mat: [[202.6  25.    4.6   0.8   0. ]
 [ 35.2 179.8   6.8   2.6   0.6]
 [  4.6  11.  191.2   7.2   7. ]
 [  1.    3.4  11.4 174.2  23. ]
 [  2.6   1.6   2.4  18.8 277.6]]


In [None]:
# VELECTRA
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, recall_score, precision_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

import torch

class BuildDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

results = []
results2 = []
results3 = []
results4 = []

confuses = []
fold = 0

model = AutoModelForSequenceClassification.from_pretrained("FPTAI/velectra-base-discriminator-cased", num_labels = 6)
tokenizer = AutoTokenizer.from_pretrained("FPTAI/velectra-base-discriminator-cased",use_fast=False)

kfold = StratifiedShuffleSplit(n_splits=5, test_size=0.2)

for train, test in kfold.split(X, y):
    X_train_fold = [t.lower() for t in X[train]]
    y_train_fold = y[train]

    # Keep the test set ratio
    X_test_fold = [t.lower() for t in X[test]]
    y_test_fold = y[test]

    train_encodings = tokenizer(X_train_fold, truncation=True, padding=True, max_length=50)
    test_encodings = tokenizer(X_test_fold, truncation=True, padding=True, max_length=50)

    train_dataset = BuildDataset(train_encodings, y_train_fold.tolist())
    test_dataset = BuildDataset(test_encodings, y_test_fold.tolist())

    training_args = TrainingArguments(
        output_dir="drive/MyDrive/CODE/SentProd/model/velectra-fold{}/".format(fold),          
        num_train_epochs=3,              
        per_device_train_batch_size=16,  
        per_device_eval_batch_size=16,   
        warmup_steps=500,                
        weight_decay=0.01,
        no_cuda=False,
        do_eval=False
    )

    trainer = Trainer(
        model=model,                         
        args=training_args,                  
        train_dataset=train_dataset,                  
    )

    trainer.train()
    trainer.save_model("drive/MyDrive/CODE/SentProd/model/velectra-fold{}/".format(fold))

    y_pred_classify = trainer.predict(test_dataset)

    y_pred = np.argmax(y_pred_classify.predictions, axis=-1)
    y_true = y_test_fold

    cf = confusion_matrix(y_true, y_pred)
    evaluation = f1_score(y_true, y_pred, average='macro')
    evaluation2 = accuracy_score(y_true, y_pred)
    evaluation3 = precision_score(y_true, y_pred, average='macro')
    evaluation4 = recall_score(y_true, y_pred, average='macro')

    print('===============================================')
    print("FOLD {}: F1 macro - {}, Accuracy - {}, Precision - {}, Recall  - {}".format(fold, evaluation, evaluation2, evaluation3, evaluation4))
    print(cf)
    confuses.append(cf)
    results.append(evaluation)
    results2.append(evaluation2)
    results3.append(evaluation3)
    results4.append(evaluation4)
    print('===============================================')


    fold = fold + 1

print("average F1-macro: {}".format(str(np.mean(results))))
print("average Accuracy: {}".format(str(np.mean(results2))))
print("average Precision: {}".format(str(np.mean(results3))))
print("average Recall: {}".format(str(np.mean(results4))))

np.set_printoptions(suppress=True)
print("average conf mat: {}".format(np.mean(confuses, axis=-3)))

Downloading:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of the model checkpoint at FPTAI/velectra-base-discriminator-cased were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at FPTAI/velectra-base-discriminator-cased and are newly initialized: ['

Downloading:   0%|          | 0.00/201k [00:00<?, ?B/s]

Step,Training Loss
500,1.0903


FOLD 0: F1 macro - 0.8142931748404756, Accuracy - 0.8184100418410042, Precision - 0.8183729180440338, Recall  - 0.8120763842861501
[[190  28   7   0   8]
 [ 43 164   6   5   7]
 [  6  10 185  13   7]
 [  1   2  10 165  35]
 [  4   1   6  18 274]]


Step,Training Loss
500,0.1755


FOLD 1: F1 macro - 0.8251771663947048, Accuracy - 0.8284518828451883, Precision - 0.8274492732646859, Recall  - 0.8235817642383992
[[189  29   7   0   8]
 [ 33 172   6   6   8]
 [  8   9 185  12   7]
 [  1   2  10 172  28]
 [  3   2   5  21 272]]


Step,Training Loss
500,0.1037


FOLD 2: F1 macro - 0.8216058358122101, Accuracy - 0.8251046025104602, Precision - 0.8233008626754232, Recall  - 0.8206942201179495
[[180  38   9   0   6]
 [ 30 172  11   6   6]
 [  5   9 189   9   9]
 [  0   2   8 174  29]
 [  1   0   5  26 271]]


Step,Training Loss
500,0.0765


FOLD 3: F1 macro - 0.8294139967395047, Accuracy - 0.8334728033472804, Precision - 0.8314844638895004, Recall  - 0.8291070581665272
[[197  19  11   0   6]
 [ 41 167  10   2   5]
 [  6   9 193   8   5]
 [  0   5  14 169  25]
 [  3   1   5  24 270]]


Step,Training Loss
500,0.0606


FOLD 4: F1 macro - 0.8136064133896379, Accuracy - 0.8175732217573222, Precision - 0.8155725273663765, Recall  - 0.812254081407876
[[182  35   9   1   6]
 [ 31 174   8   7   5]
 [  7  13 181  13   7]
 [  3   2   7 169  32]
 [  2   0   5  25 271]]
average F1-macro: 0.8208193174353067
average Accuracy: 0.8246025104602509
average Precision: 0.823236009048004
average Recall: 0.8195427016433804
average conf mat: [[187.6  29.8   8.6   0.2   6.8]
 [ 35.6 169.8   8.2   5.2   6.2]
 [  6.4  10.  186.6  11.    7. ]
 [  1.    2.6   9.8 169.8  29.8]
 [  2.6   0.8   5.2  22.8 271.6]]
