In [None]:
# Mount Google Drive under /content/drive; the last cell of the notebook copies
# the training output directories into it.
from google.colab import drive
drive.mount('/content/drive')

MessageError: ignored

In [None]:
!pip install transformers transformers[torch] accelerate evaluate
!pip install datasets huggingface_hub
!apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 15 not upgraded.


In [None]:
# Check that PyTorch can see a CUDA device before starting any training.
import torch
torch.cuda.is_available()

True

In [None]:
from datasets import load_dataset

# Load the "uit-nlp/vietnamese_students_feedback" dataset; the result is displayed
# in the next cell.
dataset = load_dataset("uit-nlp/vietnamese_students_feedback")



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Display the loaded dataset object (splits, feature names, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'sentiment', 'topic'],
        num_rows: 11426
    })
    validation: Dataset({
        features: ['sentence', 'sentiment', 'topic'],
        num_rows: 1583
    })
    test: Dataset({
        features: ['sentence', 'sentiment', 'topic'],
        num_rows: 3166
    })
})

In [None]:
# Show the first example of the train and test splits.
for split_name in ("train", "test"):
    print(dataset[split_name][0])

{'sentence': 'slide giáo trình đầy đủ .', 'sentiment': 2, 'topic': 1}
{'sentence': 'nói tiếng anh lưu loát .', 'sentiment': 2, 'topic': 0}


## **Preview Dataset**

In [None]:
import pandas as pd

# One pandas DataFrame per split, used below for quick class-balance checks.
df_train, df_val, df_test = (
    pd.DataFrame(dataset[split_name])
    for split_name in ("train", "validation", "test")
)

df_train.head()

Unnamed: 0,sentence,sentiment,topic
0,slide giáo trình đầy đủ .,2,1
1,"nhiệt tình giảng dạy , gần gũi với sinh viên .",2,0
2,đi học đầy đủ full điểm chuyên cần .,0,1
3,chưa áp dụng công nghệ thông tin và các thiết ...,0,0
4,"thầy giảng bài hay , có nhiều bài tập ví dụ ng...",2,0


In [None]:
# Number of training rows per sentiment id.
df_train["sentiment"].value_counts()

2    5643
0    5325
1     458
Name: sentiment, dtype: int64

In [None]:
# Number of validation rows per sentiment id.
df_val["sentiment"].value_counts()

2    805
0    705
1     73
Name: sentiment, dtype: int64

In [None]:
# Number of test rows per sentiment id.
df_test["sentiment"].value_counts()

2    1590
0    1409
1     167
Name: sentiment, dtype: int64

## **Modify datasets**

Change the dataset so that it contains only 2 classes: negative and positive.

In [None]:
import pandas as pd

# Derive the two-class view without rebinding `dataset`: the original cell
# overwrote `dataset` with the topic-less version, so running it a second time
# failed when it tried to remove the already-missing "topic" column.
# Rows with sentiment id 1 are dropped; ids 0 and 2 are the two classes kept.
dataset_no_neutral = (
    dataset
    .remove_columns("topic")
    .filter(lambda row: row["sentiment"] != 1)
)

# Sanity check: only ids 0 and 2 should remain in the training split.
series = pd.Series(dataset_no_neutral["train"]["sentiment"])
series.value_counts()



2    5643
0    5325
dtype: int64

In [None]:
from datasets import ClassLabel, Value

# Re-declare `sentiment` as a two-class ClassLabel (negative / positive) on every split.
# NOTE(review): at this point the stored ids are still {0, 2}; they are only
# remapped to {0, 1} in a later cell. Confirm that the installed `datasets`
# version accepts casting id 2 into a 2-class ClassLabel.
new_features = dataset_no_neutral["train"].features.copy()
new_features["sentiment"] = ClassLabel(names=["negative", "positive"])

for split_name, split in list(dataset_no_neutral.items()):
    dataset_no_neutral[split_name] = split.cast(new_features)

# Print the resulting schema of each split.
for split_name in ("train", "validation", "test"):
    print(dataset_no_neutral[split_name].features)



{'sentence': Value(dtype='string', id=None), 'sentiment': ClassLabel(names=['negative', 'positive'], id=None)}
{'sentence': Value(dtype='string', id=None), 'sentiment': ClassLabel(names=['negative', 'positive'], id=None)}
{'sentence': Value(dtype='string', id=None), 'sentiment': ClassLabel(names=['negative', 'positive'], id=None)}


In [None]:
def modify_index_class(row):
    """Remap the sentiment id of a single example to a binary class id.

    Args:
        row: mapping with a "sentiment" key.

    Returns:
        The same ``row`` object, with ``row["sentiment"]`` set to 1 when it
        was 2 and to 0 for any other value.
    """
    # Deliberately no per-row print: this function runs once per example under
    # Dataset.map, so a debug print would emit thousands of lines.
    row["sentiment"] = 1 if row["sentiment"] == 2 else 0
    return row

# Apply the 2 -> 1 remap to every example of every split.
updated_dataset = dataset_no_neutral.map(modify_index_class)

# Class balance of the training split after remapping.
train_sentiment = updated_dataset["train"]["sentiment"]
series = pd.Series(train_sentiment)
series.value_counts()



1    5643
0    5325
dtype: int64

In [None]:
# Rename the target column to `labels` on every split.
# (presumably because Trainer expects the target under that name — confirm)
# The guard makes the cell safe to re-run: a split whose column has already
# been renamed is left untouched instead of raising on the missing column.
for name in updated_dataset:
    if 'sentiment' in updated_dataset[name].column_names:
        updated_dataset[name] = updated_dataset[name].rename_columns({'sentiment': 'labels'})

updated_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 10968
    })
    validation: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 1510
    })
    test: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 2999
    })
})

## **Tokenize the data**

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the checkpoint that is fine-tuned below ("Fsoft-AIC/videberta-xsmall").
tokenizer = AutoTokenizer.from_pretrained("Fsoft-AIC/videberta-xsmall")

In [None]:
# Display the tokenizer configuration (class, vocab size, special tokens, ...).
tokenizer

DebertaV2TokenizerFast(name_or_path='Fsoft-AIC/videberta-xsmall', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [None]:
def preprocess_function(row, max_length=512):
    """Tokenize the "sentence" field of an example or batch of examples.

    Args:
        row: mapping with a "sentence" key (a batch when used with
            ``Dataset.map(..., batched=True)``).
        max_length: upper bound on tokens per sequence. The tokenizer reports
            no usable ``model_max_length``, so ``truncation=True`` alone falls
            back to no truncation; an explicit bound makes it effective.
            # assumes the checkpoint supports 512 positions — TODO confirm against its config

    Returns:
        The tokenizer output for the given sentences. No padding is applied here.
    """
    return tokenizer(row["sentence"], truncation=True, max_length=max_length)

# Tokenize the train and validation splits in batches.
tokenized_train, tokenized_val = (
    updated_dataset[split_name].map(preprocess_function, batched=True)
    for split_name in ('train', 'validation')
)



## **Building VideBerta xsmall model**

In [None]:
# Interactive Hugging Face Hub login; the notebook pushes the trained model to
# the Hub further down.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import DataCollatorWithPadding

# Padding is left to the collator: the tokenization step above only truncates.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import AutoModelForSequenceClassification

# Class ids: 0 = negative, 1 = positive (hence num_labels=2).
model = AutoModelForSequenceClassification.from_pretrained("Fsoft-AIC/videberta-xsmall", num_labels=2)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at Fsoft-AIC/videberta-xsmall and are newly initialized: ['classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import numpy as np
from datasets import load_metric
import evaluate


def _get_classification_metrics():
    """Return the combined accuracy/precision/recall/F1 evaluator, loading it on first use only."""
    combined = getattr(_get_classification_metrics, "_combined", None)
    if combined is None:
        combined = evaluate.combine([
            evaluate.load("accuracy"),
            evaluate.load("precision"),
            evaluate.load("recall"),
            evaluate.load("f1"),
        ])
        # Cache on the function object so later evaluations reuse the same modules.
        _get_classification_metrics._combined = combined
    return combined


def compute_metrics(eval_preds):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_preds: pair ``(logits, labels)``; the predicted class of each row
            is the argmax of ``logits`` over the last axis.

    Returns:
        The result of the combined evaluator's ``compute`` call for the
        predicted class ids against ``labels``.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # The metric modules are loaded once and reused; the original reloaded all
    # four on every evaluation step.
    return _get_classification_metrics().compute(predictions=predictions, references=labels)

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback, IntervalStrategy

repo_name = "videberta-sentiment-analysis"

# Evaluate and log every 100 steps, checkpoint every 200 steps, and reload the
# best checkpoint (selected on the 'f1' metric) when training finishes.
training_args = TrainingArguments(
    output_dir=repo_name,
    push_to_hub=False,
    # --- evaluation / logging / checkpoint schedule ---
    evaluation_strategy=IntervalStrategy.STEPS,
    eval_steps=100,
    logging_steps=100,
    save_strategy=IntervalStrategy.STEPS,
    save_steps=200,
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=100,
    # --- model selection ---
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)

# With early stopping attached, num_train_epochs acts as an upper bound only.
early_stopping = EarlyStoppingCallback(early_stopping_patience=10)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,0.6152,0.477745,0.800662,0.857955,0.750311,0.80053
200,0.408,0.324123,0.866887,0.894256,0.850932,0.872056
300,0.3268,0.272576,0.895364,0.883749,0.925466,0.904126
400,0.2654,0.229634,0.919868,0.921182,0.929193,0.92517
500,0.253,0.208813,0.915894,0.920596,0.921739,0.921167
600,0.2014,0.231839,0.917219,0.902844,0.946584,0.924196
700,0.1939,0.213131,0.921192,0.922414,0.930435,0.926407
800,0.1698,0.200456,0.931126,0.949936,0.919255,0.934343
900,0.1822,0.224877,0.924503,0.908876,0.954037,0.930909
1000,0.1441,0.203825,0.931126,0.931119,0.940373,0.935723


TrainOutput(global_step=4000, training_loss=0.1243566644191742, metrics={'train_runtime': 743.45, 'train_samples_per_second': 1475.284, 'train_steps_per_second': 23.135, 'total_flos': 2089749285500352.0, 'train_loss': 0.1243566644191742, 'epoch': 23.26})

In [None]:
# Evaluate on the validation split that was passed to the Trainer as `eval_dataset`.
trainer.evaluate()

{'eval_loss': 0.27870050072669983,
 'eval_accuracy': 0.9470198675496688,
 'eval_precision': 0.9480840543881335,
 'eval_recall': 0.9527950310559006,
 'eval_f1': 0.9504337050805451,
 'eval_runtime': 5.5266,
 'eval_samples_per_second': 273.223,
 'eval_steps_per_second': 4.343,
 'epoch': 23.26}

In [None]:
# Inspect the test split before tokenizing it for prediction.
updated_dataset["test"]

Dataset({
    features: ['sentence', 'sentiment'],
    num_rows: 2999
})

In [None]:
# The target column of every split (test included) is renamed to `labels` in the
# earlier rename cell, so the test split only needs tokenizing before prediction.
tokenized_test = updated_dataset["test"].map(preprocess_function, batched=True)
trainer.predict(tokenized_test)

Map:   0%|          | 0/2999 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


PredictionOutput(predictions=array([[-3.1065803,  3.1311731],
       [-3.4922683,  3.724635 ],
       [-3.5479822,  3.7160752],
       ...,
       [-3.5284057,  3.7751057],
       [-3.5145226,  3.695965 ],
       [ 1.9687545, -1.9111937]], dtype=float32), label_ids=array([1, 1, 1, ..., 1, 1, 0]), metrics={'test_loss': 0.3060229420661926, 'test_accuracy': 0.934644881627209, 'test_precision': 0.9496774193548387, 'test_recall': 0.9257861635220126, 'test_f1': 0.9375796178343949, 'test_runtime': 6.4053, 'test_samples_per_second': 468.205, 'test_steps_per_second': 7.338})

In [None]:
# Upload this Trainer's model and training artifacts to the Hugging Face Hub.
trainer.push_to_hub()

/content/videberta-sentiment-analysis is already a clone of https://huggingface.co/shayonhuggingface/videberta-sentiment-analysis. Make sure you pull the latest changes with `repo.git_pull()`.
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 1.00/270M [00:00<?, ?B/s]

Upload file runs/Jul19_15-59-33_89a4315a1f0e/events.out.tfevents.1689782373.89a4315a1f0e.401.16:   0%|        …

Upload file runs/Jul19_15-59-21_89a4315a1f0e/events.out.tfevents.1689782364.89a4315a1f0e.401.15:   0%|        …

Upload file runs/Jul19_16-21-42_89a4315a1f0e/events.out.tfevents.1689783703.89a4315a1f0e.401.18:   0%|        …

Upload file runs/Jul19_15-56-54_89a4315a1f0e/events.out.tfevents.1689782217.89a4315a1f0e.401.13:   0%|        …

Upload file runs/Jul19_15-57-36_89a4315a1f0e/events.out.tfevents.1689782259.89a4315a1f0e.401.14:   0%|        …

Upload file training_args.bin:   0%|          | 1.00/3.87k [00:00<?, ?B/s]

Upload file runs/Jul19_15-59-33_89a4315a1f0e/events.out.tfevents.1689783552.89a4315a1f0e.401.17:   0%|        …

Upload file runs/Jul19_16-21-42_89a4315a1f0e/events.out.tfevents.1689784772.89a4315a1f0e.401.19:   0%|        …

To https://huggingface.co/shayonhuggingface/videberta-sentiment-analysis
   ec83944..d3d90ae  main -> main

   ec83944..d3d90ae  main -> main

To https://huggingface.co/shayonhuggingface/videberta-sentiment-analysis
   d3d90ae..c7a80c9  main -> main

   d3d90ae..c7a80c9  main -> main



'https://huggingface.co/shayonhuggingface/videberta-sentiment-analysis/commit/d3d90aefc26574b0deba8324d4e17d886450b10e'

In [None]:
from transformers import pipeline

# Load the fine-tuned model back from the Hub and classify one sentence typed by the user.
sentiment_model = pipeline(model="shayonhuggingface/videberta-sentiment-analysis")
text = input("Nhập vào 1 câu: ")
prediction = sentiment_model(text)

# The pipeline reports generic label names: LABEL_0 is negative, anything else positive.
label = 'negative' if prediction[0]['label'] == 'LABEL_0' else 'positive'

print(f"Prediction: {label} - Conf: {prediction[0]['score']}")

Nhập vào 1 câu: Tao rất thích buổi học ngày hôm nay hic hic
Prediction: positive - Conf: 0.7568884491920471


## **Experiment with PhoBertv2 and VideBerta-base**

### **VideBerta-base**

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback, IntervalStrategy


videberta_base = AutoModelForSequenceClassification.from_pretrained("Fsoft-AIC/videberta-base", num_labels=2)
videberta_base_tokenizer = AutoTokenizer.from_pretrained("Fsoft-AIC/videberta-base")
videberta_base_data_collator = DataCollatorWithPadding(tokenizer=videberta_base_tokenizer)


def _videberta_base_tokenize(batch, max_length=512):
    """Tokenize the "sentence" field of a batch with the VideBERTa-base tokenizer.

    `truncation=True` alone is a no-op for this tokenizer (it reports no usable
    maximum length), so an explicit bound is passed.
    # assumes the checkpoint supports 512 positions — TODO confirm against its config
    """
    return videberta_base_tokenizer(batch["sentence"], truncation=True, max_length=max_length)


# One shared helper replaces the two identical tokenization lambdas.
videberta_base_tokenized_train = updated_dataset['train'].map(_videberta_base_tokenize, batched=True)
videberta_base_tokenized_val = updated_dataset['validation'].map(_videberta_base_tokenize, batched=True)


# Same schedule as the xsmall run: evaluate/log every 100 steps, checkpoint
# every 200, reload the best checkpoint (selected on 'f1') at the end.
videberta_base_training_args = TrainingArguments(
    output_dir="videberta-base-sentiment-analysis",
    evaluation_strategy=IntervalStrategy.STEPS,
    eval_steps=100,
    save_steps=200,
    logging_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=100,
    weight_decay=0.01,
    save_strategy=IntervalStrategy.STEPS,
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)

videberta_base_trainer = Trainer(
    model=videberta_base,
    args=videberta_base_training_args,
    train_dataset=videberta_base_tokenized_train,
    eval_dataset=videberta_base_tokenized_val,
    tokenizer=videberta_base_tokenizer,
    data_collator=videberta_base_data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
)

videberta_base_trainer.train()

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at Fsoft-AIC/videberta-base and are newly initialized: ['classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1510 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,0.6909,0.666828,0.588742,0.604545,0.66087,0.631454
200,0.6877,0.670843,0.570861,0.675615,0.375155,0.482428
300,0.6651,0.62367,0.652318,0.753623,0.51677,0.613117
400,0.6079,0.528623,0.729801,0.789781,0.67205,0.726174
500,0.5773,0.504465,0.733775,0.787447,0.685714,0.733068
600,0.5403,0.440277,0.811258,0.825,0.819876,0.82243
700,0.4152,0.396902,0.843709,0.90701,0.787578,0.843085
800,0.3283,0.299228,0.886093,0.919205,0.862112,0.889744
900,0.2767,0.26142,0.901325,0.917303,0.895652,0.906348
1000,0.244,0.247159,0.907947,0.940476,0.88323,0.910955


TrainOutput(global_step=3800, training_loss=0.20111082108397232, metrics={'train_runtime': 1218.7555, 'train_samples_per_second': 899.934, 'train_steps_per_second': 14.113, 'total_flos': 7923912424659072.0, 'train_loss': 0.20111082108397232, 'epoch': 22.09})

In [None]:
# Evaluate VideBERTa-base on the validation split given to its Trainer.
videberta_base_trainer.evaluate()

{'eval_loss': 0.2778867781162262,
 'eval_accuracy': 0.9463576158940398,
 'eval_precision': 0.948019801980198,
 'eval_recall': 0.9515527950310559,
 'eval_f1': 0.949783013019219,
 'eval_runtime': 6.4118,
 'eval_samples_per_second': 235.504,
 'eval_steps_per_second': 3.743,
 'epoch': 22.09}

In [None]:
# Save the fine-tuned VideBERTa-base model locally; the inference cell loads it from this path.
videberta_base_trainer.save_model("./videberta_base_sentiment_analysis_model")

### **PhoBertv2**

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback, IntervalStrategy


pho_bert_v2 = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base-v2", num_labels=2)
pho_bert_v2_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
pho_bert_v2_data_collator = DataCollatorWithPadding(tokenizer=pho_bert_v2_tokenizer)


def _pho_bert_v2_tokenize(batch, max_length=256):
    """Tokenize the "sentence" field of a batch with the PhoBERT tokenizer.

    `truncation=True` alone is a no-op for this tokenizer (it reports no usable
    maximum length). The explicit bound of 256 tokens follows the PhoBERT model
    card, so an over-long sentence is truncated rather than overrunning the
    model's position embeddings.
    # NOTE(review): the PhoBERT model card also recommends word-segmented input;
    # raw sentences are used here — confirm whether segmentation should be added.
    """
    return pho_bert_v2_tokenizer(batch["sentence"], truncation=True, max_length=max_length)


# One shared helper replaces the two identical tokenization lambdas.
pho_bert_v2_tokenized_train = updated_dataset['train'].map(_pho_bert_v2_tokenize, batched=True)
pho_bert_v2_tokenized_val = updated_dataset['validation'].map(_pho_bert_v2_tokenize, batched=True)

# Same schedule as the other runs: evaluate/log every 100 steps, checkpoint
# every 200, reload the best checkpoint (selected on 'f1') at the end.
pho_bert_v2_training_args = TrainingArguments(
    output_dir="pho-bert-v2-sentiment-analysis",
    evaluation_strategy=IntervalStrategy.STEPS,
    eval_steps=100,
    save_steps=200,
    logging_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=100,
    weight_decay=0.01,
    save_strategy=IntervalStrategy.STEPS,
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)

pho_bert_v2_trainer = Trainer(
    model=pho_bert_v2,
    args=pho_bert_v2_training_args,
    train_dataset=pho_bert_v2_tokenized_train,
    eval_dataset=pho_bert_v2_tokenized_val,
    tokenizer=pho_bert_v2_tokenizer,
    data_collator=pho_bert_v2_data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
)

pho_bert_v2_trainer.train()

Downloading (…)lve/main/config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/10968 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1510 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,0.2527,0.114104,0.970199,0.979798,0.963975,0.971822
200,0.1273,0.106487,0.972848,0.973945,0.975155,0.97455
300,0.0976,0.114565,0.972848,0.972772,0.976398,0.974582
400,0.0821,0.119256,0.974172,0.975186,0.976398,0.975791
500,0.071,0.120118,0.970199,0.96683,0.97764,0.972205
600,0.0571,0.123662,0.970861,0.988447,0.956522,0.972222
700,0.0523,0.136,0.970861,0.981037,0.963975,0.972431
800,0.0425,0.125071,0.970861,0.981037,0.963975,0.972431
900,0.0447,0.136959,0.971523,0.984733,0.961491,0.972973
1000,0.0378,0.13277,0.972185,0.983523,0.963975,0.973651


TrainOutput(global_step=1400, training_loss=0.06968579939433507, metrics={'train_runtime': 326.1509, 'train_samples_per_second': 3362.86, 'train_steps_per_second': 52.736, 'total_flos': 2706105426598080.0, 'train_loss': 0.06968579939433507, 'epoch': 8.14})

In [None]:
# Validation F1 of the VideBERTa-xsmall trainer (this re-runs evaluation).
print(f"VideBerta XSmall F1-Score: {trainer.evaluate()['eval_f1']}")

VideBerta XSmall F1-Score: 0.9479553903345725


In [None]:
# Side-by-side validation F1 of the three fine-tuned models; each line re-runs
# evaluation on the corresponding trainer.
print(f"VideBerta XSmall F1-Score: {trainer.evaluate()['eval_f1']}")
print(f"VideBerta Base F1-Score: {videberta_base_trainer.evaluate()['eval_f1']}")
print(f"PhoBertV2 F1-Score: {pho_bert_v2_trainer.evaluate()['eval_f1']}")

VideBerta XSmall F1-Score: 0.9479553903345725


VideBerta Base F1-Score: 0.949783013019219


PhoBertV2 F1-Score: 0.9757914338919925


In [None]:
# Save the fine-tuned PhoBERT model locally; the inference cell loads it from this path.
pho_bert_v2_trainer.save_model("./pho_bert_v2_sentiment_analysis_model")

## **Inference**

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer

# Tokenizers for the two locally saved models.
pho_bert_v2_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
videberta_base_tokenizer = AutoTokenizer.from_pretrained("Fsoft-AIC/videberta-base")

# xsmall comes from the Hub; the other two from the local save directories.
videberta_xsmall = pipeline(model="shayonhuggingface/videberta-sentiment-analysis")

videberta_base_model = pipeline(
    task="sentiment-analysis",
    model="./videberta_base_sentiment_analysis_model",
    tokenizer=videberta_base_tokenizer,
)

phobert_v2_model = pipeline(
    task="sentiment-analysis",
    model="./pho_bert_v2_sentiment_analysis_model",
    tokenizer=pho_bert_v2_tokenizer,
)

# Run the same user-supplied sentence through all three models.
text = input("Nhập vào 1 câu: ")
print('\nOutput:')
predictions = {
    'VideBerta XSmall': videberta_xsmall(text),
    'VideBerta Base': videberta_base_model(text),
    'PhoBertV2': phobert_v2_model(text),
}

for name, prediction in predictions.items():
    # The pipelines report generic label names: LABEL_0 is negative, anything else positive.
    label = 'negative' if prediction[0]['label'] == 'LABEL_0' else 'positive'
    print(f"{name} prediction: {label} - Conf: {prediction[0]['score']}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Nhập vào 1 câu: Tôi thật sự không thích lớp học của ông ấy nhưng vì có crush nên tôi thích đi học mỗi ngày 

Output:
VideBerta XSmall prediction: negative - Conf: 0.9883217215538025
VideBerta Base prediction: negative - Conf: 0.9954942464828491
PhoBertV2 prediction: positive - Conf: 0.9069267511367798


In [None]:
!cp -r pho-bert-v2-sentiment-analysis videberta-base-sentiment-analysis videberta-sentiment-analysis ./drive/MyDrive/Shay/checkpoints