<a target="_blank" href="https://colab.research.google.com/github/shayongithub/vietnamese-mtl-model-for-sa-nli-tasks/blob/main/notebooks/Zero-shot%20Topic%20Classification.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch

if torch.cuda.is_available():
    device = torch.cuda.get_device_name(0)  # Get the name of the first GPU
    print(f"PyTorch is running on GPU: {device}")
else:
    print("PyTorch is running on CPU")


PyTorch is running on GPU: Tesla T4


In [3]:
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [4]:
!pip install transformers accelerate evaluate datasets huggingface_hub

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface_hub
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.6

In [None]:
!CUDA_LAUNCH_BLOCKING=1

## **Import dataset**

In [5]:
from datasets import load_dataset, load_from_disk

# Load the Vietnamese version of the XNLI dataset
# dataset = load_dataset('xnli', 'vi')

# ViNLI dataset
raw_datasets = load_from_disk("/content/drive/MyDrive/Graduation/MTL_Datasets/merged_vi_nli_ds")

# for name in dataset:
#     dataset[name] = dataset[name].rename_columns({'label': 'labels'})

train_dataset = raw_datasets['train']
validation_dataset = raw_datasets['validation']
# test_dataset = dataset['test']

In [None]:
train_dataset.features

{'labels': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None),
 'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None)}

## **Load tokenizer**

In [15]:
tokenized_train_dataset["sentence1"][0]

'các bác sĩ gửi túi thuốc do người nhà cung cấp tới viện pháp y quốc gia để phân tích thành phần'

In [6]:
from transformers import AutoTokenizer, DataCollatorWithPadding


tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base-v2', max_length=512)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize the data
def pre_process_and_tokenize(batch):
    return tokenizer(batch['sentence1'], batch['sentence2'], truncation=True, padding=True)


tokenized_train_dataset = train_dataset.map(pre_process_and_tokenize, batched=True)
tokenized_validation_dataset = validation_dataset.map(pre_process_and_tokenize, batched=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/20544 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/2289 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoModelForSequenceClassification

from transformers import set_seed

set_seed(42)

# Load the pre-trained model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained('vinai/phobert-base-v2', num_labels=3)

Downloading pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### **Gradual freezing**
Freeze the pretrained model to only train the new part of model that we have just initialized.

In [None]:
# for name, param in model.named_parameters():
#      if name.startswith("roberta"):
#         param.requires_grad = False
#         print(name)
#      else:
#        print("NO", name)

### **Compute metrics**

In [8]:
import numpy as np
from datasets import load_metric
import evaluate

accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1", average="macro")
precision_metric = evaluate.load("precision", average="macro")
recall_metric = evaluate.load("recall", average="macro")

def compute_metrics(eval_preds):

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    precision = precision_metric.compute(predictions=predictions, references=labels, average="macro")["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels, average="macro")["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")["f1"]

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [None]:
train_ind_rans = random.sample(
    range(len(raw_datasets["train"])), 100
)

tokenized_train_dataset = tokenized_train_dataset.select(train_ind_rans)


In [17]:
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback, IntervalStrategy
import random


training_args = TrainingArguments(
    # do_eval=True,
    output_dir="./runs",
    #   evaluation_strategy = IntervalStrategy.STEPS,
    per_device_train_batch_size=16,
    #   per_device_eval_batch_size=16,
    #   eval_steps = 1,
    save_steps=3000,
    logging_steps=3000,
    learning_rate=5e-5,
    fp16=True,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy=IntervalStrategy.EPOCH,
    load_best_model_at_end=False,
    metric_for_best_model="f1",
    optim="adamw_torch",
    # resume_from_checkpoint=True,
    remove_unused_columns=True,
    # push_to_hub=True
)

trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train_dataset,
   eval_dataset=tokenized_validation_dataset,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
  #  callbacks = [EarlyStoppingCallback(early_stopping_patience=10)],
)

In [18]:
trainer.train()

# trainer.save_model("./drive/MyDrive/Shay/models/phobert_v2_zsl_topic_identification_model_50_epochs_512")

Step,Training Loss


TrainOutput(global_step=21, training_loss=1.0933983212425595, metrics={'train_runtime': 38.2081, 'train_samples_per_second': 7.852, 'train_steps_per_second': 0.55, 'total_flos': 21772635316992.0, 'train_loss': 1.0933983212425595, 'epoch': 3.0})

In [None]:
trainer.evaluate()

{'eval_loss': 1.0956807136535645,
 'eval_accuracy': 0.7129750982961992,
 'eval_precision': 0.7129925981824045,
 'eval_recall': 0.7129326942240328,
 'eval_f1': 0.7127720612394036,
 'eval_runtime': 9.5966,
 'eval_samples_per_second': 238.521,
 'eval_steps_per_second': 29.906,
 'epoch': 20.0}

In [None]:
tokenized_test = test_dataset.map(pre_process_and_tokenize, batched=True)
trainer.predict(tokenized_test)

Map:   0%|          | 0/5010 [00:00<?, ? examples/s]

PredictionOutput(predictions=array([[-0.39916992, -1.2099609 ,  1.2988281 ],
       [ 0.40966797,  0.12084961, -0.39013672],
       [ 0.2097168 , -0.23120117,  0.24572754],
       ...,
       [ 0.6616211 ,  0.13989258, -0.6669922 ],
       [ 0.28735352, -0.15722656,  0.07263184],
       [ 0.5864258 , -0.78027344,  0.30810547]], dtype=float32), label_ids=array([2, 0, 1, ..., 1, 2, 0]), metrics={'test_loss': 0.827395498752594, 'test_accuracy': 0.6712574850299401, 'test_precision': 0.6737762821032501, 'test_recall': 0.6712574850299401, 'test_f1': 0.6710105833466993, 'test_runtime': 8.8889, 'test_samples_per_second': 563.626, 'test_steps_per_second': 17.663})

In [None]:
import torch
from transformers import AutoTokenizer,AutoModelForSequenceClassification
from transformers import pipeline


tokenizer = AutoTokenizer.from_pretrained('Fsoft-AIC/videberta-xsmall')
classifier = pipeline(model="./drive/MyDrive/Shay/models/pho_bert_zsl_topic_classification_model",
                      task="zero-shot-classification",
                      tokenizer=tokenizer,
                      label2id={'entailment': 0, 'neutral': 1, 'contradiction': 2})


premise="Tôi không thích đi du lịch vòng quanh thế giới lắm"
candidate_labels = ['du lịch', 'nấu ăn', 'nhảy múa']

classifier(premise, candidate_labels)

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'sequence': 'Tôi không thích đi du lịch vòng quanh thế giới lắm',
 'labels': ['du lịch', 'nhảy múa', 'nấu ăn'],
 'scores': [0.39267435669898987, 0.3159923553466797, 0.29133331775665283]}

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m64.3 MB/s[0m eta [36m0:00:0

In [None]:
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

premise="nhiệt tình giảng dạy gần gũi với sinh viên"
candidate_labels = ['du lịch', 'nấu ăn', 'nhảy múa', 'giáo dục']

classifier(premise, candidate_labels)

{'sequence': 'nhiệt tình giảng dạy gần gũi với sinh viên',
 'labels': ['giáo dục', 'nấu ăn', 'nhảy múa', 'du lịch'],
 'scores': [0.36039724946022034,
  0.26783210039138794,
  0.2303902506828308,
  0.14138038456439972]}