In [1]:
# Mount Google Drive at /content/drive (Colab-only helper). Later cells save
# the fine-tuned model into a folder under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import torch

# Report which kind of hardware PyTorch will use for this session.
if not torch.cuda.is_available():
    print("PyTorch is running on CPU")
else:
    # `device` holds the human-readable name of GPU index 0, not a torch.device.
    device = torch.cuda.get_device_name(0)
    print(f"PyTorch is running on GPU: {device}")


PyTorch is running on GPU: Tesla T4


In [3]:
import os

# Set CUDA_LAUNCH_BLOCKING=1 for this kernel process.
# NOTE(review): presumably a debugging aid (synchronous CUDA kernel launches so
# errors surface at the failing call); it typically slows training — confirm it
# is still wanted. An earlier cell already queries the GPU, so verify that the
# variable is set early enough to take effect.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [4]:
!pip install transformers accelerate evaluate datasets huggingface_hub



In [5]:
!CUDA_LAUNCH_BLOCKING=1

## **Import dataset**

In [6]:
from datasets import load_dataset

# Fetch the Vietnamese ('vi') configuration of XNLI.
dataset = load_dataset('xnli', 'vi')

# In every split, rename the target column from 'label' to 'labels'.
for split_name in list(dataset.keys()):
    dataset[split_name] = dataset[split_name].rename_columns({'label': 'labels'})

# Bind each split to its own name for the cells below.
train_dataset, validation_dataset, test_dataset = (
    dataset[split] for split in ('train', 'validation', 'test')
)



  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
train_dataset.features

{'premise': Value(dtype='string', id=None),
 'hypothesis': Value(dtype='string', id=None),
 'labels': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)}

## **Load tokenizer**

In [8]:
from transformers import AutoTokenizer, DataCollatorWithPadding


# Tokenizer belonging to the 'vinai/phobert-base-v2' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base-v2')
# Collator built on that tokenizer; it is handed to the Trainer further down
# to pad the examples of each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# Tokenize the data
def pre_process_and_tokenize(batch):
    """Tokenize a batch of premise/hypothesis pairs as sentence pairs.

    Args:
        batch: Mapping with 'premise' and 'hypothesis' entries, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the pairs, with truncation enabled.

    Padding is intentionally not applied here. Padding inside a batched ``map``
    pads every example to the longest sequence of its map batch and stores the
    pad tokens in the dataset; the ``DataCollatorWithPadding`` passed to the
    Trainer already pads each training/evaluation batch to its own longest
    sequence, which is cheaper.

    NOTE(review): despite the name, no text pre-processing happens here. The
    PhoBERT model card asks for word-segmented input — confirm whether a
    segmentation step should be added.
    """
    return tokenizer(batch['premise'], batch['hypothesis'], truncation=True)


# Run the tokenization function over the train and validation splits, batch-wise.
tokenized_train_dataset, tokenized_validation_dataset = (
    split.map(pre_process_and_tokenize, batched=True)
    for split in (train_dataset, validation_dataset)
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
tokenized_train_dataset.features

{'premise': Value(dtype='string', id=None),
 'hypothesis': Value(dtype='string', id=None),
 'labels': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [10]:
from transformers import AutoModelForSequenceClassification

from transformers import set_seed

# Seed the random number generators before the model is created, so the newly
# initialized classification head is reproducible.
set_seed(42)

# Class names in class-id order, read from the dataset's ClassLabel feature.
label_names = train_dataset.features['labels'].names

# Load the pre-trained encoder with a fresh classification head.
# The label names are stored in the model config so that saved checkpoints
# report 'entailment' / 'neutral' / 'contradiction' instead of generic LABEL_n.
# This matters for zero-shot use: the Transformers zero-shot-classification
# pipeline finds the entailment class by looking for a label that starts with
# "entail" in config.label2id and otherwise falls back to the last class.
model = AutoModelForSequenceClassification.from_pretrained(
    'vinai/phobert-base-v2',
    num_labels=len(label_names),
    id2label=dict(enumerate(label_names)),
    label2id={label: idx for idx, label in enumerate(label_names)},
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## **Gradual freezing**
Freeze the pretrained encoder so that only the newly initialized part of the model is trained. (The code in the next cell is commented out, so this step is currently disabled.)

In [11]:
# Disabled experiment: when un-commented, every parameter whose name starts with
# "roberta" gets requires_grad = False and its name is printed; all other
# parameters are left unchanged and printed with a "NO" prefix.
# for name, param in model.named_parameters():
#      if name.startswith("roberta"):
#         param.requires_grad = False
#         print(name)
#      else:
#        print("NO", name)

### **Compute metrics**

In [12]:
import numpy as np
from datasets import load_metric
import evaluate

# Load the evaluation modules once; compute_metrics reuses them on every call.
# The averaging mode is not configured here: it is a compute-time option and
# is passed explicitly (average="macro") on each compute() call.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(eval_preds):
    """Turn the Trainer's evaluation output into a dict of scores.

    Args:
        eval_preds: A ``(logits, labels)`` pair.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1";
        precision, recall and F1 are computed with ``average="macro"``.
    """
    logits, labels = eval_preds
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(logits, axis=-1)
    shared = dict(predictions=predicted_ids, references=labels)

    scores = {"accuracy": accuracy_metric.compute(**shared)["accuracy"]}
    macro_metrics = (
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_metric),
    )
    for key, metric in macro_metrics:
        scores[key] = metric.compute(**shared, average="macro")[key]
    return scores

In [13]:
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback, IntervalStrategy


# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./zero_shot_topic_classification',
    # Evaluate and log every 500 steps; checkpoint every 2000 steps
    # (a multiple of eval_steps, so every checkpoint has evaluation metrics).
    evaluation_strategy=IntervalStrategy.STEPS,
    eval_steps=500,
    logging_steps=500,
    save_strategy=IntervalStrategy.STEPS,
    save_steps=2000,
    # Keep at most two checkpoints on disk. Without a limit, every checkpoint of
    # the 3-epoch run is kept and can fill the Colab disk. With
    # load_best_model_at_end=True the best checkpoint is always retained.
    save_total_limit=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
    label_smoothing_factor=0.1,
    num_train_epochs=3,
    fp16=True,
    optim="adamw_torch",
    push_to_hub=False,
    # Reload the checkpoint with the best 'f1' (as reported by compute_metrics)
    # once training ends; EarlyStoppingCallback relies on these settings as well.
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)

# Early stopping with a patience of 10 evaluations on the monitored metric.
early_stopping = EarlyStoppingCallback(early_stopping_patience=10)

# Wire model, data, collator, metrics and callbacks into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Fine-tune the model with the configuration defined above.
trainer.train()

# Save the resulting model to Google Drive right away, in the same cell, so it
# is not lost if the Colab session ends. The absolute mount path is used so the
# target does not depend on the notebook's current working directory.
trainer.save_model("/content/drive/MyDrive/pho_bert_base_v2_zsl_topic_classification_model")

Step,Training Loss,Validation Loss


In [None]:
# Evaluate on the eval_dataset given to the Trainer (the tokenized validation
# split); the returned metrics dict is displayed as the cell output.
trainer.evaluate()

{'eval_loss': 0.7267589569091797,
 'eval_accuracy': 0.7670682730923695,
 'eval_precision': 0.7725019523020148,
 'eval_recall': 0.7670682730923696,
 'eval_f1': 0.7670544006100658,
 'eval_runtime': 7.8953,
 'eval_samples_per_second': 315.376,
 'eval_steps_per_second': 4.94,
 'epoch': 3.0}

In [None]:
# Tokenize the test split with the same function used for train/validation.
tokenized_test = test_dataset.map(pre_process_and_tokenize, batched=True)
# Run prediction on the test split; the displayed PredictionOutput carries the
# raw logits (predictions), the gold label ids and the test_* metrics.
trainer.predict(tokenized_test)

Map:   0%|          | 0/5010 [00:00<?, ? examples/s]

PredictionOutput(predictions=array([[-1.1748047 , -1.3027344 ,  2.2148438 ],
       [ 1.6533203 ,  0.22717285, -1.4589844 ],
       [-1.1181641 ,  1.9853516 , -0.8027344 ],
       ...,
       [-0.8574219 ,  2.1367188 , -1.1601562 ],
       [ 0.6855469 , -0.796875  ,  0.33374023],
       [ 2.2714844 , -0.86035156, -1.0146484 ]], dtype=float32), label_ids=array([2, 0, 1, ..., 1, 2, 0]), metrics={'test_loss': 0.728074312210083, 'test_accuracy': 0.7650698602794411, 'test_precision': 0.769565232023885, 'test_recall': 0.765069860279441, 'test_f1': 0.7649656917023853, 'test_runtime': 11.0736, 'test_samples_per_second': 452.429, 'test_steps_per_second': 7.134})

In [None]:
# Save the model currently held by the Trainer to a second Google Drive folder.
# The absolute mount path is used so the target does not depend on the
# notebook's current working directory.
trainer.save_model("/content/drive/MyDrive/Shay/models/pho_bert_zsl_topic_classification_model")