In [None]:
!pip install transformers torch scikit-learn gdown



In [None]:
import inspect
import os
import re

import gdown
import torch
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

# Class index -> human-readable sentiment name
label_mapping = {
    0: "negative",
    1: "neutral",
    2: "positive",
}

# Function to download file from Google Drive
def download_gdrive_file(file_name, drive_link):
  """Download a Google Drive file to ``file_name`` unless it already exists.

  Args:
    file_name: Local path the file is written to.
    drive_link: Google Drive share link. Both the ``.../d/<id>/...`` path form
      and links carrying the id in an ``id=<id>`` query parameter are accepted.

  Raises:
    ValueError: If no file id can be extracted from ``drive_link``.
    RuntimeError: If gdown returns ``None`` (assumed to signal a failed download).
  """
  if os.path.exists(file_name):
    print(f"File {file_name} existed.")
    return

  # Convert view link to download link. Stop the id at '/', '?' or '#' so a link
  # ending in ".../d/<id>?usp=sharing" does not drag the query string along.
  id_match = re.search(r"/d/([^/?#]+)", drive_link) or re.search(r"[?&]id=([^&#]+)", drive_link)
  if id_match is None:
    raise ValueError(f"Cannot extract a Google Drive file id from link: {drive_link}")
  file_id = id_match.group(1)

  download_url = f'https://drive.google.com/uc?export=download&id={file_id}'
  print(f"Downloading {file_name} from {download_url} ...")
  output_path = gdown.download(download_url, file_name, quiet=False)
  # Fail here with a clear message instead of later when the file is opened.
  # NOTE(review): assumes gdown returns None on a failed download — confirm.
  if output_path is None:
    raise RuntimeError(f"Failed to download {file_name} from {download_url}")

In [None]:
# Function to load train data file
def load_train_data(file_name, drive_link):
  """Fetch a training file if needed and return its non-empty lines.

  The file is downloaded via ``download_gdrive_file`` and read as UTF-8. Each
  line is stripped of surrounding whitespace; lines that are empty after
  stripping are dropped.

  Args:
    file_name: Local path of the training file.
    drive_link: Google Drive link passed to ``download_gdrive_file``.

  Returns:
    List of stripped, non-empty lines in file order.
  """
  download_gdrive_file(file_name, drive_link)
  with open(file_name, "r", encoding="utf-8") as handle:
    raw_text = handle.read()
  stripped_pieces = (piece.strip() for piece in raw_text.split("\n"))
  return [piece for piece in stripped_pieces if piece]

In [None]:
# Function to load test data file
def load_test_data(file_name, drive_link):
  """Fetch the test file if needed and parse it into texts and integer labels.

  After blank lines are removed, the file is read as alternating lines: a text
  line followed by its label line (``POS``, ``NEU`` or ``NEG``, matched
  case-insensitively). If the number of lines is odd, a warning is printed and
  the trailing unpaired line is ignored.

  Args:
    file_name: Local path of the test file.
    drive_link: Google Drive link passed to ``download_gdrive_file``.

  Returns:
    Tuple ``(texts, labels)`` where labels use NEG -> 0, NEU -> 1, POS -> 2.

  Raises:
    ValueError: If a label line is not one of POS / NEU / NEG.
  """
  download_gdrive_file(file_name, drive_link)
  with open(file_name, "r", encoding="utf-8") as handle:
    content = handle.read()
  lines = [piece.strip() for piece in content.split("\n") if piece.strip()]

  if len(lines) % 2 != 0:
    print("Warning: The number of lines in the test file is not even, check the file test again!")

  label_ids = {"NEG": 0, "NEU": 1, "POS": 2}
  texts, labels = [], []
  # Even positions hold texts, odd positions hold labels; zip drops an unpaired tail.
  for sentence, raw_label in zip(lines[0::2], lines[1::2]):
    label_str = raw_label.upper()
    if label_str not in label_ids:
      raise ValueError(f"Invalid label: {label_str}")
    texts.append(sentence)
    labels.append(label_ids[label_str])

  return texts, labels

In [None]:
def clean_text(texts):
    """Normalize each text by removing URLs and collapsing whitespace.

    Args:
        texts: Iterable of strings.

    Returns:
        List of cleaned strings, in the same order and of the same length as
        the input.
    """
    url_pattern = re.compile(r'http\S+')       # e.g. http://... or https://...
    whitespace_pattern = re.compile(r'\s+')    # any run of spaces, newlines, tabs, ...

    def _clean_one(text):
        without_urls = url_pattern.sub('', text)
        # Collapse whitespace runs to one space, then trim both ends.
        return whitespace_pattern.sub(' ', without_urls).strip()

    return [_clean_one(text) for text in texts]

In [None]:
# Custom dataset
class SentimentDataset(Dataset):
  """Dataset that tokenizes all texts up front and serves them as tensors.

  Args:
    texts: Texts passed to ``tokenizer`` in a single call.
    labels: Labels indexable by sample position, or ``None`` for unlabeled data.
    tokenizer: Callable invoked with ``truncation=True``, ``padding=True`` and
      ``max_length``; its result must support ``.items()`` and ``["input_ids"]``.
    max_length: Maximum length forwarded to the tokenizer.
  """

  def __init__(self, texts, labels, tokenizer, max_length=128):
    self.labels = labels
    self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)

  def __len__(self):
    # The number of samples equals the number of encoded input-id rows.
    return len(self.encodings["input_ids"])

  def __getitem__(self, idx):
    item = {}
    for key, values in self.encodings.items():
      item[key] = torch.tensor(values[idx])
    if self.labels is None:
      # Unlabeled data: return the encoded features only.
      return item
    item["labels"] = torch.tensor(self.labels[idx])
    return item

In [None]:
# Evaluation metrics (used when the evaluated set has ground-truth labels)
def compute_metrics(pred):
    """Compute accuracy and weighted F1 from a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (scores whose last axis is reduced with ``argmax``).

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)  # highest-scoring class per sample
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
# Google Drive links of the data files
train_positive_url = "https://drive.google.com/file/d/1ufy3fwnrh8XVeKOjSfo0QEvg6YD85RGu/view?usp=sharing"
train_neutral_url  = "https://drive.google.com/file/d/1RoRvqXwdvdVAcbnoRZgq6r2YYZk9lAmN/view?usp=sharing"
train_negative_url = "https://drive.google.com/file/d/11GHXGEE5X6QcL1llx4bmYa5hid8Bb8Ja/view?usp=sharing"
test_url           = "https://drive.google.com/file/d/1pudi3cTGxqs85RopswaQu1panBhjB-wG/view?usp=sharing"

train_negative = load_train_data("train_negative_tokenized.txt", train_negative_url)
train_neutral  = load_train_data("train_neutral_tokenized.txt", train_neutral_url)
train_positive = load_train_data("train_positive_tokenized.txt", train_positive_url)

# Assign labels: negative -> 0, neutral -> 1, positive -> 2.
# The full corpus gets its own names so the split below never overwrites its inputs.
texts_all = clean_text(train_negative + train_neutral + train_positive)
labels_all = [0] * len(train_negative) + [1] * len(train_neutral) + [2] * len(train_positive)
assert len(texts_all) == len(labels_all), "texts and labels are misaligned"

# Stratified 80/20 train/validation split with a fixed random_state.
texts_train, texts_val, labels_train, labels_val = train_test_split(
    texts_all, labels_all, test_size=0.2, stratify=labels_all, random_state=42
)

print(f"Số mẫu train: {len(texts_train)}")
print(f"Số mẫu validation: {len(texts_val)}")

File train_negative_tokenized.txt existed.
File train_neutral_tokenized.txt existed.
File train_positive_tokenized.txt existed.
Số mẫu train: 4080
Số mẫu validation: 1020


In [None]:
# Load the labelled test set and clean its texts with clean_text
texts_test, labels_test = load_test_data("test_tokenized_ANS.txt", test_url)
texts_test = clean_text(texts_test)

print(f"Số mẫu test: {len(texts_test)}")
print("Ví dụ test:")
# Show at most the first three samples as a sanity check of the text/label pairing
for sample_text, sample_label in zip(texts_test[:3], labels_test[:3]):
    print("Text:", sample_text)
    print("Label:", sample_label, "-", label_mapping[sample_label])
    print("-----")

File test_tokenized_ANS.txt existed.
Số mẫu test: 1050
Ví dụ test:
Text: Không nên mua chuột cua Logitech , vì dùng nó rất khó đổi cái mới . Mình nghe thằng bạn xúi mua con M325 cách đây 5 năm , dù có cơ_số lần rơi_rớt quăng_quật mà đến giờ vẫn chưa hư . Giờ đang thèm em MX_Anywhere_2 này mà chuột cũ chưa hư sao mua chuột mới !
Label: 2 - positive
-----
Text: Nói thiệt là mình thì thì chuột nào mình cũng chơi tuốt , chỉ trừ 1 hãng ra : Razer . Mình đang sở_hữu 1 con DA black , xài được 6 tháng nó bị double click , đem sửa xong xài được them 2 tháng nữa nó bị hư nút cuộn ... Trong khi con SS_Sensei mình xài 3 năm mới bị double click và rít nút cuộn .
Label: 0 - negative
-----
Text: Xai chuot so nhat bi double_click .
Label: 1 - neutral
-----


In [None]:
SEED = 42  # fixed seed so the randomly initialized classifier head is repeatable

model_name = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# The classification head is not part of the checkpoint and is newly initialized,
# so seed torch's RNG first; otherwise every fresh kernel starts from different weights.
torch.manual_seed(SEED)
# Derive the class count from label_mapping so the model and the mapping cannot drift apart.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_mapping))

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize each split once, in the order train -> validation -> test
train_dataset, val_dataset, test_dataset = (
    SentimentDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (texts_train, labels_train),
        (texts_val, labels_val),
        (texts_test, labels_test),
    )
)

In [None]:
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers releases.
# Pick whichever keyword the installed version accepts, so this cell works with
# any transformers version the unpinned install resolves to.
if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters:
    eval_strategy_kwarg = {"eval_strategy": "epoch"}
else:
    eval_strategy_kwarg = {"evaluation_strategy": "epoch"}

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=1e-5,
    weight_decay=0.01,
    save_strategy="epoch",        # checkpoint on the same schedule as evaluation
    load_best_model_at_end=True,  # restore the best checkpoint after training
    metric_for_best_model="f1",   # the "f1" key returned by compute_metrics
    greater_is_better=True,
    **eval_strategy_kwarg,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Stop when validation F1 has not improved for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)



In [None]:
# Fine-tune the model. The training arguments request evaluation and a checkpoint
# every epoch, early stopping with patience 2, and reloading the best model
# (selected by F1) at the end.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.684861,0.709804,0.706344
2,0.782600,0.603152,0.754902,0.753411
3,0.782600,0.637875,0.752941,0.749194
4,0.463100,0.610569,0.772549,0.774113
5,0.463100,0.698304,0.752941,0.748227
6,0.326400,0.705755,0.759804,0.758483


TrainOutput(global_step=1530, training_loss=0.5193164943869597, metrics={'train_runtime': 686.9547, 'train_samples_per_second': 59.393, 'train_steps_per_second': 3.712, 'total_flos': 1610254116495360.0, 'train_loss': 0.5193164943869597, 'epoch': 6.0})

In [None]:
# Final evaluation on the test set
results = trainer.evaluate(eval_dataset=test_dataset)
print("Kết quả đánh giá trên tập test:", results, sep="\n")

Kết quả đánh giá trên tập test:
{'eval_loss': 0.8318629264831543, 'eval_accuracy': 0.7095238095238096, 'eval_f1': 0.709959920505598, 'eval_runtime': 7.0442, 'eval_samples_per_second': 149.06, 'eval_steps_per_second': 9.369, 'epoch': 6.0}
