In [None]:
!pip install transformers torch scikit-learn gdown



In [None]:
import inspect
import os
import re

import gdown
import torch
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

# Class id -> human-readable sentiment name
label_mapping = {
    0: "negative",
    1: "neutral",
    2: "positive",
}

# Function to download file from Google Drive
def download_gdrive_file(file_name, drive_link):
  """Download a shared Google Drive file to ``file_name`` unless it already exists.

  Args:
    file_name: Local path to write to. If it already exists, nothing is
      downloaded and the link is not inspected.
    drive_link: Google Drive sharing link that carries the file id either as
      ``.../d/<id>/...`` or as an ``id=<id>`` query parameter.

  Raises:
    ValueError: If no file id can be found in ``drive_link``.
    RuntimeError: If ``gdown.download`` returns ``None`` (no file was produced).
  """
  if os.path.exists(file_name):
    print(f"File {file_name} existed.")
    return

  # Convert the view/sharing link into a direct-download link
  id_match = re.search(r"/d/([^/?#]+)|[?&]id=([^&#]+)", drive_link)
  if id_match is None:
    raise ValueError(f"Cannot find a Google Drive file id in link: {drive_link!r}")
  file_id = id_match.group(1) or id_match.group(2)

  download_url = f'https://drive.google.com/uc?export=download&id={file_id}'
  print(f"Downloading {file_name} from {download_url} ...")
  if gdown.download(download_url, file_name, quiet=False) is None:
    # Fail here instead of letting the caller hit a confusing error on open()
    raise RuntimeError(f"Download of {file_name} from {download_url} failed")

In [None]:
# Function to load train data file
def load_train_data(file_name, drive_link):
  """Return the non-blank lines of a training file, each stripped of whitespace.

  The file is fetched through ``download_gdrive_file`` first, then read as UTF-8.

  Args:
    file_name: Local path of the training file.
    drive_link: Google Drive link handed to ``download_gdrive_file``.

  Returns:
    A list of stripped, non-empty lines in file order.
  """
  download_gdrive_file(file_name, drive_link)
  with open(file_name, "r", encoding="utf-8") as handle:
    content = handle.read()

  samples = []
  for raw_line in content.split("\n"):
    stripped = raw_line.strip()
    if stripped:
      samples.append(stripped)
  return samples

In [None]:
# Function to load test data file
def load_test_data(file_name, drive_link):
  """Read a test file laid out as alternating text / label lines.

  Blank lines are ignored. Labels are matched case-insensitively against
  ``POS`` / ``NEU`` / ``NEG`` and converted to 2 / 1 / 0.

  Args:
    file_name: Local path of the test file.
    drive_link: Google Drive link handed to ``download_gdrive_file``.

  Returns:
    A ``(texts, labels)`` tuple of equally long lists.

  Raises:
    ValueError: If a label line is not one of the three known labels.
  """
  label_ids = {"NEG": 0, "NEU": 1, "POS": 2}

  download_gdrive_file(file_name, drive_link)
  with open(file_name, "r", encoding="utf-8") as handle:
    content = handle.read()
  rows = [row.strip() for row in content.split("\n") if row.strip()]

  if len(rows) % 2 != 0:
    print("Warning: The number of lines in the test file is not even, check the file test again!")

  texts, labels = [], []
  # zip stops at the shorter slice, so a trailing line without a partner is ignored
  for sentence, raw_label in zip(rows[0::2], rows[1::2]):
    label_str = raw_label.upper()
    if label_str not in label_ids:
      raise ValueError(f"Invalid label: {label_str}")
    texts.append(sentence)
    labels.append(label_ids[label_str])

  return texts, labels

In [None]:
def clean_text(texts):
    """Normalize each text: drop URLs, collapse whitespace runs, trim the ends.

    Args:
        texts: Iterable of strings.

    Returns:
        A new list with one cleaned string per input, in the same order.
    """
    def _clean_one(sentence):
        # Remove URLs (anything starting with "http" up to the next whitespace)
        without_urls = re.sub(r'http\S+', '', sentence)
        # Collapse redundant whitespace (newlines, tabs, ...) into single spaces
        single_spaced = re.sub(r'\s+', ' ', without_urls)
        # Trim leading and trailing whitespace
        return single_spaced.strip()

    return [_clean_one(sentence) for sentence in texts]

In [None]:
# Custom dataset
class SentimentDataset(Dataset):
  """Torch dataset that tokenizes all texts once, at construction time.

  Args:
    texts: Input strings, handed to ``tokenizer`` in a single call.
    labels: Class ids aligned one-to-one with ``texts``, or ``None`` for
      unlabeled data (items then carry no ``"labels"`` entry).
    tokenizer: Callable invoked as
      ``tokenizer(texts, truncation=True, padding=True, max_length=max_length)``
      that returns a mapping containing at least ``"input_ids"``.
    max_length: Truncation length forwarded to the tokenizer.

  Raises:
    ValueError: If ``labels`` is given and its length differs from the number
      of tokenized samples.
  """

  def __init__(self, texts, labels, tokenizer, max_length=128):
    self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
    n_samples = len(self.encodings["input_ids"])
    # Catch misaligned labels here rather than as an IndexError (or silently
    # wrong pairs) once a DataLoader starts indexing the dataset.
    if labels is not None and len(labels) != n_samples:
      raise ValueError(
          f"Got {len(labels)} labels for {n_samples} texts; they must be aligned one-to-one."
      )
    self.labels = labels

  def __getitem__(self, idx):
    # One tensor per tokenizer output field for sample ``idx``
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    if self.labels is not None:
      item["labels"] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.encodings["input_ids"])

In [None]:
# Compute evaluation metrics (used when the test set has ground-truth labels)
def compute_metrics(pred):
    """Compute accuracy and weighted F1 from a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the argmax over the last axis is taken as the
            predicted class). Presumably the object ``Trainer`` passes to its
            ``compute_metrics`` hook.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
# Links to the data files on Google Drive
train_positive_url = "https://drive.google.com/file/d/1ufy3fwnrh8XVeKOjSfo0QEvg6YD85RGu/view?usp=sharing"
train_neutral_url  = "https://drive.google.com/file/d/1RoRvqXwdvdVAcbnoRZgq6r2YYZk9lAmN/view?usp=sharing"
train_negative_url = "https://drive.google.com/file/d/11GHXGEE5X6QcL1llx4bmYa5hid8Bb8Ja/view?usp=sharing"
test_url           = "https://drive.google.com/file/d/1pudi3cTGxqs85RopswaQu1panBhjB-wG/view?usp=sharing"

train_negative = load_train_data("train_negative_tokenized.txt", train_negative_url)
train_neutral  = load_train_data("train_neutral_tokenized.txt", train_neutral_url)
train_positive = load_train_data("train_positive_tokenized.txt", train_positive_url)

# Label assignment: negative -> 0, neutral -> 1, positive -> 2.
# The full (pre-split) data gets its own names so the lineage stays explicit.
texts_all = clean_text(train_negative + train_neutral + train_positive)
labels_all = [0] * len(train_negative) + [1] * len(train_neutral) + [2] * len(train_positive)
assert len(texts_all) == len(labels_all), "texts and labels must stay aligned"

# Hold out 20% for validation, stratified by class, with a fixed seed.
# train_test_split is imported in the notebook's import cell.
texts_train, texts_val, labels_train, labels_val = train_test_split(
    texts_all, labels_all, test_size=0.2, stratify=labels_all, random_state=42
)

print(f"S·ªë m·∫´u train: {len(texts_train)}")
print(f"S·ªë m·∫´u validation: {len(texts_val)}")

File train_negative_tokenized.txt existed.
File train_neutral_tokenized.txt existed.
File train_positive_tokenized.txt existed.
S·ªë m·∫´u train: 4080
S·ªë m·∫´u validation: 1020


In [None]:
# Load the labeled test split and apply the same cleaning as the training texts
texts_test, labels_test = load_test_data("test_tokenized_ANS.txt", test_url)
texts_test = clean_text(texts_test)

print(f"S·ªë m·∫´u test: {len(texts_test)}")
print("V√≠ d·ª• test:")
# Preview at most the first three samples with their numeric and named labels
for idx, sample_text in enumerate(texts_test[:3]):
    sample_label = labels_test[idx]
    print("Text:", sample_text)
    print("Label:", sample_label, "-", label_mapping[sample_label])
    print("-----")

File test_tokenized_ANS.txt existed.
S·ªë m·∫´u test: 1050
V√≠ d·ª• test:
Text: Kh√¥ng n√™n mua chu·ªôt cua Logitech , v√¨ d√πng n√≥ r·∫•t kh√≥ ƒë·ªïi c√°i m·ªõi . M√¨nh nghe th·∫±ng b·∫°n x√∫i mua con M325 c√°ch ƒë√¢y 5 nƒÉm , d√π c√≥ c∆°_s·ªë l·∫ßn r∆°i_r·ªõt quƒÉng_qu·∫≠t m√† ƒë·∫øn gi·ªù v·∫´n ch∆∞a h∆∞ . Gi·ªù ƒëang th√®m em MX_Anywhere_2 n√†y m√† chu·ªôt c≈© ch∆∞a h∆∞ sao mua chu·ªôt m·ªõi !
Label: 2 - positive
-----
Text: N√≥i thi·ªát l√† m√¨nh th√¨ th√¨ chu·ªôt n√†o m√¨nh c≈©ng ch∆°i tu·ªët , ch·ªâ tr·ª´ 1 h√£ng ra : Razer . M√¨nh ƒëang s·ªü_h·ªØu 1 con DA black , x√†i ƒë∆∞·ª£c 6 th√°ng n√≥ b·ªã double click , ƒëem s·ª≠a xong x√†i ƒë∆∞·ª£c them 2 th√°ng n·ªØa n√≥ b·ªã h∆∞ n√∫t cu·ªôn ... Trong khi con SS_Sensei m√¨nh x√†i 3 nƒÉm m·ªõi b·ªã double click v√† r√≠t n√∫t cu·ªôn .
Label: 0 - negative
-----
Text: Xai chuot so nhat bi double_click .
Label: 1 - neutral
-----


In [None]:
model_name = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# The classification head is not in the checkpoint and is initialized randomly,
# so seed torch's RNG first to make the starting weights repeatable across runs.
torch.manual_seed(42)
# One output per sentiment class in label_mapping
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_mapping))

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize each split once and wrap it as a torch dataset
train_dataset = SentimentDataset(texts=texts_train, labels=labels_train, tokenizer=tokenizer)
val_dataset = SentimentDataset(texts=texts_val, labels=labels_val, tokenizer=tokenizer)
test_dataset = SentimentDataset(texts=texts_test, labels=labels_test, tokenizer=tokenizer)

In [None]:
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# (the old keyword raises TypeError where it has been removed). Use whichever
# keyword the installed version accepts.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=1e-5,
    weight_decay=0.01,
    # Evaluate and checkpoint every epoch so the best checkpoint (by F1) can be
    # restored at the end and early stopping has a metric to watch.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    **{eval_strategy_key: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Stop once validation F1 has not improved for 2 consecutive evaluations
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)



In [None]:
# Run fine-tuning. The call is left as the cell's last expression so that the
# value it returns is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.684861,0.709804,0.706344
2,0.782600,0.603152,0.754902,0.753411
3,0.782600,0.637875,0.752941,0.749194
4,0.463100,0.610569,0.772549,0.774113
5,0.463100,0.698304,0.752941,0.748227
6,0.326400,0.705755,0.759804,0.758483


TrainOutput(global_step=1530, training_loss=0.5193164943869597, metrics={'train_runtime': 686.9547, 'train_samples_per_second': 59.393, 'train_steps_per_second': 3.712, 'total_flos': 1610254116495360.0, 'train_loss': 0.5193164943869597, 'epoch': 6.0})

In [None]:
# Evaluate on the test dataset; the returned metrics dict is printed below its heading
results = trainer.evaluate(eval_dataset=test_dataset)
print("K·∫øt qu·∫£ ƒë√°nh gi√° tr√™n t·∫≠p test:", results, sep="\n")

K·∫øt qu·∫£ ƒë√°nh gi√° tr√™n t·∫≠p test:
{'eval_loss': 0.8318629264831543, 'eval_accuracy': 0.7095238095238096, 'eval_f1': 0.709959920505598, 'eval_runtime': 7.0442, 'eval_samples_per_second': 149.06, 'eval_steps_per_second': 9.369, 'epoch': 6.0}
