In [1]:
from torch.utils.data import DataLoader
import torch.nn as nn
import torch
from tqdm import tqdm
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix, classification_report
import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
from typing import *
import json
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

In [2]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class BookReviews(Dataset):
    def __init__(self, path: str, tokenizer, max_length=512):
        super().__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Load data
        data = pd.read_json(path, orient="records", lines=True)
        self.sentences = data["review"].tolist()
        self.labels = data["sentiment"].tolist()

        # Create label-to-index mapping
        labels_unique = set(self.labels)
        self.labels_to_idx = {label: i for i, label in enumerate(labels_unique)}

    def __getitem__(self, idx):
        # Tokenize sentence
        encoded = self.tokenizer(
            self.sentences[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        # Get label
        label = self.labels_to_idx[self.labels[idx]]
        
        # Return dictionary
        return {
            "input_ids": encoded["input_ids"].squeeze(0),  # Remove batch dimension
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)  # Change 'label' to 'labels'
        }

    
    def __len__(self):
        return len(self.sentences)
    
    def num_labels(self):
        return len(self.labels_to_idx)

In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load pre-trained model and tokenizer
model_name = "5CD-AI/Vietnamese-Sentiment-visobert"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/390M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/471k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

In [4]:
train_dataset = BookReviews(
    path='/kaggle/input/new-data/new_data/train.json',
    tokenizer=tokenizer
)
test_dataset = BookReviews(
    path='/kaggle/input/new-data/new_data/test.json',
    tokenizer=tokenizer
)

In [5]:
from torch.utils.data import DataLoader

batch_size = 8
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [6]:
for batch in train_dataloader:
    print(batch["input_ids"].shape)  # (batch_size, max_length)
    print(batch["attention_mask"].shape)  # (batch_size, max_length)
    print(batch["labels"].shape)  # (batch_size,)
    break

torch.Size([8, 512])
torch.Size([8, 512])
torch.Size([8])


In [7]:
from transformers import AdamW, get_scheduler

optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Scheduler
num_training_steps = len(train_dataloader) * 1  # 5 epochs
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=2,
    num_training_steps=num_training_steps
)
num_epochs = 5



In [8]:
import torch
from tqdm import tqdm
import json

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
num_epochs = 5
train_metrics = []
val_metrics = []
best_val_accuracy = 0  # Initialize best validation accuracy

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_correct_train = 0
    total_train_samples = 0
    loop = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")
    
    for batch in loop:
        # Move batch to GPU
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()
        
        # Compute training accuracy
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_correct_train += (predictions == batch["labels"]).sum().item()
        total_train_samples += batch["labels"].size(0)
        
        # Update progress bar
        loop.set_postfix(loss=loss.item())
    
    train_accuracy = total_correct_train / total_train_samples
    avg_train_loss = total_loss / len(train_dataloader)
    
    # Evaluate on dev set
    model.eval()
    total_correct_val = 0
    total_val_samples = 0
    total_val_loss = 0
    
    with torch.no_grad():
        for batch in dev_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_val_loss += loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            total_correct_val += (predictions == batch["labels"]).sum().item()
            total_val_samples += batch["labels"].size(0)
    
    val_accuracy = total_correct_val / total_val_samples
    avg_val_loss = total_val_loss / len(dev_dataloader)
    
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss}, Train Accuracy: {train_accuracy}")
    print(f"Validation Accuracy: {val_accuracy}, Validation Loss: {avg_val_loss}")
    
    # Save metrics for each epoch
    train_metrics.append({'epoch': epoch+1, 'train_loss': avg_train_loss, 'train_accuracy': train_accuracy})
    val_metrics.append({'epoch': epoch+1, 'val_loss': avg_val_loss, 'val_accuracy': val_accuracy})
    
    # Save model if current validation accuracy is higher than best
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save({
            'epoch': epoch+1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'train_metrics': train_metrics,
            'val_metrics': val_metrics
        }, 'best_model_checkpoint.pt')
        print(f"New best model saved at epoch {epoch+1} with validation accuracy: {val_accuracy}")

Epoch 1: 100%|██████████| 1744/1744 [13:02<00:00,  2.23it/s, loss=0.418] 


Epoch 1, Train Loss: 0.5798992055058138, Train Accuracy: 0.7505555157336392
Validation Accuracy: 0.7841788478073947, Validation Loss: 0.5149136363641621
New best model saved at epoch 1 with validation accuracy: 0.7841788478073947


Epoch 2: 100%|██████████| 1744/1744 [13:03<00:00,  2.23it/s, loss=0.192] 


Epoch 2, Train Loss: 0.38201687637921594, Train Accuracy: 0.8614436241129668
Validation Accuracy: 0.7841788478073947, Validation Loss: 0.5149136363641621


Epoch 3: 100%|██████████| 1744/1744 [13:04<00:00,  2.22it/s, loss=0.447] 


Epoch 3, Train Loss: 0.38344162916380126, Train Accuracy: 0.8620887391584833
Validation Accuracy: 0.7841788478073947, Validation Loss: 0.5149136363641621


Epoch 4: 100%|██████████| 1744/1744 [13:04<00:00,  2.22it/s, loss=0.145] 


Epoch 4, Train Loss: 0.38560433373273856, Train Accuracy: 0.8593649200774138
Validation Accuracy: 0.7841788478073947, Validation Loss: 0.5149136363641621


Epoch 5: 100%|██████████| 1744/1744 [13:04<00:00,  2.22it/s, loss=0.141] 


Epoch 5, Train Loss: 0.38433531867212045, Train Accuracy: 0.8583614077843882
Validation Accuracy: 0.7841788478073947, Validation Loss: 0.5149136363641621


In [9]:
import torch
from sklearn.metrics import precision_recall_fscore_support, classification_report

model.eval()
total_correct_val = 0
total_val_samples = 0
total_val_loss = 0

# Lists to store all true labels and predictions for calculating metrics
all_true_labels = []
all_predictions = []

with torch.no_grad():
    for batch in dev_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_val_loss += loss.item()
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_correct_val += (predictions == batch["labels"]).sum().item()
        total_val_samples += batch["labels"].size(0)

        # Store true labels and predictions
        all_true_labels.extend(batch["labels"].cpu().numpy())
        all_predictions.extend(predictions.cpu().numpy())

val_accuracy = total_correct_val / total_val_samples
avg_val_loss = total_val_loss / len(dev_dataloader)

# Calculate precision, recall, and F1-score
precision, recall, f1_score, _ = precision_recall_fscore_support(all_true_labels, all_predictions, average='weighted')

# Generate classification report
class_report = classification_report(all_true_labels, all_predictions)

print(f"Validation Accuracy: {val_accuracy:.4f}, Validation Loss: {avg_val_loss:.4f}")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1_score:.4f}")
print("\nClassification Report:")
print(class_report)


Validation Accuracy: 0.7842, Validation Loss: 0.5149
Precision: 0.7804, Recall: 0.7842, F1-Score: 0.7817

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.78      0.78       795
           1       0.84      0.88      0.86      1736
           2       0.68      0.62      0.65       958

    accuracy                           0.78      3489
   macro avg       0.76      0.76      0.76      3489
weighted avg       0.78      0.78      0.78      3489

