In [1]:
from torch.utils.data import DataLoader
import torch.nn as nn
import torch
from tqdm import tqdm
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix, classification_report
import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
from typing import *
import json
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

In [16]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class BookReviews(Dataset):
    def __init__(self, path: str, tokenizer, max_length=256):
        super().__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Load data
        data = pd.read_json(path, orient="records", lines=True)
        self.sentences = data["review"].tolist()
        self.labels = data["sentiment"].tolist()

        # Create label-to-index mapping
        labels_unique = set(self.labels)
        self.labels_to_idx = {label: i for i, label in enumerate(labels_unique)}

    def __getitem__(self, idx):
        # Tokenize sentence
        encoded = self.tokenizer(
            self.sentences[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        # Get label
        label = self.labels_to_idx[self.labels[idx]]
        
        # Return dictionary
        return {
            "input_ids": encoded["input_ids"].squeeze(0),  # Remove batch dimension
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)  # Change 'label' to 'labels'
        }

    
    def __len__(self):
        return len(self.sentences)
    
    def num_labels(self):
        return len(self.labels_to_idx)

In [17]:
from transformers import RobertaForSequenceClassification, AutoTokenizer

model = RobertaForSequenceClassification.from_pretrained("wonrax/phobert-base-vietnamese-sentiment")

tokenizer = AutoTokenizer.from_pretrained("wonrax/phobert-base-vietnamese-sentiment", use_fast=False)


In [18]:
train_dataset = BookReviews(
    path='/kaggle/input/new-data/new_data/train.json',
    tokenizer=tokenizer
)
test_dataset = BookReviews(
    path='/kaggle/input/new-data/new_data/test.json',
    tokenizer=tokenizer
)

In [19]:
from torch.utils.data import DataLoader

batch_size = 8
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [20]:
for batch in train_dataloader:
    print(batch["input_ids"].shape)  # (batch_size, max_length)
    print(batch["attention_mask"].shape)  # (batch_size, max_length)
    print(batch["labels"].shape)  # (batch_size,)
    break


torch.Size([8, 256])
torch.Size([8, 256])
torch.Size([8])


In [21]:
from transformers import AdamW, get_scheduler

optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Scheduler
num_training_steps = len(train_dataloader) * 5  # 5 epochs
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)



In [22]:
import torch
from tqdm import tqdm
import json

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
num_epochs = 5
train_metrics = []
val_metrics = []
best_val_accuracy = 0  # Initialize best validation accuracy

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_correct_train = 0
    total_train_samples = 0
    loop = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")
    
    for batch in loop:
        # Move batch to GPU
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()
        
        # Compute training accuracy
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_correct_train += (predictions == batch["labels"]).sum().item()
        total_train_samples += batch["labels"].size(0)
        
        # Update progress bar
        loop.set_postfix(loss=loss.item())
    
    train_accuracy = total_correct_train / total_train_samples
    avg_train_loss = total_loss / len(train_dataloader)
    
    # Evaluate on dev set
    model.eval()
    total_correct_val = 0
    total_val_samples = 0
    total_val_loss = 0
    
    with torch.no_grad():
        for batch in dev_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_val_loss += loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            total_correct_val += (predictions == batch["labels"]).sum().item()
            total_val_samples += batch["labels"].size(0)
    
    val_accuracy = total_correct_val / total_val_samples
    avg_val_loss = total_val_loss / len(dev_dataloader)
    
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss}, Train Accuracy: {train_accuracy}")
    print(f"Validation Accuracy: {val_accuracy}, Validation Loss: {avg_val_loss}")
    
    # Save metrics for each epoch
    train_metrics.append({'epoch': epoch+1, 'train_loss': avg_train_loss, 'train_accuracy': train_accuracy})
    val_metrics.append({'epoch': epoch+1, 'val_loss': avg_val_loss, 'val_accuracy': val_accuracy})
    
    # Save model if current validation accuracy is higher than best
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save({
            'epoch': epoch+1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'train_metrics': train_metrics,
            'val_metrics': val_metrics
        }, 'best_model_checkpoint.pt')
        print(f"New best model saved at epoch {epoch+1} with validation accuracy: {val_accuracy}")



Epoch 1: 100%|██████████| 1744/1744 [06:26<00:00,  4.52it/s, loss=0.896]


Epoch 1, Train Loss: 0.7196490515211443, Train Accuracy: 0.6825317181564046
Validation Accuracy: 0.6752651189452565, Validation Loss: 0.7520538472508674
New best model saved at epoch 1 with validation accuracy: 0.6752651189452565


Epoch 2: 100%|██████████| 1744/1744 [06:24<00:00,  4.53it/s, loss=0.863] 


Epoch 2, Train Loss: 0.5362882591151726, Train Accuracy: 0.7737079779227296
Validation Accuracy: 0.6938950988822012, Validation Loss: 0.7167064750937903
New best model saved at epoch 2 with validation accuracy: 0.6938950988822012


Epoch 3: 100%|██████████| 1744/1744 [06:25<00:00,  4.53it/s, loss=0.84]  


Epoch 3, Train Loss: 0.37286811994298985, Train Accuracy: 0.8551358325568059
Validation Accuracy: 0.7082258526798509, Validation Loss: 0.8191769493432626
New best model saved at epoch 3 with validation accuracy: 0.7082258526798509


Epoch 4: 100%|██████████| 1744/1744 [06:24<00:00,  4.54it/s, loss=0.337]  


Epoch 4, Train Loss: 0.23881771753530376, Train Accuracy: 0.911332520966239
Validation Accuracy: 0.7211235310977357, Validation Loss: 0.9115095950334889
New best model saved at epoch 4 with validation accuracy: 0.7211235310977357


Epoch 5: 100%|██████████| 1744/1744 [06:24<00:00,  4.54it/s, loss=0.0965] 


Epoch 5, Train Loss: 0.14477273199836502, Train Accuracy: 0.9529066016772991
Validation Accuracy: 0.7176841501862998, Validation Loss: 1.048955521634748


Epoch 6: 100%|██████████| 1744/1744 [06:25<00:00,  4.53it/s, loss=0.0535] 


Epoch 6, Train Loss: 0.11711608170005151, Train Accuracy: 0.9631567629560605
Validation Accuracy: 0.7176841501862998, Validation Loss: 1.048955521634748


Epoch 7: 100%|██████████| 1744/1744 [06:25<00:00,  4.53it/s, loss=0.011]  


Epoch 7, Train Loss: 0.11787813600876855, Train Accuracy: 0.962941724607555
Validation Accuracy: 0.7176841501862998, Validation Loss: 1.048955521634748


Epoch 8: 100%|██████████| 1744/1744 [06:25<00:00,  4.53it/s, loss=0.0739] 


Epoch 8, Train Loss: 0.11747691536337189, Train Accuracy: 0.9627983657085514
Validation Accuracy: 0.7176841501862998, Validation Loss: 1.048955521634748


Epoch 9:   5%|▍         | 82/1744 [00:18<06:09,  4.50it/s, loss=0.019]  


KeyboardInterrupt: 

In [23]:
import torch
from sklearn.metrics import precision_recall_fscore_support, classification_report

model.eval()
total_correct_val = 0
total_val_samples = 0
total_val_loss = 0

# Lists to store all true labels and predictions for calculating metrics
all_true_labels = []
all_predictions = []

with torch.no_grad():
    for batch in dev_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_val_loss += loss.item()
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_correct_val += (predictions == batch["labels"]).sum().item()
        total_val_samples += batch["labels"].size(0)

        # Store true labels and predictions
        all_true_labels.extend(batch["labels"].cpu().numpy())
        all_predictions.extend(predictions.cpu().numpy())

val_accuracy = total_correct_val / total_val_samples
avg_val_loss = total_val_loss / len(dev_dataloader)

# Calculate precision, recall, and F1-score
precision, recall, f1_score, _ = precision_recall_fscore_support(all_true_labels, all_predictions, average='weighted')

# Generate classification report
class_report = classification_report(all_true_labels, all_predictions)

print(f"Validation Accuracy: {val_accuracy:.4f}, Validation Loss: {avg_val_loss:.4f}")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1_score:.4f}")
print("\nClassification Report:")
print(class_report)


Validation Accuracy: 0.7177, Validation Loss: 1.0490
Precision: 0.7106, Recall: 0.7177, F1-Score: 0.7119

Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.63      0.68       795
           1       0.78      0.87      0.82      1736
           2       0.57      0.51      0.54       958

    accuracy                           0.72      3489
   macro avg       0.69      0.67      0.68      3489
weighted avg       0.71      0.72      0.71      3489

