In [6]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-20.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.11.18-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[h

In [2]:
from datasets import load_dataset
dataset = load_dataset("uit-nlp/vietnamese_students_feedback") # https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8573337


Downloading data: 100%|██████████| 898k/898k [00:00<00:00, 3.30MB/s]
Downloading data: 100%|██████████| 22.9k/22.9k [00:00<00:00, 16.4MB/s]
Downloading data: 100%|██████████| 22.9k/22.9k [00:00<00:00, 11.7MB/s]
Downloading data: 100%|██████████| 119k/119k [00:00<00:00, 30.5MB/s]
Downloading data: 100%|██████████| 3.17k/3.17k [00:00<00:00, 2.40MB/s]
Downloading data: 100%|██████████| 3.17k/3.17k [00:00<00:00, 1.40MB/s]
Downloading data: 100%|██████████| 248k/248k [00:00<00:00, 26.3MB/s]
Downloading data: 100%|██████████| 6.33k/6.33k [00:00<00:00, 4.45MB/s]
Downloading data: 100%|██████████| 6.33k/6.33k [00:00<00:00, 4.55MB/s]
Generating train split: 11426 examples [00:00, 48932.72 examples/s]
Generating validation split: 1583 examples [00:00, 53961.49 examples/s]
Generating test split: 3166 examples [00:00, 51801.12 examples/s]


In [5]:
dataset['train'][0]

{'sentence': 'slide giáo trình đầy đủ .', 'sentiment': 2, 'topic': 1}

In [7]:
import pandas as pd
train_df = dataset['train'].to_pandas()
val_df = dataset['validation'].to_pandas()
test_df = dataset['test'].to_pandas()

# Concatenate all DataFrames into one (so data is more objective and balance)
df = pd.concat([train_df, val_df, test_df], ignore_index=True)

# Randomly mix/shuffle the DataFrame
df = df.sample(frac=1).reset_index(drop=True)
df.rename(columns = {'sentence': 'content', 'sentiment': 'label'}, inplace = True)

In [10]:
texts, labels = load_vietnamese_sa_dataset('/home/elo/.cache/kagglehub/datasets/linhlpv/vietnamese-sentiment-analyst/versions/2/data.csv')


Loaded dataset with 31460 samples.
Label distribution: [ 6669  4698 20093]


In [11]:
labels

array([2, 2, 0, ..., 2, 2, 2])

In [12]:

df_additional = pd.DataFrame({
    'content': texts,
    'label': labels
})

# Remove rows where the label could not be mapped (i.e., 'label' is None).
# This handles cases where the original rating was not in [1, 2, 3, 4, 5].
df_additional.dropna(subset=['label'], inplace=True)

df_additional['label'] = df_additional['label'].astype(int)

df = pd.concat([df, df_additional], ignore_index=True)

df = df.sample(frac=1, random_state=42).reset_index(drop=True)



In [14]:
df = df.drop(columns=['topic'])

In [15]:
df

Unnamed: 0,content,label
0,cô cho thực hành rất nhiều .,2
1,Áo chất cứng,0
2,Hàng y hình,1
3,"Quần giống hình nhưng ko có phần chọn size, sh...",1
4,thầy rất dễ chịu .,2
...,...,...
47630,"cô rất thương sinh viên , hầu như tất cả sinh ...",2
47631,Shop phục vụ rất tốt Rất đáng tiền,2
47632,Thời gian giao hàng rất nhanh,2
47633,hy vọng ý kiến của em được xem xét !,0


In [34]:
# Define the path for the output CSV file
output_csv_path = 'sentiment_analysis_dataset.csv'

df.to_csv(output_csv_path, index=False, encoding='utf-8')

print(f"DataFrame saved to {output_csv_path}")


DataFrame saved to sentiment_analysis_dataset.csv


In [9]:
# Vietnamese BERT Models Evaluation Notebook
# Filename: vietnamese_bert_evaluation.ipynb

# Cell 1: Imports
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

# Explanation for the AdamW import change:
# The error "cannot import name 'AdamW' from 'transformers'" occurs because
# the AdamW optimizer is no longer available directly under the 'transformers'
# top-level package in recent versions of the Hugging Face transformers library.
# It was deprecated and then removed.
#
# The recommended way to import AdamW is now directly from 'torch.optim'.
# The 'get_linear_schedule_with_warmup' function, however, remains correctly
# imported from 'transformers'.
#
# Old import line (causing the error):
# from transformers import AdamW, get_linear_schedule_with_warmup
#
# Corrected imports:
from torch.optim import AdamW  # Import AdamW from torch.optim
from transformers import get_linear_schedule_with_warmup # This import is still correct

from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import unicodedata
from underthesea import word_tokenize
import emoji
# For Vietnamese text processing
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Cell 2: Load Vietnamese Stopwords
try:
    with open('/data/elo/khanglg/FreeTxt-Flask/vietnamese-stopwords.txt', 'r', encoding='utf-8') as f:
        vi_stopwords = [line.strip() for line in f if line.strip()]
    print(f"Successfully loaded {len(vi_stopwords)} Vietnamese stopwords.")
except FileNotFoundError:
    print("Vietnamese stopwords file not found. Please check the path.")
    vi_stopwords = []

# Define punctuation
PUNCS = '''!→()-[]{};:'"\,<>?@#$%^&*_~'''

# Cell 3: Device Configuration
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
def normalize_unicode(text):
    return unicodedata.normalize('NFC', text)

# Cell 4: Text Preprocessing Function
# Old preprocess_text function
# def preprocess_text(text, language='vi'):
#     """
#     Preprocesses text for sentiment analysis:
#     - Converts input to string to handle potential NaNs or other types
#     - Removes URLs, mentions, hashtags
#     - Removes punctuation
#     - Converts to lowercase
#     - Removes stopwords
#     """
#     text = str(text) # Convert text to string to prevent TypeError
#     text = re.sub(r"http\\S+|@\\S+|#\\S+", "", text)
#     text = re.sub(f"[{re.escape(''.join(PUNCS))}]", "", text.lower())
#     text = " ".join(word for word in text.split() if word not in vi_stopwords)
#     return text
def preprocess_text(text, language='vi'): # Keep the name consistent if other cells call it
    """
    Improved preprocessing for text, especially Vietnamese:
    - Unicode normalization (NFC)
    - URL, mention, hashtag removal
    - Word segmentation (using underthesea for Vietnamese)
    - Stopword removal on token list
    - Punctuation removal
    - Whitespace normalization
    """
    text = str(text)
    text = normalize_unicode(text) # Normalize Unicode first
    text = emoji.demojize(text)
    # Remove URLs, mentions, and hashtags before word segmentation
    # as they might interfere or be wrongly segmented.
    text = re.sub(r"http\S+|@\S+|#\S+", "", text)
    
    if language == 'vi':
        # Tokenize to list for better stopword filtering and processing
        tokens = word_tokenize(text, format="list") # Returns list of words
    else:
        # For other languages, or if underthesea is not to be used for them,
        # split by space. BERT tokenizers will handle subwords.
        tokens = text.split()

    # Lowercase, filter stopwords, and keep only tokens with word characters
    # (helps remove leftover punctuation tokens before final PUNCS removal)
    processed_tokens = []
    for t in tokens:
        t_lower = t.lower()
        # Check if it's a stopword (only for Vietnamese in this specific path)
        is_stopword = (language == 'vi' and t_lower in vi_stopwords)
        
        # Keep token if it's not a stopword AND contains at least one word character
        # re.search(r'\w', t_lower) checks if there is any alphanumeric character.
        if not is_stopword and re.search(r'\w', t_lower):
            processed_tokens.append(t_lower)
            
    # Rejoin tokens
    processed_text = " ".join(processed_tokens)
    processed_text = re.sub(f"[{re.escape(''.join(PUNCS))}]", "", processed_text) # Apply to string
    
    # Normalize extra whitespace that might have been introduced or left over
    processed_text = re.sub(r'\s+', ' ', processed_text).strip()
    
    return processed_text


# Cell 5: Load and Prepare Dataset
# Note: You'll need to replace this with your actual dataset loading code
def load_vietnamese_sa_dataset(file_path, has_header=True):
    """
    Loads a Vietnamese sentiment analysis dataset.
    Expected format: CSV/TSV with text and label columns
    Returns: texts, labels
    """
    try:
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif file_path.endswith('.tsv'):
            df = pd.read_csv(file_path, sep='\t')
        else:
            raise ValueError("File format not supported. Please use CSV or TSV.")
        
        
        # Adapt these column names to match your dataset
        texts = df['content'].values  # Replace 'text' with your actual column name
        labels = df['label'].values  # Replace 'label' with your actual column name
        # Ensure labels are consistently processed and converted to integers.
        
        # Updated mapping to handle 'POS', 'NEU', 'NEG' labels.
        # We'll map them to 0 (Negative), 1 (Neutral), 2 (Positive).
        # Consider converting labels to a consistent case (e.g., uppercase) before mapping.
        current_label_mapping = {
            'NEG': 0,  # Negative
            'NEU': 1,  # Neutral
            'POS': 2   # Positive
        }

        processed_labels = []
        # Iterate through labels, convert to string, strip whitespace, convert to uppercase, then map.
        for label_val in labels: # labels is initially df['label'].values
            # Convert to string, strip leading/trailing whitespace, and convert to uppercase
            # to handle potential case variations (e.g., 'pos', 'Pos', 'POS').
            s_label = str(label_val).strip().upper()
            
            mapped_value = current_label_mapping.get(s_label)
            
            if mapped_value is not None:
                processed_labels.append(mapped_value)
            else:
                # If not in mapping, it's an unhandled label.
                # The original code also had a try-except for direct int conversion,
                # but with 'POS', 'NEU', 'NEG', this path is unlikely to be useful
                # unless the dataset mixes these string labels with numeric ones.
                # For clarity and to strictly adhere to the new requirement,
                # we will raise an error if a label is not in the defined mapping.
                raise ValueError(
                    f"Label '{s_label}' (derived from original value: '{label_val}') is not in the "
                    f"defined mapping {current_label_mapping}. "
                    f"Please check your dataset or update the label mapping. "
                    f"Expected labels are 'POS', 'NEU', 'NEG' (case-insensitive)."
                )
        
        # Convert the list of processed labels to a NumPy array of int64
        labels = np.array(processed_labels, dtype=np.int64)
        
        # Map labels if needed (example mapping)
        # Assuming labels are: negative=0, neutral=1, positive=2
        # If your dataset uses different values, adjust this mapping
        label_mapping = {'0': 0, '1': 1, '2': 2}  # Adjust as needed
        if isinstance(labels[0], str):
            labels = np.array([label_mapping.get(label, label) for label in labels])
        
        print(f"Loaded dataset with {len(texts)} samples.")
        print(f"Label distribution: {np.bincount(labels)}")
        
        return texts, labels
        
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return [], []

# Sample usage
# texts, labels = load_vietnamese_sa_dataset('path/to/your/vietnamese_sentiment_dataset.csv')

# Cell 6: Load and Compare Models
def load_model(model_name, num_labels=3):
    """
    Loads a pretrained model and tokenizer, and adapts it for sentiment classification.
    Args:
        model_name: HuggingFace model name/path
        num_labels: Number of sentiment classes (3 for negative/neutral/positive)
    Returns:
        tokenizer, model
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        
        # If the model already has a classification head, use AutoModelForSequenceClassification
        try:
            model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        except:
            # If it's a base model without classification head, load base model and add classification layer
            base_model = AutoModel.from_pretrained(model_name)
            
            # Create a simple classification head
            class SentimentClassifier(torch.nn.Module):
                def __init__(self, base_model, num_labels):
                    super(SentimentClassifier, self).__init__()
                    self.base_model = base_model
                    self.dropout = torch.nn.Dropout(0.1)
                    self.classifier = torch.nn.Linear(base_model.config.hidden_size, num_labels)
                
                def forward(self, input_ids, attention_mask):
                    outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
                    pooled_output = outputs.last_hidden_state[:, 0, :]  # [CLS] token
                    pooled_output = self.dropout(pooled_output)
                    logits = self.classifier(pooled_output)
                    return logits
            
            model = SentimentClassifier(base_model, num_labels)
        
        model.to(device)
        return tokenizer, model
    
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        return None, None

# # Load VisoBERT and PhoBERT
# print("Loading VisoBERT...")
# viso_tokenizer, viso_model = load_model("binhvq/visoBERT-base")

# print("Loading PhoBERT...")
# pho_tokenizer, pho_model = load_model("vinai/phobert-base")

# Cell 7: Prepare data for models
def prepare_data_for_model(texts, labels, tokenizer, max_length=128, batch_size=16):
    """
    Converts texts and labels into PyTorch dataset and dataloaders.
    """
    # Preprocess the texts
    preprocessed_texts = [preprocess_text(text) for text in texts]
    
    # Tokenize texts
    encodings = tokenizer(
        preprocessed_texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    # Create PyTorch dataset
    dataset = TensorDataset(
        encodings['input_ids'],
        encodings['attention_mask'],
        torch.tensor(labels)
    )
    
    # Split into train/validation
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    
    # Create dataloaders
    train_dataloader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=batch_size
    )
    
    val_dataloader = DataLoader(
        val_dataset,
        sampler=SequentialSampler(val_dataset),
        batch_size=batch_size
    )
    
    return train_dataloader, val_dataloader

# Cell 8: Training and Evaluation Functions
def train_model(model, train_dataloader, val_dataloader, epochs=3):
    """
    Trains the model and evaluates after each epoch.
    """
    # Set up optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=2e-5)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    
    # Training loop
    best_val_accuracy = 0
    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
        print(f'Epoch {epoch+1}/{epochs}')
        
        for batch in tqdm(train_dataloader):
            batch = tuple(b.to(device) for b in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1]
            }
            labels = batch[2]
            
            optimizer.zero_grad()
            
            # Forward pass
            if isinstance(model, torch.nn.Module) and not hasattr(model, 'config'):
                logits = model(**inputs)
                loss_fn = torch.nn.CrossEntropyLoss()
                loss = loss_fn(logits, labels)
            else:
                outputs = model(**inputs, labels=labels)
                loss = outputs.loss
            
            # Backward pass
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            
            train_loss += loss.item()
        
        avg_train_loss = train_loss / len(train_dataloader)
        print(f"Average training loss: {avg_train_loss}")
        
        # Validation
        model.eval()
        val_accuracy, val_report = evaluate_model(model, val_dataloader)
        print(f"Validation Accuracy: {val_accuracy}")
        print(f"Classification Report:\n{val_report}")
        
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            # You can save the model here if needed
    
    return model

def evaluate_model(model, dataloader):
    """
    Evaluates the model and returns accuracy and classification report.
    """
    model.eval()
    predictions = []
    true_labels = []
    
    for batch in dataloader:
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1]
        }
        labels = batch[2]
        
        with torch.no_grad():
            if isinstance(model, torch.nn.Module) and not hasattr(model, 'config'):
                logits = model(**inputs)
            else:
                outputs = model(**inputs)
                logits = outputs.logits
        
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        pred_labels = labels.cpu().numpy()
        
        predictions.extend(preds)
        true_labels.extend(pred_labels)
    
    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions)
    
    return accuracy, report


# print(pho_report)

# Cell 10: Inference Example
def predict_sentiment(text, tokenizer, model, preprocess=True):
    """
    Predicts the sentiment of a single text.
    Returns the predicted class and confidence.
    """
    if preprocess:
        text = preprocess_text(text)
    
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        if isinstance(model, torch.nn.Module) and not hasattr(model, 'config'):
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
        else:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
    
    probabilities = torch.nn.functional.softmax(logits, dim=1).cpu().numpy()[0]
    predicted_class = np.argmax(probabilities)
    confidence = probabilities[predicted_class]
    
    # Map class to sentiment
    sentiment_map = {0: 'Tiêu cực', 1: 'Trung tính', 2: 'Tích cực'}
    predicted_sentiment = sentiment_map.get(predicted_class, 'Unknown')
    
    return predicted_sentiment, confidence, probabilities



Successfully loaded 1942 Vietnamese stopwords.
Using device: cuda:1


In [26]:
# Cell: Additional Imports and Configuration


from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os # Already imported, but good to remember it's used

# --- Choose your model and define parameters ---
# You can cycle through these by uncommenting the one you want to use
MODEL_NAME = "bkai-foundation-models/vietnamese-bi-encoder"
# MODEL_NAME = "vinai/phobert-base-v2"
# MODEL_NAME = "microsoft/Multilingual-MiniLM-L12-H384"

# Ensure the NUM_LABELS matches your dataset's label scheme (0, 1, 2 for negative, neutral, positive)
NUM_LABELS = df['label'].nunique() # Dynamically get number of unique labels
if NUM_LABELS != 3: # Or your expected number of classes
    print(f"WARNING: Expected 3 labels, but found {NUM_LABELS}. Check your 'label' column.")
    # You might want to assert this or handle it if NUM_LABELS is not what you expect for SA.
    # For now, we'll proceed with the detected number of labels.

# --- Preprocessing & Tokenization ---
# MAX_SEQ_LENGTH can be adjusted based on your data and model
MAX_SEQ_LENGTH = 128

# --- Training Configuration ---
# Constructing output directory name based on the chosen model
model_identifier = MODEL_NAME.split('/')[-1] # e.g., 'vietnamese-bi-encoder'
OUTPUT_DIR = "FreeTxt-Flask/results_sa_finetune_{model_identifier}"
LOGGING_DIR = "FreeTxt-Flask/logs_sa_finetune_{model_identifier}"

NUM_TRAIN_EPOCHS = 5
LEARNING_RATE = 2e-5
PER_DEVICE_TRAIN_BATCH_SIZE = 16 # Adjust based on GPU memory
PER_DEVICE_EVAL_BATCH_SIZE = 16  # Adjust based on GPU memory
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1
EVALUATION_STRATEGY = "epoch"
SAVE_STRATEGY = "epoch"
LOAD_BEST_MODEL_AT_END = True
METRIC_FOR_BEST_MODEL = "f1" # or "accuracy"
SEED = 42

# --- Environment ---
# Ensure device is set (it's likely already set in your notebook, e.g., to cuda:1 or cuda:3)
# If not, uncomment and adjust:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Fine-tuning will use device: {device}") # Print the device being used from earlier cell
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"Selected model for fine-tuning: {MODEL_NAME}")
print(f"Number of labels to be used: {NUM_LABELS}")
print(f"Output directory: {OUTPUT_DIR}")

Fine-tuning will use device: cuda:1
Selected model for fine-tuning: bkai-foundation-models/vietnamese-bi-encoder
Number of labels to be used: 3
Output directory: FreeTxt-Flask/results_sa_finetune_{model_identifier}


In [20]:
texts_for_finetuning = df['content'].tolist()
labels_for_finetuning = df['label'].astype(int).tolist()

In [22]:
print(f"Number of texts for fine-tuning: {len(texts_for_finetuning)}")
print(f"Number of labels for fine-tuning: {len(labels_for_finetuning)}")
print(f"Sample text: {texts_for_finetuning[0]}")
print(f"Sample label: {labels_for_finetuning[0]}")

Number of texts for fine-tuning: 47635
Number of labels for fine-tuning: 47635
Sample text: cô cho thực hành rất nhiều .
Sample label: 2


In [25]:
print(f"\nLoading base model {MODEL_NAME} for fine-tuning...")
bkai_tokenizer, bkai_model = load_model("bkai-foundation-models/vietnamese-bi-encoder")


Loading base model bkai-foundation-models/vietnamese-bi-encoder for fine-tuning...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at bkai-foundation-models/vietnamese-bi-encoder and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
print("\nPreparing training and validation dataloaders using your 'prepare_data_for_model'...")
train_dataloader_ft, val_dataloader_ft = prepare_data_for_model(
    texts_for_finetuning,
    labels_for_finetuning,
    bkai_tokenizer,
    max_length=MAX_SEQ_LENGTH,
    batch_size=PER_DEVICE_TRAIN_BATCH_SIZE
)

print(f"Number of batches in train_dataloader_ft: {len(train_dataloader_ft)}")
print(f"Number of batches in val_dataloader_ft: {len(val_dataloader_ft)}")


Preparing training and validation dataloaders using your 'prepare_data_for_model'...
Number of batches in train_dataloader_ft: 2382
Number of batches in val_dataloader_ft: 596


In [28]:
NUM_TRAIN_EPOCHS = 5 
print(f"\nStarting fine-tuning of {MODEL_NAME} for {NUM_TRAIN_EPOCHS} epochs...")
    # Your train_model function returns the trained model
fine_tuned_model_manual = train_model(
        bkai_model, # Pass the newly loaded model instance
        train_dataloader_ft,
        val_dataloader_ft,
        epochs=NUM_TRAIN_EPOCHS
    )
print("Fine-tuning finished.")


Starting fine-tuning of bkai-foundation-models/vietnamese-bi-encoder for 5 epochs...
Epoch 1/5


100%|██████████| 2382/2382 [10:30<00:00,  3.78it/s]


Average training loss: 0.5758565687687831
Validation Accuracy: 0.7836674713970819
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.72      0.74      2849
           1       0.48      0.14      0.22      1082
           2       0.81      0.94      0.87      5596

    accuracy                           0.78      9527
   macro avg       0.69      0.60      0.61      9527
weighted avg       0.76      0.78      0.76      9527

Epoch 2/5


100%|██████████| 2382/2382 [10:58<00:00,  3.62it/s]


Average training loss: 0.4889420593011965
Validation Accuracy: 0.7974178650152199
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.78      0.77      2849
           1       0.44      0.24      0.31      1082
           2       0.85      0.92      0.88      5596

    accuracy                           0.80      9527
   macro avg       0.68      0.64      0.65      9527
weighted avg       0.78      0.80      0.78      9527

Epoch 3/5


 74%|███████▍  | 1766/2382 [09:18<03:14,  3.16it/s]


KeyboardInterrupt: 

In [29]:
bkai_accuracy, bkai_report = evaluate_model(fine_tuned_model_manual, val_dataloader_ft)

NameError: name 'fine_tuned_model_manual' is not defined

In [25]:
# Evaluate BKAI
print("Evaluating BKAI Vietnamese Bi-Encoder...")
bkai_train_dataloader, bkai_val_dataloader = prepare_data_for_model(texts, labels, bkai_tokenizer)
bkai_model = train_model(bkai_model, bkai_train_dataloader, bkai_val_dataloader)
bkai_accuracy, bkai_report = evaluate_model(bkai_model, bkai_val_dataloader)

Evaluating BKAI Vietnamese Bi-Encoder...
Epoch 1/3


100%|██████████| 1573/1573 [03:58<00:00,  6.61it/s]


Average training loss: 0.5936510810902068
Validation Accuracy: 0.7786077558804831
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.63      0.69      1383
           1       0.39      0.23      0.29       904
           2       0.83      0.95      0.89      4005

    accuracy                           0.78      6292
   macro avg       0.66      0.61      0.62      6292
weighted avg       0.75      0.78      0.76      6292

Epoch 2/3


100%|██████████| 1573/1573 [03:59<00:00,  6.58it/s]


Average training loss: 0.5048919593073423
Validation Accuracy: 0.7905276541640178
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.74      0.73      1383
           1       0.40      0.28      0.33       904
           2       0.87      0.92      0.90      4005

    accuracy                           0.79      6292
   macro avg       0.67      0.65      0.65      6292
weighted avg       0.77      0.79      0.78      6292

Epoch 3/3


100%|██████████| 1573/1573 [03:57<00:00,  6.63it/s]


Average training loss: 0.44505883378165584
Validation Accuracy: 0.7863954227590592
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.71      0.73      1383
           1       0.39      0.29      0.33       904
           2       0.86      0.93      0.89      4005

    accuracy                           0.79      6292
   macro avg       0.66      0.64      0.65      6292
weighted avg       0.77      0.79      0.78      6292



In [26]:
sample_text = "Món ăn này thực sự rất ngon, tôi rất thích nó!"
bkai_sentiment, bkai_conf, bkai_probs = predict_sentiment(sample_text, bkai_tokenizer, bkai_model)
# pho_sentiment, pho_conf, pho_probs = predict_sentiment(sample_text, pho_tokenizer, pho_model)

print(f"Sample text: '{sample_text}'")
print(f"BKAI prediction: {bkai_sentiment} (confidence: {bkai_conf:.4f})")

Sample text: 'Món ăn này thực sự rất ngon, tôi rất thích nó!'
BKAI prediction: Tích cực (confidence: 0.9734)
