In [9]:
import torch
import numpy as np
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
)
from torch.utils.data import DataLoader, TensorDataset, SequentialSampler
from sklearn.metrics import accuracy_score, classification_report
import re
import os
from tqdm import tqdm
import unicodedata
import string
from underthesea import word_tokenize

# Using unique names for globals in this cell to avoid conflicts
PUNCS_CELL_EVAL = '''!→()-[]{};:'"\\,<>?@#$%^&*_~'''
vi_stopwords_global_cell_eval = set()
device_global_cell_eval = torch.device("cpu") # Will be updated

# --- Helper Functions (Adapted from finetuneSA.py for evaluation in this notebook cell) ---

def preprocess_text_cell_eval(text):
    """
    Preprocesses text for sentiment analysis for this evaluation cell.
    Uses global vi_stopwords_global_cell_eval and PUNCS_CELL_EVAL.
    """
    global vi_stopwords_global_cell_eval, PUNCS_CELL_EVAL
    text = str(text)
    # Regex from finetuneSA.py: re.sub(r"http\\S+|@\\S+|#\\S+", "", text)
    # In a standard Python string, \\S becomes \S. This is equivalent to r"http\S+|@\S+|#\S+"
    text = re.sub(r"http\S+|@\S+|#\S+", "", text) 
    text = re.sub(f"[{re.escape(''.join(PUNCS_CELL_EVAL))}]", "", text.lower())
    text = " ".join(word for word in text.split() if word not in vi_stopwords_global_cell_eval)
    return text

# It's recommended to add these imports to the top of your notebook cell if not already present:
# import string
# import unicodedata
# from underthesea import word_tokenize
# pandas (pd) and re are already imported in the cell.

# --- Data structures and helper functions from Vi_preprocessing.ipynb (adapted for this cell) ---
# These are prefixed with _v2 or defined locally to avoid conflicts if similar names exist globally.

bang_nguyen_am_v2 = [['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
                  ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
                  ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
                  ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
                  ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
                  ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
                  ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
                  ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
                  ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
                  ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
                  ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
                  ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y']]
bang_ky_tu_dau_v2 = ['', 'f', 's', 'r', 'x', 'j']

nguyen_am_to_ids_v2 = {}
for i in range(len(bang_nguyen_am_v2)):
    for j in range(len(bang_nguyen_am_v2[i]) - 1):
        nguyen_am_to_ids_v2[bang_nguyen_am_v2[i][j]] = (i, j)

def loaddicchar_v2():
    dic = {}
    char1252 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'.split('|')
    charutf8 = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split('|')
    for i in range(len(char1252)):
        dic[char1252[i]] = charutf8[i]
    return dic
dicchar_v2 = loaddicchar_v2()

# --- Load Teencode Data ---
_teencode_file_path_v2 = './teencode.txt' 
try:
    teencode_df_v2 = pd.read_csv(_teencode_file_path_v2, names=['teencode', 'map'], sep='\t', header=None)
    teencode_map_default_v2 = pd.Series(teencode_df_v2['map'].values, index=teencode_df_v2['teencode']).to_dict()
    print(f"Successfully loaded teencode map from {_teencode_file_path_v2}")
except FileNotFoundError:
    print(f"Warning: Teencode file not found at {_teencode_file_path_v2}. Teencode replacement will be limited for preprocess_text_cell_eval_v2.")
    teencode_map_default_v2 = {}
except Exception as e:
    print(f"Warning: Error loading teencode file '{_teencode_file_path_v2}': {e}. Teencode replacement will be limited for preprocess_text_cell_eval_v2.")
    teencode_map_default_v2 = {}

# --- Load Stopwords Data ---
_stopwords_file_path_v2 = '/data/elo/khanglg/FreeTxt-Flask/vietnamese-stopwords.txt'
try:
    with open(_stopwords_file_path_v2, 'r', encoding='utf-8') as f:
        stopwords_list_default_v2 = [line.strip() for line in f if line.strip()]
    print(f"Successfully loaded stopwords list from {_stopwords_file_path_v2}")
except FileNotFoundError:
    print(f"Warning: Stopwords file not found at {_stopwords_file_path_v2}. Stopword removal will be limited for preprocess_text_cell_eval_v2.")
    stopwords_list_default_v2 = []
except Exception as e:
    print(f"Warning: Error loading stopwords file '{_stopwords_file_path_v2}': {e}. Stopword removal will be limited for preprocess_text_cell_eval_v2.")
    stopwords_list_default_v2 = []

emoji_pattern_v2 = re.compile("["
    u"\U0001F600-\U0001F64F"  # Emoticons
    u"\U0001F300-\U0001F5FF"  # Symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # Transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # Flags
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"
    "]+", flags=re.UNICODE)

def convert_unicode_legacy_v2(txt):
    return re.sub(
        r'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ',
        lambda x: dicchar_v2[x.group()], txt)

def text_unicode_normalize_v2(text):
    # Requires: import unicodedata
    try:
        return unicodedata.normalize('NFC', text)
    except NameError:
        print("Warning: unicodedata module not imported. Unicode normalization (NFC) skipped.")
        return text

# def convert_unicode_legacy(txt): # Renamed to avoid clash if user defines convert_unicode
#     return re.sub(
#         r'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ',
#         lambda x: dicchar[x.group()], txt)

# def text_unicode_normalize(text): # From user's snippet
#     return unicodedata.normalize('NFC', text)


def is_valid_vietnam_word_v2(word):
    chars = list(word)
    nguyen_am_index = -1
    for index, char in enumerate(chars):
        x, y = nguyen_am_to_ids_v2.get(char, (-1, -1))
        if x != -1:
            if nguyen_am_index == -1:
                nguyen_am_index = index
            else:
                if index - nguyen_am_index != 1:
                    return False
                nguyen_am_index = index
    return True

def chuan_hoa_dau_tu_tieng_viet_v2(word):
    if not is_valid_vietnam_word_v2(word):
        return word
    chars = list(word)
    dau_cau = 0
    nguyen_am_index = []
    qu_or_gi = False
    for index, char in enumerate(chars):
        x, y = nguyen_am_to_ids_v2.get(char, (-1, -1))
        if x == -1: continue
        if x == 9: # u
            if index > 0 and chars[index - 1].lower() == 'q':
                chars[index] = 'u'; qu_or_gi = True
        elif x == 5: # i
            if index > 0 and chars[index - 1].lower() == 'g':
                chars[index] = 'i'; qu_or_gi = True
        if y != 0:
            dau_cau = y; chars[index] = bang_nguyen_am_v2[x][0]
        if not qu_or_gi or index != 1:
            nguyen_am_index.append(index)

    if not nguyen_am_index: return "".join(chars)

    idx_to_mark = nguyen_am_index[0]
    if len(nguyen_am_index) >= 2:
        priority_vowel_found = False
        for idx_candidate in nguyen_am_index:
            x_vowel, _ = nguyen_am_to_ids_v2.get(chars[idx_candidate], (-1,-1))
            if x_vowel in [4, 7, 8]: # ê, ô, ơ
                idx_to_mark = idx_candidate
                priority_vowel_found = True
                break
        
        if not priority_vowel_found:
            if nguyen_am_index[-1] == len(chars) -1:
                x_last_vowel, _ = nguyen_am_to_ids_v2.get(chars[nguyen_am_index[-1]], (-1,-1))
                if x_last_vowel in [5, 9, 10, 11]: # i, u, ư, y
                     idx_to_mark = nguyen_am_index[-2] if len(nguyen_am_index) > 1 else nguyen_am_index[-1]
                else: 
                    idx_to_mark = nguyen_am_index[0]
            else: 
                if len(nguyen_am_index) == 3: 
                    idx_to_mark = nguyen_am_index[1]
                elif len(nguyen_am_index) == 2: 
                    idx_to_mark = nguyen_am_index[1]

    x_target_vowel, _ = nguyen_am_to_ids_v2.get(chars[idx_to_mark], (-1,-1))
    if x_target_vowel != -1 and dau_cau != 0:
        chars[idx_to_mark] = bang_nguyen_am_v2[x_target_vowel][dau_cau]
    return "".join(chars)

def chuan_hoa_dau_cau_tieng_viet_v2(sentence):
    words = sentence.split()
    for index, word in enumerate(words):
        match = re.match(r'(^[\W_]*)([\wÀ-Ỹà-ỹ._]*[\wÀ-Ỹà-ỹ]+)([\W_]*$)', word)
        if match:
            prefix, core_word, suffix = match.groups()
            normalized_core_word = chuan_hoa_dau_tu_tieng_viet_v2(core_word)
            words[index] = prefix + normalized_core_word + suffix
        else:
            words[index] = chuan_hoa_dau_tu_tieng_viet_v2(word) 
    return " ".join(words)

# --- Main Preprocessing Function V2 (Adapted from Vi_preprocessing.ipynb) ---
def preprocess_text_cell_eval_v2(
    text,
    custom_teencode_map=None,
    custom_stopwords_list=None,
    use_teencode=True,
    use_stopwords=False,
    remove_all_punctuation=False
    ):
    """
    Comprehensive Vietnamese text preprocessing using underthesea, adapted for notebook evaluation.
    Requires `string`, `unicodedata`, `underthesea` modules to be imported.
    """
    if not isinstance(text, str):
        text = str(text)

    current_teencode_map = custom_teencode_map if custom_teencode_map is not None else teencode_map_default_v2
    current_stopwords_list = custom_stopwords_list if custom_stopwords_list is not None else stopwords_list_default_v2

    # 1. Lowercase
    processed_text = text.lower()

    # 2. Remove URLs, mentions, hashtags
    processed_text = re.sub(r"http\S+|www\S+|@\S+|#\S+", "", processed_text)

    # 3. Legacy Unicode conversion
    processed_text = convert_unicode_legacy_v2(processed_text)

    # 4. Standard Unicode Normalization (NFC)
    processed_text = text_unicode_normalize_v2(processed_text)

    # 5. Remove Emojis
    processed_text = re.sub(emoji_pattern_v2, " ", processed_text)

    # 6. Reduce repeated alphabetic characters
    processed_text = re.sub(r'([a-zàáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ])\1+', r'\1', processed_text)

    # 7. Reduce repeated special characters
    processed_text = re.sub(r'([^a-z0-9àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ\s])\1+', r'\1', processed_text)
    
    try:
        # Requires: import string
        _local_string_punctuation = string.punctuation
        _local_string_whitespace = string.whitespace
    except NameError:
        print("Warning: string module not imported. Using a basic punctuation set for steps 8, 9, 11, 13.")
        # Fallback punctuation similar to PUNCS_CELL_EVAL or finetuneSA.py's _punctuation_chars
        _local_string_punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        _local_string_whitespace = " \t\n\r\f\v"


    # 8. Normalize punctuation spacing
    escaped_punctuation = re.escape(_local_string_punctuation)
    processed_text = re.sub(r"(\w)\s*([" + escaped_punctuation + r"])\s*(\w)", r"\1 \2 \3", processed_text)
    processed_text = re.sub(r"(\w)\s*([" + escaped_punctuation + r"])", r"\1 \2", processed_text)
    processed_text = re.sub(r"([" + escaped_punctuation + r"])\s*(\w)", r"\1 \2", processed_text)

    # 9. Reduce repeated punctuation characters
    processed_text = re.sub(r"([" + escaped_punctuation + r"])\1+", r"\1", processed_text)

    # 10. Vietnamese tone mark normalization
    processed_text = chuan_hoa_dau_cau_tieng_viet_v2(processed_text)

    # 11. Remove all punctuation (optional)
    if remove_all_punctuation:
        translator = str.maketrans('', '', _local_string_punctuation)
        processed_text = processed_text.translate(translator)

    # 12. Final whitespace cleanup
    processed_text = re.sub(r'\s+', ' ', processed_text).strip()
    
    # 13. Strip leading/trailing punctuation or space robustly
    if not remove_all_punctuation and processed_text:
        strip_chars = _local_string_punctuation + _local_string_whitespace
        while processed_text and processed_text[-1] in strip_chars:
            processed_text = processed_text[:-1]
        while processed_text and processed_text[0] in strip_chars:
            processed_text = processed_text[1:]
    
    if not processed_text:
        return ""

    # 14. Tokenization using underthesea
    try:
        # Requires: from underthesea import word_tokenize
        tokens = word_tokenize(processed_text, format="list") 
    except NameError:
        print("Warning: underthesea.word_tokenize not imported. Falling back to simple whitespace split for tokenization.")
        tokens = processed_text.split()


    # 15. Teencode Replacement (on tokens)
    if use_teencode and current_teencode_map:
        new_tokens = []
        for token in tokens:
            replacement = current_teencode_map.get(token, token)
            new_tokens.append(replacement)
        
        if any(" " in t for t in new_tokens):
            temp_token_string = " ".join(new_tokens)
            try:
                # Requires: from underthesea import word_tokenize
                tokens = word_tokenize(temp_token_string, format="list")
            except NameError:
                # Fallback if word_tokenize is not available after teencode
                tokens = temp_token_string.split()

        else:
            tokens = new_tokens

    # 16. Stopword Removal (on tokens)
    if use_stopwords and current_stopwords_list:
        tokens = [token for token in tokens if token not in current_stopwords_list and token.strip()]

    # 17. Join tokens to form the final processed string
    return " ".join(tokens)

# Example usage (optional, for testing in the notebook):
# test_text_v2 = "Chàoooo bạn, hôm nay trời đẹp quá!!! :))) #sunnyday @friend http://example.com"
# processed_v2 = preprocess_text_cell_eval_v2(test_text_v2, use_stopwords=True, use_teencode=True)
# print(f"Original: {test_text_v2}")
# print(f"Processed V2: {processed_v2}")

# test_text_v2_punctuation = "công ty abc .,. xin chào ! ! !"
# processed_v2_punc = preprocess_text_cell_eval_v2(test_text_v2_punctuation, remove_all_punctuation=True)
# print(f"Original: {test_text_v2_punctuation}")
# print(f"Processed V2 (remove all punc): {processed_v2_punc}")

# processed_v2_punc_keep = preprocess_text_cell_eval_v2(test_text_v2_punctuation, remove_all_punctuation=False)
# print(f"Original: {test_text_v2_punctuation}")
# print(f"Processed V2 (keep punc): {processed_v2_punc_keep}")


def load_model_cell_eval(model_path, num_labels):
    """
    Loads a fine-tuned model and tokenizer for evaluation in this cell.
    Uses global device_global_cell_eval.
    Assumes model was saved using `save_pretrained` and is loadable
    via `AutoModelForSequenceClassification.from_pretrained`.
    """
    global device_global_cell_eval
    try:
        print(f"Loading tokenizer from: {model_path}")
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        print(f"Loading model from: {model_path}")
        model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=num_labels)
        model.to(device_global_cell_eval)
        return tokenizer, model
    except Exception as e:
        print(f"Error loading model {model_path} using AutoModelForSequenceClassification.from_pretrained: {e}")
        print("Please ensure the model was saved correctly using `save_pretrained` and is compatible.")
        return None, None

def prepare_data_for_cell_eval(texts, labels, tokenizer, max_length, batch_size, seed_for_split):
    """
    Converts texts and labels into a PyTorch DataLoader for the validation split.
    Uses preprocess_text_cell_eval.
    Splits data 80/20 for train/validation using the provided seed to ensure consistency.
    """
    print("Preprocessing texts for dataloaders...")
    #preprocessed_texts = [preprocess_text_cell_eval(text) for text in tqdm(texts, desc="Preprocessing")]
    preprocessed_texts = [preprocess_text_cell_eval_v2(text) for text in tqdm(texts, desc="Preprocessing V2")]

    
    print("Tokenizing texts...")
    encodings = tokenizer(
        preprocessed_texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    dataset = TensorDataset(
        encodings['input_ids'],
        encodings['attention_mask'],
        torch.tensor(labels, dtype=torch.long)
    )
    
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    
    if val_size == 0 and train_size > 0 : # Ensure val_size is at least 1 if dataset is very small
        if train_size > 1:
            train_size -=1
            val_size +=1
        else: # Cannot split if only 1 sample
            print("Warning: Dataset too small to create a validation split. Using entire dataset for validation.")
            val_dataset = dataset
    elif val_size == 0 and train_size == 0:
        print("Error: Dataset is empty after processing.")
        return None
    else:
         # Ensure reproducibility of split using the provided seed
        train_dataset, val_dataset = torch.utils.data.random_split(
            dataset, 
            [train_size, val_size],
            generator=torch.Generator().manual_seed(seed_for_split)
        )
    
    print(f"Using validation split for evaluation. Validation dataset size: {len(val_dataset)}")

    val_dataloader = DataLoader(
        val_dataset,
        sampler=SequentialSampler(val_dataset),
        batch_size=batch_size
    )
    
    return val_dataloader

def evaluate_model_cell_eval(model, dataloader):
    """
    Evaluates the model and returns accuracy and classification report.
    Uses global device_global_cell_eval.
    """
    global device_global_cell_eval
    model.eval()
    predictions_list = []
    true_labels_list = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            batch_input_ids = batch[0].to(device_global_cell_eval)
            batch_attention_mask = batch[1].to(device_global_cell_eval)
            batch_labels = batch[2].to(device_global_cell_eval)

            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
            logits = outputs.logits 
            
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            labels_np = batch_labels.cpu().numpy()
            
            predictions_list.extend(preds)
            true_labels_list.extend(labels_np)
            
    if not true_labels_list: # Handle empty dataloader or no predictions
        print("Warning: No predictions made, possibly due to an empty dataloader.")
        return 0.0, "No predictions to report."

    accuracy = accuracy_score(true_labels_list, predictions_list)
    try:
        unique_labels_in_data = sorted(list(set(true_labels_list)))
        
        # Determine target names based on unique labels present in the data and model config
        if model.config.num_labels == 3 and all(l in [0,1,2] for l in unique_labels_in_data) :
             # Standard 3-class sentiment
            target_names_map = {0: 'Tiêu cực', 1: 'Trung tính', 2: 'Tích cực'}
            current_target_names = [target_names_map[l] for l in unique_labels_in_data if l in target_names_map]
            # If not all labels [0,1,2] are in unique_labels_in_data, classification_report might still need full list
            # For safety, provide all expected names if num_labels is 3.
            if len(current_target_names) < 3 and len(unique_labels_in_data) <3 : # If only a subset of labels appeared
                 report_target_names = ['Tiêu cực', 'Trung tính', 'Tích cực'] # Full list for report
            else:
                 report_target_names = current_target_names

        else: # Generic case
            report_target_names = [f"class_{i}" for i in unique_labels_in_data]
            if not report_target_names : # Fallback if unique_labels_in_data is empty (should not happen if true_labels_list is not empty)
                 report_target_names = [f"class_{i}" for i in range(model.config.num_labels)]


        report = classification_report(true_labels_list, predictions_list, labels=unique_labels_in_data, target_names=report_target_names, zero_division=0)
    except ValueError as e: 
        print(f"Warning generating classification report: {e}. Using default report (no target names).")
        report = classification_report(true_labels_list, predictions_list, zero_division=0)

    return accuracy, report

# --- Configuration for Evaluation ---
DATASET_PATH_EVAL = "sentiment_analysis_dataset.csv"
STOPWORDS_PATH_EVAL = "vietnamese-stopwords.txt"
OUTPUT_DIR_BASE_EVAL = "./fine_tuned_sa_manual_models"
# MODEL_IDENTIFIERS_EVAL = ["./fine_tuned_sa_manual_models/vietnamese-bi-encoder", 
#                           "./fine_tuned_sa_manual_models/visobert", 
#                           "./fine_tuned_sa_manual_models/phobert-base-v2",
#                           "./fine_tuned_sa_manual_models/Multilingual-MiniLM-L12-H384"]
MODEL_IDENTIFIERS_EVAL = ["./fine_tuned_sa_manual_newPreprocess_models/vietnamese-bi-encoder", 
                          "./fine_tuned_sa_manual_newPreprocess_models/visobert", 
                          ]

NUM_LABELS_EVAL = 3
MAX_SEQ_LENGTH_EVAL = 128
BATCH_SIZE_EVAL = 24 
# Check if cuda:7 is available, otherwise fallback
if torch.cuda.is_available():
    try:
        if torch.cuda.device_count() > 1: # Check if device 7 exists
             torch.cuda.set_device(1) # Try to set to check
             DEVICE_STR_EVAL = "cuda:1"
        else: # cuda:7 not available, use cuda:0 or other available
             DEVICE_STR_EVAL = "cuda" 
    except RuntimeError: # If cuda:7 cannot be set
        DEVICE_STR_EVAL = "cuda" # Fallback to default cuda
else:
    DEVICE_STR_EVAL = "cpu"

SEED_EVAL = 42 # Consistent with finetuneSA.py for data splitting

# --- Setup ---
device_global_cell_eval = torch.device(DEVICE_STR_EVAL)
torch.manual_seed(SEED_EVAL)
np.random.seed(SEED_EVAL)
if DEVICE_STR_EVAL.startswith("cuda"):
    torch.cuda.manual_seed_all(SEED_EVAL)

print(f"Using device: {device_global_cell_eval}")

# Load stopwords for this cell
try:
    with open(STOPWORDS_PATH_EVAL, 'r', encoding='utf-8') as f:
        vi_stopwords_global_cell_eval = set([line.strip() for line in f if line.strip()])
    print(f"Successfully loaded {len(vi_stopwords_global_cell_eval)} Vietnamese stopwords from {STOPWORDS_PATH_EVAL}.")
except FileNotFoundError:
    print(f"Warning: Vietnamese stopwords file not found at {STOPWORDS_PATH_EVAL}. Proceeding without custom stopwords.")
    vi_stopwords_global_cell_eval = set()

# Load dataset
print(f"Loading dataset from: {DATASET_PATH_EVAL}")
eval_texts_list = []
eval_labels_list = []
try:
    df_eval_cell_full = pd.read_csv(DATASET_PATH_EVAL)
    if 'content' not in df_eval_cell_full.columns or 'label' not in df_eval_cell_full.columns:
        raise ValueError("Dataset CSV must contain 'content' and 'label' columns.")
    
    df_eval_cell_full['label'] = pd.to_numeric(df_eval_cell_full['label'], errors='coerce')
    df_eval_cell_full.dropna(subset=['content', 'label'], inplace=True) # Drop rows where content or label is NaN
    df_eval_cell_full['label'] = df_eval_cell_full['label'].astype(int) # Convert valid labels to int
    
    # Determine the split for evaluation to match the training validation set logic
    # The training script (finetuneSA.py) splits the full dataset 80/20 after creating a TensorDataset.
    # We replicate this split logic here to get the same validation set.
    # torch.manual_seed(SEED_EVAL) has already been called earlier in this cell.

    num_total_samples = len(df_eval_cell_full)
    
    if num_total_samples > 0:
        # Create a dummy dataset of indices. The split is based on these indices.
        all_indices_tensor = torch.arange(num_total_samples)
        full_indices_dataset = TensorDataset(all_indices_tensor)

        # Calculate split sizes (80% train, 20% validation)
        train_size = int(0.8 * num_total_samples)
        val_size = num_total_samples - train_size

        # Perform the split to get the indices for the validation set
        # The first returned dataset from random_split would be the training set indices,
        # the second is the validation set indices. We only need the latter.
        if train_size + val_size == num_total_samples: # Ensure split sizes sum correctly
            _, val_subset_indices_dataset = torch.utils.data.random_split(full_indices_dataset, [train_size, val_size])
            
            # val_subset_indices_dataset.indices contains the indices from the original full_indices_dataset
            # that belong to the validation set. These are the row indices for df_eval_cell_full.
            validation_indices = val_subset_indices_dataset.indices
            
            # Select the 20% validation split from the original DataFrame using these indices
            df_eval_cell = df_eval_cell_full.iloc[validation_indices].reset_index(drop=True)
            print(f"Deterministically selected the 20% validation split ({len(df_eval_cell)} samples out of {num_total_samples}) "
                  f"consistent with training procedure (using SEED: {SEED_EVAL}).")
        else:
            # This case should ideally not happen if num_total_samples > 0
            print(f"Warning: Could not perform split correctly for {num_total_samples} samples. Using full dataset for evaluation.")
            df_eval_cell = df_eval_cell_full 
    else:
        print("Dataset is empty. No samples to select for evaluation.")
        df_eval_cell = df_eval_cell_full # df_eval_cell will be an empty DataFrame

    # The original print statement, now conditional or adjusted
    if not df_eval_cell.empty:
        print(f"Using {len(df_eval_cell)} rows for evaluation, corresponding to the validation set.")
    elif num_total_samples > 0 and df_eval_cell.empty : # If split failed to select anything from non-empty
        print(f"Warning: Validation set selection resulted in 0 samples from {num_total_samples} total.")
    # If num_total_samples was 0, the "Dataset is empty" message already printed.

    eval_texts_list = df_eval_cell['content'].tolist()
    eval_labels_list = df_eval_cell['label'].tolist()
    print(f"Loaded {len(eval_texts_list)} samples for evaluation after cleaning and sampling.")

except FileNotFoundError:
    print(f"ERROR: Dataset file not found at {DATASET_PATH_EVAL}. Cannot proceed.")
except ValueError as ve:
    print(f"ERROR: Value error in dataset: {ve}. Cannot proceed.")
except Exception as e:
    print(f"Error loading or processing dataset: {e}. Cannot proceed.")

Successfully loaded teencode map from ./teencode.txt
Successfully loaded stopwords list from /data/elo/khanglg/FreeTxt-Flask/vietnamese-stopwords.txt
Using device: cuda:1
Successfully loaded 1942 Vietnamese stopwords from vietnamese-stopwords.txt.
Loading dataset from: sentiment_analysis_dataset.csv
Deterministically selected the 20% validation split (9523 samples out of 47611) consistent with training procedure (using SEED: 42).
Using 9523 rows for evaluation, corresponding to the validation set.
Loaded 9523 samples for evaluation after cleaning and sampling.


In [12]:
## Evaluating using 20% of the dataset like what we have done in the finetuning code

# --- Evaluation Loop ---
if eval_texts_list and eval_labels_list:
    evaluation_results = {}
    for model_id_eval in MODEL_IDENTIFIERS_EVAL:
        print(f"\n--- Evaluating model: {model_id_eval} ---")
        # model_full_path = os.path.join(OUTPUT_DIR_BASE_EVAL, model_id_eval)

        # if not os.path.exists(model_full_path):
        #     print(f"Model path not found: {model_full_path}. Skipping.")
        #     continue

        tokenizer_eval, model_eval = load_model_cell_eval(model_id_eval, NUM_LABELS_EVAL)

        if model_eval and tokenizer_eval:
            # Prepare dataloader using the validation split from the original dataset
            val_dataloader_for_eval = prepare_data_for_cell_eval(
                eval_texts_list, eval_labels_list, tokenizer_eval, 
                MAX_SEQ_LENGTH_EVAL, BATCH_SIZE_EVAL, SEED_EVAL
            )
            
            if val_dataloader_for_eval:
                accuracy_val, report_val = evaluate_model_cell_eval(model_eval, val_dataloader_for_eval)
                evaluation_results[model_id_eval] = {"accuracy": accuracy_val, "report": report_val}
                
                print(f"\nResults for {model_id_eval}:")
                print(f"Validation Accuracy: {accuracy_val:.4f}")
                print("Validation Classification Report:")
                print(report_val)
            else:
                print(f"Failed to create dataloader for {model_id_eval}. Skipping evaluation for this model.")
        else:
            print(f"Failed to load model or tokenizer for {model_id_eval}. Skipping.")

    # --- Comparison ---
    print("\n--- Overall Comparison (based on Validation Accuracy) ---")
    if evaluation_results:
        for model_id_res, res_data in evaluation_results.items():
            print(f"Model: {model_id_res}, Accuracy: {res_data['accuracy']:.4f}")
        
        if evaluation_results: # Ensure not empty before calling max
            best_model_name = max(evaluation_results, key=lambda k: evaluation_results[k]['accuracy'])
            print(f"\nBest performing model: {best_model_name} (Accuracy: {evaluation_results[best_model_name]['accuracy']:.4f})")
    else:
        print("No models were successfully evaluated, or no results were recorded.")
else:
    print("Evaluation cannot proceed: Dataset is empty or failed to load.")



--- Evaluating model: ./fine_tuned_sa_manual_newPreprocess_models/vietnamese-bi-encoder ---
Loading tokenizer from: ./fine_tuned_sa_manual_newPreprocess_models/vietnamese-bi-encoder
Loading model from: ./fine_tuned_sa_manual_newPreprocess_models/vietnamese-bi-encoder
Preprocessing texts for dataloaders...


Preprocessing V2: 100%|██████████| 9523/9523 [00:09<00:00, 1002.44it/s]


Tokenizing texts...
Using validation split for evaluation. Validation dataset size: 1905


Evaluating: 100%|██████████| 80/80 [00:04<00:00, 18.26it/s]



Results for ./fine_tuned_sa_manual_newPreprocess_models/vietnamese-bi-encoder:
Validation Accuracy: 0.9144
Validation Classification Report:
              precision    recall  f1-score   support

    Tiêu cực       0.93      0.93      0.93       569
  Trung tính       0.70      0.62      0.66       229
    Tích cực       0.95      0.97      0.96      1107

    accuracy                           0.91      1905
   macro avg       0.86      0.84      0.85      1905
weighted avg       0.91      0.91      0.91      1905


--- Evaluating model: ./fine_tuned_sa_manual_newPreprocess_models/visobert ---
Loading tokenizer from: ./fine_tuned_sa_manual_newPreprocess_models/visobert
Loading model from: ./fine_tuned_sa_manual_newPreprocess_models/visobert
Preprocessing texts for dataloaders...


Preprocessing V2: 100%|██████████| 9523/9523 [00:09<00:00, 997.33it/s] 


Tokenizing texts...
Using validation split for evaluation. Validation dataset size: 1905


Evaluating: 100%|██████████| 80/80 [00:04<00:00, 18.62it/s]


Results for ./fine_tuned_sa_manual_newPreprocess_models/visobert:
Validation Accuracy: 0.9417
Validation Classification Report:
              precision    recall  f1-score   support

    Tiêu cực       0.95      0.97      0.96       569
  Trung tính       0.82      0.72      0.77       229
    Tích cực       0.96      0.97      0.97      1107

    accuracy                           0.94      1905
   macro avg       0.91      0.89      0.90      1905
weighted avg       0.94      0.94      0.94      1905


--- Overall Comparison (based on Validation Accuracy) ---
Model: ./fine_tuned_sa_manual_newPreprocess_models/vietnamese-bi-encoder, Accuracy: 0.9144
Model: ./fine_tuned_sa_manual_newPreprocess_models/visobert, Accuracy: 0.9417

Best performing model: ./fine_tuned_sa_manual_newPreprocess_models/visobert (Accuracy: 0.9417)





In [13]:

## Evaluating using the whole dataset
# --- Evaluation Loop ---
if eval_texts_list and eval_labels_list:
    evaluation_results = {}
    for model_id_eval in MODEL_IDENTIFIERS_EVAL:
        print(f"\n--- Evaluating model: {model_id_eval} ---")
        # model_full_path = os.path.join(OUTPUT_DIR_BASE_EVAL, model_id_eval)

        # if not os.path.exists(model_full_path):
        #     print(f"Model path not found: {model_full_path}. Skipping.")
        #     continue

        tokenizer_eval, model_eval = load_model_cell_eval(model_id_eval, NUM_LABELS_EVAL)

        if model_eval and tokenizer_eval:
            # Prepare dataloader using the validation split from the original dataset
            val_dataloader_for_eval = prepare_data_for_cell_eval(
                eval_texts_list, eval_labels_list, tokenizer_eval, 
                MAX_SEQ_LENGTH_EVAL, BATCH_SIZE_EVAL, SEED_EVAL
            )
            
            if val_dataloader_for_eval:
                accuracy_val, report_val = evaluate_model_cell_eval(model_eval, val_dataloader_for_eval)
                evaluation_results[model_id_eval] = {"accuracy": accuracy_val, "report": report_val}
                
                print(f"\nResults for {model_id_eval}:")
                print(f"Validation Accuracy: {accuracy_val:.4f}")
                print("Validation Classification Report:")
                print(report_val)
            else:
                print(f"Failed to create dataloader for {model_id_eval}. Skipping evaluation for this model.")
        else:
            print(f"Failed to load model or tokenizer for {model_id_eval}. Skipping.")

    # --- Comparison ---
    print("\n--- Overall Comparison (based on Validation Accuracy) ---")
    if evaluation_results:
        for model_id_res, res_data in evaluation_results.items():
            print(f"Model: {model_id_res}, Accuracy: {res_data['accuracy']:.4f}")
        
        if evaluation_results: # Ensure not empty before calling max
            best_model_name = max(evaluation_results, key=lambda k: evaluation_results[k]['accuracy'])
            print(f"\nBest performing model: {best_model_name} (Accuracy: {evaluation_results[best_model_name]['accuracy']:.4f})")
    else:
        print("No models were successfully evaluated, or no results were recorded.")
else:
    print("Evaluation cannot proceed: Dataset is empty or failed to load.")


Using device: cuda:7
Successfully loaded 1942 Vietnamese stopwords from vietnamese-stopwords.txt.
Loading dataset from: sentiment_analysis_dataset.csv
Loaded 47611 samples for evaluation after cleaning.

--- Evaluating model: ./fine_tuned_sa_manual_models/vietnamese-bi-encoder ---
Loading tokenizer from: ./fine_tuned_sa_manual_models/vietnamese-bi-encoder
Loading model from: ./fine_tuned_sa_manual_models/vietnamese-bi-encoder
Preprocessing texts for dataloaders...


Preprocessing: 100%|██████████| 47611/47611 [00:00<00:00, 107117.86it/s]


Tokenizing texts...
Using validation split for evaluation. Validation dataset size: 9523


Evaluating: 100%|██████████| 397/397 [00:31<00:00, 12.67it/s]



Results for ./fine_tuned_sa_manual_models/vietnamese-bi-encoder:
Validation Accuracy: 0.8756
Validation Classification Report:
              precision    recall  f1-score   support

    Tiêu cực       0.87      0.87      0.87      2831
  Trung tính       0.69      0.51      0.59      1103
    Tích cực       0.90      0.95      0.93      5589

    accuracy                           0.88      9523
   macro avg       0.82      0.78      0.79      9523
weighted avg       0.87      0.88      0.87      9523


--- Evaluating model: ./fine_tuned_sa_manual_models/visobert ---
Loading tokenizer from: ./fine_tuned_sa_manual_models/visobert
Loading model from: ./fine_tuned_sa_manual_models/visobert
Preprocessing texts for dataloaders...


Preprocessing: 100%|██████████| 47611/47611 [00:00<00:00, 85019.04it/s] 


Tokenizing texts...
Using validation split for evaluation. Validation dataset size: 9523


Evaluating: 100%|██████████| 397/397 [00:31<00:00, 12.53it/s]



Results for ./fine_tuned_sa_manual_models/visobert:
Validation Accuracy: 0.9097
Validation Classification Report:
              precision    recall  f1-score   support

    Tiêu cực       0.91      0.90      0.91      2831
  Trung tính       0.81      0.67      0.74      1103
    Tích cực       0.92      0.96      0.94      5589

    accuracy                           0.91      9523
   macro avg       0.88      0.84      0.86      9523
weighted avg       0.91      0.91      0.91      9523


--- Evaluating model: ./fine_tuned_sa_manual_models/phobert-base-v2 ---
Loading tokenizer from: ./fine_tuned_sa_manual_models/phobert-base-v2
Loading model from: ./fine_tuned_sa_manual_models/phobert-base-v2
Preprocessing texts for dataloaders...


Preprocessing: 100%|██████████| 47611/47611 [00:00<00:00, 103529.96it/s]


Tokenizing texts...
Using validation split for evaluation. Validation dataset size: 9523


Evaluating: 100%|██████████| 397/397 [00:31<00:00, 12.69it/s]



Results for ./fine_tuned_sa_manual_models/phobert-base-v2:
Validation Accuracy: 0.8620
Validation Classification Report:
              precision    recall  f1-score   support

    Tiêu cực       0.87      0.86      0.86      2831
  Trung tính       0.65      0.44      0.52      1103
    Tích cực       0.89      0.95      0.92      5589

    accuracy                           0.86      9523
   macro avg       0.80      0.75      0.77      9523
weighted avg       0.85      0.86      0.85      9523


--- Evaluating model: ./fine_tuned_sa_manual_models/Multilingual-MiniLM-L12-H384 ---
Loading tokenizer from: ./fine_tuned_sa_manual_models/Multilingual-MiniLM-L12-H384
Loading model from: ./fine_tuned_sa_manual_models/Multilingual-MiniLM-L12-H384
Preprocessing texts for dataloaders...


Preprocessing: 100%|██████████| 47611/47611 [00:00<00:00, 110152.98it/s]


Tokenizing texts...
Using validation split for evaluation. Validation dataset size: 9523


Evaluating: 100%|██████████| 397/397 [00:12<00:00, 32.60it/s]


Results for ./fine_tuned_sa_manual_models/Multilingual-MiniLM-L12-H384:
Validation Accuracy: 0.8140
Validation Classification Report:
              precision    recall  f1-score   support

    Tiêu cực       0.79      0.81      0.80      2831
  Trung tính       0.50      0.24      0.33      1103
    Tích cực       0.85      0.93      0.89      5589

    accuracy                           0.81      9523
   macro avg       0.71      0.66      0.67      9523
weighted avg       0.79      0.81      0.80      9523


--- Overall Comparison (based on Validation Accuracy) ---
Model: ./fine_tuned_sa_manual_models/vietnamese-bi-encoder, Accuracy: 0.8756
Model: ./fine_tuned_sa_manual_models/visobert, Accuracy: 0.9097
Model: ./fine_tuned_sa_manual_models/phobert-base-v2, Accuracy: 0.8620
Model: ./fine_tuned_sa_manual_models/Multilingual-MiniLM-L12-H384, Accuracy: 0.8140

Best performing model: ./fine_tuned_sa_manual_models/visobert (Accuracy: 0.9097)





In [3]:
# Define base model Hugging Face identifiers corresponding to the finetuned models
# These are the original, non-finetuned versions from Hugging Face.
BASE_MODEL_HF_IDENTIFIERS = [
    "vinai/phobert-base-v2",
    "uitnlp/visobert",
    "bkai-foundation-models/vietnamese-bi-encoder",
    "microsoft/Multilingual-MiniLM-L12-H384"
]

print("\n\n--- Evaluating NON-FINETUNED Base Models ---")
print("This will evaluate the performance of the base models before any fine-tuning.")

# Ensure the evaluation dataset is available (loaded in a previous cell)
if 'eval_texts_list' in globals() and 'eval_labels_list' in globals() and \
   eval_texts_list and eval_labels_list:
    
    base_model_evaluation_results = {}
    for model_hf_id in BASE_MODEL_HF_IDENTIFIERS:
        print(f"\n--- Evaluating base model: {model_hf_id} ---")
        
        # Load the base model directly from Hugging Face.
        # load_model_cell_eval is assumed to be defined in a previous cell and
        # should handle Hugging Face model identifiers.
        # NUM_LABELS_EVAL is also assumed to be defined.
        tokenizer_base_eval, model_base_eval = load_model_cell_eval(model_hf_id, NUM_LABELS_EVAL)

        if model_base_eval and tokenizer_base_eval:
            # Prepare dataloader using the same evaluation data split
            # MAX_SEQ_LENGTH_EVAL, BATCH_SIZE_EVAL, SEED_EVAL are assumed to be defined.
            val_dataloader_for_base_eval = prepare_data_for_cell_eval(
                eval_texts_list, eval_labels_list, tokenizer_base_eval, 
                MAX_SEQ_LENGTH_EVAL, BATCH_SIZE_EVAL, SEED_EVAL
            )
            
            if val_dataloader_for_base_eval:
                # evaluate_model_cell_eval is assumed to be defined.
                accuracy_base_val, report_base_val = evaluate_model_cell_eval(model_base_eval, val_dataloader_for_base_eval)
                base_model_evaluation_results[model_hf_id] = {"accuracy": accuracy_base_val, "report": report_base_val}
                
                print(f"\nResults for base model {model_hf_id}:")
                print(f"Validation Accuracy: {accuracy_base_val:.4f}")
                print("Validation Classification Report:")
                print(report_base_val)
            else:
                print(f"Failed to create dataloader for base model {model_hf_id}. Skipping evaluation for this model.")
        else:
            print(f"Failed to load model or tokenizer for base model {model_hf_id}. Skipping.")

    # --- Comparison for Base Models ---
    print("\n--- Overall Comparison for NON-FINETUNED Base Models (based on Validation Accuracy) ---")
    if base_model_evaluation_results:
        for model_id_res, res_data in base_model_evaluation_results.items():
            print(f"Base Model: {model_id_res}, Accuracy: {res_data['accuracy']:.4f}")
        
        if base_model_evaluation_results: # Ensure not empty before calling max
            best_base_model_name = max(base_model_evaluation_results, key=lambda k: base_model_evaluation_results[k]['accuracy'])
            print(f"\nBest performing NON-FINETUNED base model: {best_base_model_name} (Accuracy: {base_model_evaluation_results[best_base_model_name]['accuracy']:.4f})")
    else:
        print("No non-finetuned base models were successfully evaluated, or no results were recorded.")
else:
    print("Evaluation of non-finetuned base models cannot proceed: Evaluation dataset (eval_texts_list, eval_labels_list) is empty or failed to load from previous cells.")




--- Evaluating NON-FINETUNED Base Models ---
This will evaluate the performance of the base models before any fine-tuning.

--- Evaluating base model: vinai/phobert-base-v2 ---
Loading tokenizer from: vinai/phobert-base-v2
Loading model from: vinai/phobert-base-v2


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preprocessing texts for dataloaders...


Preprocessing: 100%|██████████| 9523/9523 [00:00<00:00, 109502.22it/s]

Tokenizing texts...





Using validation split for evaluation. Validation dataset size: 1905


Evaluating: 100%|██████████| 80/80 [00:03<00:00, 21.55it/s]



Results for base model vinai/phobert-base-v2:
Validation Accuracy: 0.2814
Validation Classification Report:
              precision    recall  f1-score   support

    Tiêu cực       0.24      0.32      0.27       569
  Trung tính       0.14      0.42      0.21       229
    Tích cực       0.59      0.23      0.34      1107

    accuracy                           0.28      1905
   macro avg       0.32      0.32      0.27      1905
weighted avg       0.43      0.28      0.30      1905


--- Evaluating base model: uitnlp/visobert ---
Loading tokenizer from: uitnlp/visobert
Loading model from: uitnlp/visobert


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at uitnlp/visobert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preprocessing texts for dataloaders...


Preprocessing: 100%|██████████| 9523/9523 [00:00<00:00, 110373.37it/s]

Tokenizing texts...





Using validation split for evaluation. Validation dataset size: 1905


Evaluating: 100%|██████████| 80/80 [00:03<00:00, 21.60it/s]



Results for base model uitnlp/visobert:
Validation Accuracy: 0.3528
Validation Classification Report:
              precision    recall  f1-score   support

    Tiêu cực       0.33      0.07      0.12       569
  Trung tính       0.10      0.37      0.15       229
    Tích cực       0.61      0.49      0.55      1107

    accuracy                           0.35      1905
   macro avg       0.34      0.31      0.27      1905
weighted avg       0.46      0.35      0.37      1905


--- Evaluating base model: bkai-foundation-models/vietnamese-bi-encoder ---
Loading tokenizer from: bkai-foundation-models/vietnamese-bi-encoder
Loading model from: bkai-foundation-models/vietnamese-bi-encoder


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at bkai-foundation-models/vietnamese-bi-encoder and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preprocessing texts for dataloaders...


Preprocessing: 100%|██████████| 9523/9523 [00:00<00:00, 116809.30it/s]

Tokenizing texts...





Using validation split for evaluation. Validation dataset size: 1905


Evaluating: 100%|██████████| 80/80 [00:03<00:00, 21.88it/s]



Results for base model bkai-foundation-models/vietnamese-bi-encoder:
Validation Accuracy: 0.4047
Validation Classification Report:
              precision    recall  f1-score   support

    Tiêu cực       0.23      0.26      0.25       569
  Trung tính       0.14      0.13      0.13       229
    Tích cực       0.56      0.53      0.55      1107

    accuracy                           0.40      1905
   macro avg       0.31      0.31      0.31      1905
weighted avg       0.41      0.40      0.41      1905


--- Evaluating base model: microsoft/Multilingual-MiniLM-L12-H384 ---
Loading tokenizer from: microsoft/Multilingual-MiniLM-L12-H384
Loading model from: microsoft/Multilingual-MiniLM-L12-H384


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/Multilingual-MiniLM-L12-H384 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preprocessing texts for dataloaders...


Preprocessing: 100%|██████████| 9523/9523 [00:00<00:00, 115730.00it/s]

Tokenizing texts...





Using validation split for evaluation. Validation dataset size: 1905


Evaluating: 100%|██████████| 80/80 [00:01<00:00, 59.50it/s]


Results for base model microsoft/Multilingual-MiniLM-L12-H384:
Validation Accuracy: 0.5811
Validation Classification Report:
              precision    recall  f1-score   support

    Tiêu cực       0.00      0.00      0.00       569
  Trung tính       0.00      0.00      0.00       229
    Tích cực       0.58      1.00      0.74      1107

    accuracy                           0.58      1905
   macro avg       0.19      0.33      0.25      1905
weighted avg       0.34      0.58      0.43      1905


--- Overall Comparison for NON-FINETUNED Base Models (based on Validation Accuracy) ---
Base Model: vinai/phobert-base-v2, Accuracy: 0.2814
Base Model: uitnlp/visobert, Accuracy: 0.3528
Base Model: bkai-foundation-models/vietnamese-bi-encoder, Accuracy: 0.4047
Base Model: microsoft/Multilingual-MiniLM-L12-H384, Accuracy: 0.5811

Best performing NON-FINETUNED base model: microsoft/Multilingual-MiniLM-L12-H384 (Accuracy: 0.5811)





In [8]:
# 1. Combined example texts and their expected labels
# The original texts had comments like "# Positive" appended, which were parsed out.
# This version stores clean texts directly.
example_data_vi = [
    {"text": "Bộ phim này thực sự tuyệt vời! Diễn xuất và cốt truyện đều xuất sắc.", "expected_label": "Positive"},
    {"text": "Tôi không thích cuốn sách này lắm, nó khá nhàm chán và dễ đoán.", "expected_label": "Negative"},
    {"text": "Sản phẩm ở mức trung bình, không có gì đặc biệt nhưng cũng không tệ.", "expected_label": "Neutral"},
    {"text": "Dịch vụ khách hàng rất tệ, họ không giải quyết được vấn đề của tôi và rất thô lỗ.", "expected_label": "Negative"}, # Corresponds to "Very Negative" if mapping to 3 classes
    {"text": "Đây là một trong những trải nghiệm ẩm thực tốt nhất mà tôi từng có. Mọi thứ đều hoàn hảo!", "expected_label": "Positive"}, # Corresponds to "Very Positive" if mapping to 3 classes
    {"text": "Thời tiết hôm nay cũng bình thường, không nắng không mưa.", "expected_label": "Neutral"},
    {"text": "Chuyến đi thật kinh khủng, khách sạn bẩn và nhân viên thì thiếu chuyên nghiệp. Tôi sẽ không bao giờ quay lại đó nữa. Thật là một sự lãng phí tiền bạc.", "expected_label": "Negative"}  # Corresponds to "Strongly negative" if mapping to 3 classes
]

# Assumptions from previous cell (as per original comments):
# - NUM_LABELS_EVAL, MAX_SEQ_LENGTH_EVAL, load_model_cell_eval are defined.
# - 'torch' is imported (e.g., import torch).
# - 'labels' list might be defined globally if NUM_LABELS_EVAL == 3, used for human-readable labels.
# - device_eval might be defined (though model's device is used here for robustness).

# 2. Define a consistent label mapping for predictions
# This map will be used to convert predicted class indices to human-readable strings.
effective_idx_to_label_map = {}
if NUM_LABELS_EVAL == 3:
    # Prefer 'labels' global if it's valid and matches NUM_LABELS_EVAL for 3-class mapping
    if 'labels' in globals() and isinstance(labels, list) and len(labels) == NUM_LABELS_EVAL:
        effective_idx_to_label_map = {i: label for i, label in enumerate(labels)}
        print(f"Using 'labels' global for {NUM_LABELS_EVAL}-class mapping: {effective_idx_to_label_map}")
    else:
        # Default 3-class mapping if 'labels' is not suitable or not found
        effective_idx_to_label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
        if 'labels' not in globals():
            print(f"Warning: 'labels' global not found for {NUM_LABELS_EVAL}-class. Using default map: {effective_idx_to_label_map}")
        elif not (isinstance(labels, list) and len(labels) == NUM_LABELS_EVAL):
             print(f"Warning: Global 'labels' (value: {labels}) is not a list of {NUM_LABELS_EVAL} items. Using default map: {effective_idx_to_label_map}")
elif NUM_LABELS_EVAL == 5: # Example for 5-class models
    effective_idx_to_label_map = {
        0: "Very Negative", 
        1: "Negative", 
        2: "Neutral", 
        3: "Positive", 
        4: "Very Positive"
    }
else:
    # Fallback for other numbers of labels
    effective_idx_to_label_map = {i: f"Label_{i}" for i in range(NUM_LABELS_EVAL)}
    print(f"Warning: Using generic label map for {NUM_LABELS_EVAL} classes: {effective_idx_to_label_map}. Predictions might not be human-readable.")

print("\n--- Predicting on Example Vietnamese Texts ---")

# Models to use for this example prediction section.
# This list is defined within the selection in the original code.
model_ids_to_evaluate = ["./fine_tuned_sa_manual_models/visobert"] 

if not model_ids_to_evaluate:
    print("No models specified in 'model_ids_to_evaluate' for example predictions. Skipping.")
else:
    for model_id_pred in model_ids_to_evaluate:
        print(f"\n--- Predictions for model: {model_id_pred} ---")
        
        # Load tokenizer and model. load_model_cell_eval is expected to:
        # - Place the model on the correct device (e.g., device_eval).
        # - Set the model to evaluation mode (model.eval()).
        tokenizer_pred, model_pred = load_model_cell_eval(model_id_pred, NUM_LABELS_EVAL)
        
        if model_pred and tokenizer_pred:
            # Determine the device the model is currently on.
            current_device = next(model_pred.parameters()).device

            for example_item in example_data_vi:
                text_content = example_item["text"]
                current_ground_truth_label = example_item["expected_label"]
                
                # Tokenize the text
                inputs = tokenizer_pred(
                    text_content, 
                    return_tensors="pt", 
                    padding=True, 
                    truncation=True, 
                    max_length=MAX_SEQ_LENGTH_EVAL # Assumed from previous cell
                )
                
                # Move inputs to the same device as the model
                inputs = {k: v.to(current_device) for k, v in inputs.items()}
                
                # Make prediction
                with torch.no_grad():
                    outputs = model_pred(**inputs)
                
                logits = outputs.logits
                predicted_class_idx = torch.argmax(logits, dim=1).item()
                
                # Map predicted index to label string using the consolidated map
                predicted_label = effective_idx_to_label_map.get(predicted_class_idx, f"Unknown_Index_{predicted_class_idx}")
                
                print(f"Text: \"{text_content}\"")
                print(f"  Expected: {current_ground_truth_label}")
                print(f"  Predicted: {predicted_label}\n")
        else:
            print(f"Failed to load model or tokenizer for {model_id_pred}. Skipping predictions for this model.")



--- Predicting on Example Vietnamese Texts ---

--- Predictions for model: ./fine_tuned_sa_manual_models/visobert ---
Loading tokenizer from: ./fine_tuned_sa_manual_models/visobert
Loading model from: ./fine_tuned_sa_manual_models/visobert
Text: "Bộ phim này thực sự tuyệt vời! Diễn xuất và cốt truyện đều xuất sắc."
  Expected: Positive
  Predicted: Positive

Text: "Tôi không thích cuốn sách này lắm, nó khá nhàm chán và dễ đoán."
  Expected: Negative
  Predicted: Negative

Text: "Sản phẩm ở mức trung bình, không có gì đặc biệt nhưng cũng không tệ."
  Expected: Neutral
  Predicted: Neutral

Text: "Dịch vụ khách hàng rất tệ, họ không giải quyết được vấn đề của tôi và rất thô lỗ."
  Expected: Negative
  Predicted: Negative

Text: "Đây là một trong những trải nghiệm ẩm thực tốt nhất mà tôi từng có. Mọi thứ đều hoàn hảo!"
  Expected: Positive
  Predicted: Positive

Text: "Thời tiết hôm nay cũng bình thường, không nắng không mưa."
  Expected: Neutral
  Predicted: Positive

Text: "Chuyến đi t

In [5]:
#For testing with single sample

try:
    with open('/data/elo/khanglg/FreeTxt-Flask/vietnamese-stopwords.txt', 'r', encoding='utf-8') as f:
        vi_stopwords = [line.strip() for line in f if line.strip()]
    print(f"Successfully loaded {len(vi_stopwords)} Vietnamese stopwords.")
except FileNotFoundError:
    print("Vietnamese stopwords file not found. Please check the path.")
    vi_stopwords = []

# Define punctuation
PUNCS = '''!→()-[]{};:'"\,<>?@#$%^&*_~'''

# Cell 3: Device Configuration
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def preprocess_text(text, language='vi'):
    """
    Preprocesses text for sentiment analysis:
    - Converts input to string to handle potential NaNs or other types
    - Removes URLs, mentions, hashtags
    - Removes punctuation
    - Converts to lowercase
    - Removes stopwords
    """
    text = str(text) # Convert text to string to prevent TypeError
    text = re.sub(r"http\\S+|@\\S+|#\\S+", "", text)
    text = re.sub(f"[{re.escape(''.join(PUNCS))}]", "", text.lower())
    text = " ".join(word for word in text.split() if word not in vi_stopwords)
    return text

def predict_sentiment(text, tokenizer, model, preprocess=True):
    """
    Predicts the sentiment of a single text.
    Returns the predicted class and confidence.
    """
    if preprocess:
        text = preprocess_text(text)
    
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to('cuda')
    attention_mask = encoding['attention_mask'].to('cuda')
    
    with torch.no_grad():
        if isinstance(model, torch.nn.Module) and not hasattr(model, 'config'):
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
        else:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
    
    probabilities = torch.nn.functional.softmax(logits, dim=1).cpu().numpy()[0]
    predicted_class = np.argmax(probabilities)
    confidence = probabilities[predicted_class]
    
    # Map class to sentiment
    sentiment_map = {0: 'Tiêu cực', 1: 'Trung tính', 2: 'Tích cực'}
    predicted_sentiment = sentiment_map.get(predicted_class, 'Unknown')
    
    return predicted_sentiment, confidence, probabilities



Successfully loaded 1942 Vietnamese stopwords.
Using device: cuda:1


In [15]:
test_tokenizer,test_model = load_model_cell_eval("./fine_tuned_sa_manual_models/visobert",3)

Loading tokenizer from: ./fine_tuned_sa_manual_models/visobert
Loading model from: ./fine_tuned_sa_manual_models/visobert


In [None]:
sample_text = "Đồ ăn như con cặc vậy á đéo tin được!"

bkai_sentiment, bkai_conf, bkai_probs = predict_sentiment(sample_text, test_tokenizer, test_model)
# pho_sentiment, pho_conf, pho_probs = predict_sentiment(sample_text, pho_tokenizer, pho_model)

print(f"Sample text: '{sample_text}'")
print(f"VISOBERT prediction: {bkai_sentiment} (confidence: {bkai_conf:.4f})")

Sample text: 'Đồ ăn như con cặc vậy á đéo tin được!'
VISOBERT prediction: Tiêu cực (confidence: 0.9943)


: 

In [11]:
tokenizer,model = load_model_cell_eval("shenkha/FreeTxT-VisoBERT",3)
model.to('cuda:1')

Loading tokenizer from: shenkha/FreeTxT-VisoBERT
Loading model from: shenkha/FreeTxT-VisoBERT


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(15004, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=7

In [14]:
sample_text = "Đồ ăn như con cặc vậy á đéo tin được!"

sentiment, conf, probs = predict_sentiment(sample_text, tokenizer, model)


print(f"Sample text: '{sample_text}'")
print(f"VISOBERT prediction: {sentiment} (confidence: {conf:.4f})")

Sample text: 'Đồ ăn như con cặc vậy á đéo tin được!'
VISOBERT prediction: Tiêu cực (confidence: 0.9670)
