In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Dataset: load the labelled comments from the Excel sheet.
file_path = "./nba-thaicomments-dataset/NBA_Comment_Data.xlsx"
df = pd.read_excel(file_path)

# Expose the comment column as 'text', the name the tokenization step reads.
df = df.rename(columns={'data': 'text'})

# Rows lacking a comment or a label cannot be used for supervised training;
# drop them before they turn into None text / NaN labels further down.
df = df.dropna(subset=['text', 'label'])

# Cells that contain only digits may be read from Excel as numbers; force
# every comment to str so the column has a single type for the tokenizer.
df['text'] = df['text'].astype(str)

# Map the string labels to integer class ids.
label_map = {'P': 0, 'Neu': 1, 'Neg': 2}

# Series.map silently yields NaN for values missing from the mapping, which
# would turn the label column into floats. Fail fast with a clear message.
unknown_labels = set(df['label'].unique()) - set(label_map)
if unknown_labels:
    raise ValueError(
        f"Unexpected label values in {file_path}: {sorted(unknown_labels, key=str)}; "
        f"expected one of {list(label_map)}"
    )
df['label'] = df['label'].map(label_map).astype('int64')
df = df.reset_index(drop=True)

# Convert the DataFrame to a Dataset (the pandas index is not a feature).
dataset = Dataset.from_pandas(df, preserve_index=False)

# Hold out 20% of the data as the test set.
dataset_split = dataset.train_test_split(test_size=0.2, seed=42)
test_dataset = dataset_split['test']

# Carve 10% of the remaining training data off as the validation set.
train_val_dataset = dataset_split['train'].train_test_split(test_size=0.1, seed=42)

# Combine the three splits into a DatasetDict.
dataset_dict = DatasetDict({
    'train': train_val_dataset['train'],
    'validation': train_val_dataset['test'],
    'test': test_dataset
})

# Persist the splits so they can be reloaded without re-reading the Excel file.
dataset_dict.save_to_disk('/kaggle/working/nba_comments_dataset')

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="poom-sci/WangchanBERTa-finetuned-sentiment",
)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook-level tokenizer.

    Args:
        examples: Mapping that provides the raw comments under the ``"text"`` key.

    Returns:
        Whatever ``tokenizer`` returns for those texts when called with
        ``truncation=True``. No padding is requested here.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Tokenize every split; batched=True hands preprocess_function batches of rows
# instead of one row at a time.
tokenized = dataset_dict.map(
    function=preprocess_function,
    batched=True,
)

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads each batch with the tokenizer at batch-assembly time
# (the tokenization step above does not pad).
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the sequence-classification checkpoint that will be fine-tuned.
# NOTE(review): the checkpoint already ships a sentiment head; confirm its class
# count and label order are compatible with label_map (P=0, Neu=1, Neg=2).
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="poom-sci/WangchanBERTa-finetuned-sentiment",
)

In [None]:
import dataclasses

# The argument that schedules evaluation was renamed from `evaluation_strategy`
# to `eval_strategy` across transformers releases; use whichever name the
# installed TrainingArguments dataclass declares.
_training_arg_names = {f.name for f in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Fine-tuning hyperparameters; checkpoints and logs are written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    # The default strategy never evaluates during training, which would leave
    # the validation split passed to the Trainer below unused. Evaluate once
    # per epoch so over-fitting across the 10 epochs is visible.
    **{_eval_strategy_key: "epoch"},
)

# Wire the model, the tokenized splits and the padding collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Run fine-tuning.
trainer.train()