In [None]:
import os
import csv
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)
from seqeval.metrics import f1_score, precision_score, recall_score

# Load tag mapping from tag_list.csv
def load_tag_mapping(file_path):
    """Load the tag <-> id mapping from a two-column CSV file.

    The first row is treated as a header and skipped. Every following
    non-empty row must hold exactly two fields: the tag name and its
    integer id. Completely empty rows (for example a trailing blank line)
    are ignored, and a file with no rows at all yields empty mappings.

    Args:
        file_path: Path to the CSV file; it is read as UTF-8.

    Returns:
        A ``(tag2id, id2tag)`` tuple. ``tag2id`` maps tag name to integer
        id and ``id2tag`` is its inverse.

    Raises:
        ValueError: If a non-empty row does not have exactly two fields or
            its second field is not an integer.
    """
    tag2id = {}
    # newline="" is what the csv module asks for so that it can handle line
    # endings (including newlines inside quoted fields) itself.
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header; the default avoids StopIteration
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; unpacking it would
                # raise, so skip it.
                continue
            tag, tag_id = row
            tag2id[tag] = int(tag_id)
    id2tag = {tag_id: tag for tag, tag_id in tag2id.items()}
    return tag2id, id2tag


# File paths - modify these to match your actual file locations.
# tag_list_path is the CSV read by load_tag_mapping(); the two folders are
# scanned for .txt files by prepare_dataset().
tag_list_path = "tag_list.csv"
train_folder = "train/train"
eval_folder = "eval/eval"

# Ensure the output directories exist before anything is written to them
# (exist_ok=True makes re-running this cell harmless).
os.makedirs("./results", exist_ok=True)
os.makedirs("./logs", exist_ok=True)
os.makedirs("./fine_tuned_ner_model", exist_ok=True)

# Load tag mappings: tag2id maps tag name -> integer id, id2tag is the inverse
tag2id, id2tag = load_tag_mapping(tag_list_path)

# Tokenizer of the same checkpoint that is loaded as the model further down.
# A fast tokenizer is requested because tokenize_and_align_labels() relies on
# word_ids(), which is only provided by fast tokenizers.
tokenizer = AutoTokenizer.from_pretrained(
    "airesearch/wangchanberta-base-att-spm-uncased", use_fast=True
)


# Function to load and prepare datasets
def prepare_dataset(folder_path):
    """Read every ``.txt`` file in *folder_path* into token and tag lists.

    Files are read as UTF-8, one token per line, and a blank line ends the
    current sentence. Only lines that split into exactly four tab-separated
    fields are used: the first field is the token and the third is its NER
    tag. All other lines are skipped. Tags that are not keys of the
    module-level ``tag2id`` are replaced by ``"O"``.

    Files are visited in sorted name order. ``os.listdir`` returns entries in
    arbitrary order, so sorting keeps the sentence order reproducible across
    runs and machines.

    Args:
        folder_path: Directory containing the ``.txt`` data files.

    Returns:
        A ``(all_tokens, all_tags)`` tuple of two parallel lists. Entry *i*
        of each is the list of tokens / tags of sentence *i*.
    """
    all_tokens = []
    all_tags = []

    def _flush(tokens, tags):
        """Store one finished sentence; do nothing when it has no tokens."""
        if tokens:
            all_tokens.append(tokens)
            all_tags.append([tag if tag in tag2id else "O" for tag in tags])

    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, "r", encoding="utf-8") as file:
            tokens, tags = [], []
            for line in file:
                # NOTE: the line is stripped before it is split, so a line
                # whose first or last field is empty or whitespace ends up
                # with fewer than four fields and is skipped below.
                line = line.strip()
                if not line:
                    # Blank line: sentence boundary.
                    _flush(tokens, tags)
                    tokens, tags = [], []
                    continue
                parts = line.split("\t")
                if len(parts) == 4:
                    word, _, ner_tag, _ = parts
                    tokens.append(word)
                    tags.append(ner_tag)
            # The last sentence of a file may not be followed by a blank line.
            _flush(tokens, tags)

    return all_tokens, all_tags


# Prepare datasets: each call returns two parallel lists (one entry per
# sentence) holding the sentence's tokens and its NER tags.
train_tokens, train_tags = prepare_dataset(train_folder)
eval_tokens, eval_tags = prepare_dataset(eval_folder)


# Tokenization function
def tokenize_and_align_labels(tokens, tags):
    """Tokenize pre-split sentences and build a label id for every sub-token.

    All sentences are encoded in a single tokenizer call, truncated to at
    most 512 positions and padded to a common length, so the result holds
    rectangular PyTorch tensors.

    Label rules, per position of each encoded sentence:
      * positions with no source word (word id ``None``) get ``-100``;
      * positions whose word index is beyond the sentence's tag list get
        ``-100``;
      * every other position gets the id of its source word's tag, with
        unknown tags falling back to the id of ``"O"``.

    Args:
        tokens: List of sentences, each a list of word strings.
        tags: List of tag-name lists, parallel to ``tokens``.

    Returns:
        The tokenizer output with an extra ``"labels"`` tensor added.
    """
    tokenized_inputs = tokenizer(
        tokens,
        is_split_into_words=True,
        truncation=True,
        max_length=512,
        padding=True,
        return_tensors="pt",
    )

    labels = []
    for row, _ in enumerate(tokens):
        row_labels = [
            -100
            if word_idx is None or word_idx >= len(tags[row])
            else tag2id.get(tags[row][word_idx], tag2id["O"])
            for word_idx in tokenized_inputs.word_ids(batch_index=row)
        ]
        labels.append(row_labels)

    # Every row has the same length thanks to padding, so this is rectangular.
    tokenized_inputs["labels"] = torch.tensor(labels)
    return tokenized_inputs


# Tokenize and align labels: each split is encoded in one call, producing
# the tokenizer output plus a "labels" tensor aligned to the sub-tokens.
train_encodings = tokenize_and_align_labels(train_tokens, train_tags)
eval_encodings = tokenize_and_align_labels(eval_tokens, eval_tags)

# Create PyTorch datasets
class NERDataset(torch.utils.data.Dataset):
    """Map-style dataset over a column-oriented encodings mapping.

    ``encodings`` maps a field name (it must include ``"input_ids"``) to an
    indexable holding one entry per example. Item ``idx`` is a dict with the
    ``idx``-th entry of every field.
    """

    def __init__(self, encodings):
        # Kept by reference; nothing is copied or converted here.
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from the "input_ids" field.
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = column[idx]
        return sample


# Wrap the encoded splits so the Trainer can index them example by example
train_dataset = NERDataset(train_encodings)
eval_dataset = NERDataset(eval_encodings)

# Data collator: assembles individual examples into batches
data_collator = DataCollatorForTokenClassification(tokenizer)

# Load the pre-trained checkpoint with a token-classification head sized to
# the tag set. The tag-name mappings are stored in the model config so the
# saved model reports real tag names instead of generic LABEL_<n> labels.
model = AutoModelForTokenClassification.from_pretrained(
    "airesearch/wangchanberta-base-att-spm-uncased",
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id,
)


# Metrics for evaluation
def compute_metrics(pred):
    """Compute seqeval precision, recall and F1 for one evaluation pass.

    The predicted class of each position is the argmax over the last axis of
    ``pred.predictions``. Positions whose gold label is ``-100`` are dropped
    from both the gold and the predicted sequences. The remaining ids are
    turned into tag names through the module-level ``id2tag``.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions``.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    gold_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)

    true_labels = []
    true_preds = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        gold_tags = []
        predicted_tags = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                # Ignored position: excluded from scoring.
                continue
            gold_tags.append(id2tag[gold_id])
            predicted_tags.append(id2tag[predicted_id])
        true_labels.append(gold_tags)
        true_preds.append(predicted_tags)

    precision = precision_score(true_labels, true_preds)
    recall = recall_score(true_labels, true_preds)
    f1 = f1_score(true_labels, true_preds)
    return {"precision": precision, "recall": recall, "f1": f1}


# Training arguments. Trainer output goes to ./results and logs to ./logs;
# both directories are created near the top of this script.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # run evaluation at the end of every epoch
    learning_rate=3e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,  # keep at most two checkpoints in output_dir
    logging_dir="./logs",
    logging_steps=50,
    dataloader_num_workers=4,
)

# Trainer setup: wires together the model, both dataset splits, the batch
# collator and the seqeval-based metric function defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train model
trainer.train()

# Evaluate on validation set (eval_dataset) once more after training
eval_results = trainer.evaluate()
print("Validation Results:", eval_results)

# Save model and tokenizer side by side so the directory can be loaded on
# its own later
model.save_pretrained("./fine_tuned_ner_model")
tokenizer.save_pretrained("./fine_tuned_ner_model")

print("Model training and evaluation completed successfully.")


Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
