In [7]:
!pip3 install transformers datasets scikit-learn evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3


In [43]:
!pip3 install torch torchvision torchaudio

Collecting torchaudio
  Downloading torchaudio-2.5.1-cp312-cp312-win_amd64.whl.metadata (6.5 kB)
Downloading torchaudio-2.5.1-cp312-cp312-win_amd64.whl (2.4 MB)
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   -------- ------------------------------- 0.5/2.4 MB 2.1 MB/s eta 0:00:01
   ----------------- ---------------------- 1.0/2.4 MB 2.6 MB/s eta 0:00:01
   ------------------------- -------------- 1.6/2.4 MB 2.6 MB/s eta 0:00:01
   ---------------------------------- ----- 2.1/2.4 MB 2.4 MB/s eta 0:00:01
   ---------------------------------------- 2.4/2.4 MB 2.2 MB/s eta 0:00:00
Installing collected packages: torchaudio
Successfully installed torchaudio-2.5.1


In [47]:
import os
import gc
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
)
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

In [17]:
os.environ["WANDB_DISABLED"] = "true"

In [28]:
import csv

def load_tag_mapping(file_path):
    tag2id = {}
    with open(file_path, "r", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader)  # Skip header
        for row in reader:
            tag, tag_id = row
            tag2id[tag] = int(tag_id)
    id2tag = {v: k for k, v in tag2id.items()}
    return tag2id, id2tag

In [29]:
# File paths - modify these to match your actual file locations
tag_list_path = "tag_list.csv"
train_folder = "train/train"
eval_folder = "eval/eval"

# Ensure directories exist
os.makedirs("./results", exist_ok=True)
os.makedirs("./logs", exist_ok=True)
os.makedirs("./fine_tuned_ner_model", exist_ok=True)

# Load tag mappings
tag2id, id2tag = load_tag_mapping(tag_list_path)

# Use tokenizer from the Thai NER model
tokenizer = AutoTokenizer.from_pretrained(
    "airesearch/wangchanberta-base-att-spm-uncased", use_fast=True
)

In [48]:
# Function to load and prepare datasets
def prepare_dataset(folder_path):
    all_tokens = []
    all_tags = []

    for file_name in os.listdir(folder_path):
        if file_name.endswith(".txt"):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, "r", encoding="utf-8") as file:
                tokens, tags = [], []
                for line in file:
                    line = line.strip()
                    if not line:
                        if tokens:
                            all_tokens.append(tokens)
                            all_tags.append(
                                [tag if tag in tag2id else "O" for tag in tags]
                            )
                            tokens, tags = [], []
                    else:
                        parts = line.split("\t")
                        if len(parts) == 4:
                            word, _, ner_tag, _ = parts
                            tokens.append(word)
                            tags.append(ner_tag)
                if tokens:
                    all_tokens.append(tokens)
                    all_tags.append(
                        [tag if tag in tag2id else "O" for tag in tags]
                    )

    return all_tokens, all_tags


# Specify folder paths
train_folder = "train/train"
eval_folder = "eval/eval"

train_tokens, train_tags = prepare_dataset(train_folder)
eval_tokens, eval_tags = prepare_dataset(eval_folder)

# Convert the prepared datasets into Hugging Face Dataset format
def create_dataset(tokens, tags):
    data = {"tokens": tokens, "labels": [[tag2id[tag] for tag in tag_list] for tag_list in tags]}
    return Dataset.from_dict(data)

train_dataset = create_dataset(train_tokens, train_tags)
eval_dataset = create_dataset(eval_tokens, eval_tags)

In [49]:
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

# Define the metric computation
def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)
    
    # Convert predictions and labels to tag sequences
    true_labels = [[id2tag[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2tag[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]

    # Calculate token-level accuracy
    correct = sum(
        pred == label
        for preds, labels in zip(true_predictions, true_labels)
        for pred, label in zip(preds, labels)
    )
    total = sum(len(labels) for labels in true_labels)
    accuracy = correct / total

    # Calculate precision, recall, and F1 using seqeval
    f1 = f1_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)
    recall = recall_score(true_labels, true_predictions)

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

In [52]:
# Define models to train
models = [
    "bert-base-multilingual-cased",
    "xlm-roberta-base",
]

trained_models = []

# Detect the available device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Training loop
for model_name in models:
    print(f"Training model: {model_name}")

    # Clear GPU cache (optional)
    torch.cuda.empty_cache()

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(
        model_name, num_labels=len(tag2id)
    ).to(device)  # Move model to the detected device (CPU or GPU)

    # Tokenize and align labels
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples["tokens"],
            truncation=True,
            padding="max_length",
            is_split_into_words=True,
        )
        labels = []
        for i, label in enumerate(examples["labels"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            aligned_labels = [
                -100 if word_id is None else label[word_id] for word_id in word_ids
            ]
            labels.append(aligned_labels)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    tokenized_train = train_dataset.map(
        tokenize_and_align_labels, batched=True, num_proc=8,  # Parallelize preprocessing
    )
    tokenized_eval = eval_dataset.map(
        tokenize_and_align_labels, batched=True, num_proc=8,
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"./results_{model_name}",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=32,  # Adjust batch size based on CPU/GPU
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        weight_decay=0.01,
        save_total_limit=2,
        logging_dir=f"./logs_{model_name}",
        logging_steps=10,
        bf16=False,  # Disabled BF16 as it's for GPUs
        gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps
    )
    # Trainer setup
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,  # Ensure compute_metrics is defined
    )

    # Train model
    trainer.train()
    trained_models.append((model_name, model, trainer))

    # Clear GPU cache (optional)
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()  # No effect but left for consistency

Using device: cpu
Training model: bert-base-multilingual-cased


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map (num_proc=8):   0%|          | 0/63310 [00:00<?, ? examples/s]

NameError: name 'tokenizer' is not defined

In [None]:
# Evaluate trained models
for model_name, model, trainer in trained_models:
    print(f"Evaluating model: {model_name}")
    results = trainer.evaluate()
    print(f"Model: {model_name}")
    print(f"Accuracy: {results['eval_accuracy']}")
    print(f"F1 Score: {results['eval_f1']}")

In [None]:
# Step 9: Save the Best Model
best_model_name, best_model, _ = max(trained_models, key=lambda x: x[2].evaluate()["eval_loss"])
best_model.save_pretrained("./best_model")
tokenizer.save_pretrained("./best_model")

# Step 10: Generate Submission File
def predict_tag(word):
    prediction = ner_pipeline(word)
    return prediction[0]["entity"] if prediction else "O"

ner_pipeline = pipeline("ner", model="./best_model", tokenizer="./best_model", aggregation_strategy="simple")

In [None]:
# Creating Submission File
with open(submission_output_path, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["id", "ne"])
    for file_name in sorted(os.listdir(test_folder)):
        if file_name.endswith(".txt"):
            file_path = os.path.join(test_folder, file_name)
            with open(file_path, "r", encoding="utf-8") as f:
                for i, line in enumerate(f):
                    line = line.strip()
                    if not line:
                        continue
                    word = line.split("\t")[0]  # Get the first element (word)
                    predicted_tag = predict_tag(word)  # Use the best model for prediction
                    # Format the 'id' as file_name_line_number
                    writer.writerow([f"{file_name.split('.')[0]}_{i}", predicted_tag])