In [1]:
#Import statements and paths

import os
import json
import logging
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Logging: show INFO and above; the module-level logger is shared by the cells below
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Project layout: every path hangs off the parent of the kernel's working directory
BASE_PATH = os.path.abspath(os.path.join(os.getcwd(), '..'))
TRAIN_FILE, VAL_FILE = (
    os.path.join(BASE_PATH, 'dataset', 'ner_training_data', split_file)
    for split_file in ('train.json', 'val.json')
)
OUTPUT_DIR = os.path.join(BASE_PATH, 'models', 'ner_model')

# Ensure the model output directory is present (no error when it already exists)
os.makedirs(OUTPUT_DIR, exist_ok=True)




In [18]:
# Second cell - Complete NERTrainer class with all methods
class NERTrainer:
    """Fine-tune a token-classification model that tags ANIMAL mentions.

    The label scheme is a plain two-class one: ``O`` (id 0) for every word
    outside an entity and ``ANIMAL`` (id 1) for every word that overlaps an
    annotated entity span. There is no B-/I- distinction.

    Typical use::

        trainer = NERTrainer()
        train_ds, val_ds = trainer.load_data(train_file, val_file)
        trainer.train(train_ds, val_ds, output_dir)
    """

    def __init__(
        self,
        model_name="bert-base-uncased",
        max_length=128,
        train_batch_size=16,
        eval_batch_size=16,
        learning_rate=5e-5,
        num_epochs=5
    ):
        """Load the tokenizer and model and store the training hyperparameters.

        Args:
            model_name: Checkpoint name or path handed to ``from_pretrained``.
            max_length: Maximum sequence length; longer inputs are truncated.
            train_batch_size: Per-device batch size used for training.
            eval_batch_size: Per-device batch size used for evaluation.
            learning_rate: Learning rate passed to ``TrainingArguments``.
            num_epochs: Number of training epochs.
        """
        self.model_name = model_name
        self.max_length = max_length
        self.train_batch_size = train_batch_size
        self.eval_batch_size = eval_batch_size
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs

        # Label mappings; id2label is derived so the two can never disagree
        self.label2id = {"O": 0, "ANIMAL": 1}
        self.id2label = {idx: name for name, idx in self.label2id.items()}

        # Tokenizer and model (classification head sized to the label set)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            num_labels=len(self.label2id),
            id2label=self.id2label,
            label2id=self.label2id
        )

        # Batches are padded here, at collation time (see tokenize_and_align_labels)
        self.data_collator = DataCollatorForTokenClassification(
            self.tokenizer,
            pad_to_multiple_of=8
        )

    def load_data(self, train_file: str, val_file: str):
        """Read the train/validation JSON files and convert them to Datasets.

        Each file must contain a JSON list of items with a ``"sentence"``
        string and an ``"entities"`` list of ``(start, end, label)`` triples.
        Items that cannot be processed are logged and skipped.

        Args:
            train_file: Path to the training JSON file.
            val_file: Path to the validation JSON file.

        Returns:
            ``(train_dataset, val_dataset)``, each with a ``"tokens"`` column
            (list of words) and a ``"labels"`` column (list of label ids).
        """
        def process_file(file_path):
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            processed_data = []
            skipped = 0
            for item in data:
                try:
                    processed = self.prepare_example(item["sentence"], item["entities"])
                    processed_data.append(processed)
                except Exception as e:
                    # Best-effort loading: one malformed record must not abort the run
                    skipped += 1
                    logger.warning(f"Error processing example: {str(e)}")
                    continue

            if skipped:
                # Make silent data loss visible in one place
                logger.warning(f"Skipped {skipped} of {len(data)} examples in {file_path}")

            return Dataset.from_dict({
                "tokens": [x["tokens"] for x in processed_data],
                "labels": [x["labels"] for x in processed_data]
            })

        train_dataset = process_file(train_file)
        val_dataset = process_file(val_file)

        logger.info(f"Loaded {len(train_dataset)} training examples")
        logger.info(f"Loaded {len(val_dataset)} validation examples")

        return train_dataset, val_dataset

    @staticmethod
    def _word_spans(text: str):
        """Return the ``(start, end)`` character offsets of every word in ``text``.

        Words are exactly the items of ``text.split()``. Offsets are read from
        the text itself, so repeated spaces, tabs, newlines and leading
        whitespace do not shift them.
        """
        spans = []
        cursor = 0
        for word in text.split():
            # Only whitespace lies between the cursor and the next word, so the
            # first match at or after the cursor is that word's real position.
            begin = text.find(word, cursor)
            cursor = begin + len(word)
            spans.append((begin, cursor))
        return spans

    @staticmethod
    def _overlapping_words(spans, start: int, end: int):
        """Return indices of the spans that overlap the half-open range [start, end)."""
        return [
            idx for idx, (word_start, word_end) in enumerate(spans)
            if word_end > start and word_start < end
        ]

    def prepare_example(self, text: str, entities: list):
        """Split a sentence into words and assign one label id per word.

        Args:
            text: The raw sentence.
            entities: Iterable of ``(start, end, label)`` triples, where
                ``start``/``end`` are character offsets into ``text``.

        Returns:
            ``{"tokens": [...words...], "labels": [...label ids...]}``.

        Raises:
            ValueError / TypeError: If an entity is not a 3-item sequence.
        """
        words = text.split()
        spans = self._word_spans(text)  # computed once, reused for every entity
        labels = ["O"] * len(words)

        for start, end, _label in entities:
            # The label set has a single entity class, so every annotated span
            # is tagged ANIMAL regardless of the label stored in the file.
            for pos in self._overlapping_words(spans, start, end):
                labels[pos] = "ANIMAL"

        return {
            "tokens": words,
            "labels": [self.label2id[label] for label in labels]
        }

    def get_entity_positions(self, text: str, start: int, end: int):
        """Return the indices of the words that overlap characters [start, end).

        Args:
            text: The raw sentence.
            start: Character offset where the entity begins.
            end: Character offset where the entity ends (exclusive).

        Returns:
            List of indices into ``text.split()``; empty when nothing overlaps.
        """
        return self._overlapping_words(self._word_spans(text), start, end)

    def tokenize_and_align_labels(self, examples):
        """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

        Only the first sub-token of each word keeps the word's label. Special
        tokens and continuation sub-tokens get -100, the value filtered out
        again in ``compute_metrics``.

        Args:
            examples: Batch with ``"tokens"`` (lists of words) and ``"labels"``
                (lists of label ids, one per word).

        Returns:
            The tokenizer output with an added ``"labels"`` entry.
        """
        # No padding here: self.data_collator pads each batch to its own longest
        # sequence, which avoids running every batch at the full max_length.
        tokenized_inputs = self.tokenizer(
            examples["tokens"],
            truncation=True,
            is_split_into_words=True,
            max_length=self.max_length
        )

        labels = []
        for i, label in enumerate(examples["labels"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []

            for word_idx in word_ids:
                if word_idx is None:
                    # Special token: not attached to any word
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:
                    # First sub-token of a word carries the word's label
                    label_ids.append(label[word_idx])
                else:
                    # Continuation sub-token of the same word
                    label_ids.append(-100)
                previous_word_idx = word_idx

            labels.append(label_ids)

        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    def compute_metrics(self, p):
        """Compute token-level accuracy, precision, recall and F1.

        Args:
            p: Evaluation output exposing ``predictions`` (logits, argmax is
                taken over axis 2) and ``label_ids`` (gold ids, -100 = ignore).

        Returns:
            Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``.
        """
        predicted_ids = np.argmax(p.predictions, axis=2)
        gold_ids = np.asarray(p.label_ids)

        # Drop every position marked -100 (special tokens, sub-tokens, padding)
        scored = gold_ids != -100
        y_true = gold_ids[scored]
        y_pred = predicted_ids[scored]

        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true,
            y_pred,
            average='binary',
            zero_division=0
        )

        accuracy = accuracy_score(y_true, y_pred)

        return {
            'accuracy': accuracy,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }

    def train(self, train_dataset, val_dataset, output_dir: str):
        """Tokenize both datasets, fine-tune the model and save the result.

        Evaluates and checkpoints once per epoch, reloads the checkpoint with
        the best ``f1`` at the end, then writes model and tokenizer to
        ``<output_dir>/final``.

        Args:
            train_dataset: Dataset with ``"tokens"`` and ``"labels"`` columns.
            val_dataset: Dataset with the same columns, used for evaluation.
            output_dir: Directory for checkpoints, logs and the final model.
        """
        # Process datasets
        tokenized_train = train_dataset.map(
            self.tokenize_and_align_labels,
            batched=True,
            remove_columns=train_dataset.column_names
        )
        tokenized_val = val_dataset.map(
            self.tokenize_and_align_labels,
            batched=True,
            remove_columns=val_dataset.column_names
        )

        # Newer transformers releases renamed `evaluation_strategy` to
        # `eval_strategy`; use whichever name the installed version declares.
        declared_fields = getattr(TrainingArguments, "__dataclass_fields__", {})
        strategy_kwarg = (
            "eval_strategy" if "eval_strategy" in declared_fields
            else "evaluation_strategy"
        )

        # Define training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            learning_rate=self.learning_rate,
            per_device_train_batch_size=self.train_batch_size,
            per_device_eval_batch_size=self.eval_batch_size,
            num_train_epochs=self.num_epochs,
            weight_decay=0.01,
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            logging_dir=os.path.join(output_dir, "logs"),
            logging_steps=10,
            **{strategy_kwarg: "epoch"}
        )

        # Initialize trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_val,
            data_collator=self.data_collator,
            compute_metrics=self.compute_metrics
        )

        # Train model
        logger.info("Starting training...")
        trainer.train()

        # Save final model
        model_save_path = os.path.join(output_dir, "final")
        self.model.save_pretrained(model_save_path)
        self.tokenizer.save_pretrained(model_save_path)
        logger.info(f"Model saved to {model_save_path}")

In [19]:
# Build the trainer with its default hyperparameters, load both splits,
# then fine-tune and write the result under OUTPUT_DIR
trainer = NERTrainer()
train_dataset, val_dataset = trainer.load_data(
    train_file=TRAIN_FILE,
    val_file=VAL_FILE,
)
trainer.train(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    output_dir=OUTPUT_DIR,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:__main__:Loaded 444 training examples
INFO:__main__:Loaded 111 validation examples


Map:   0%|          | 0/444 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

INFO:__main__:Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0064,0.000191,1.0,1.0,1.0,1.0
2,0.0002,6.5e-05,1.0,1.0,1.0,1.0
3,0.0001,5.2e-05,1.0,1.0,1.0,1.0
4,0.0001,4.6e-05,1.0,1.0,1.0,1.0
5,0.0001,4.4e-05,1.0,1.0,1.0,1.0


INFO:__main__:Model saved to C:\Users\mar4u\Documents\DS-Test-2025\task2\models\ner_model\final
