In [None]:
!pip install transformers datasets seqeval scikit-learn --quiet

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from sklearn.metrics import classification_report
import numpy as np
import torch


In [None]:
# Name of the pretrained checkpoint; reused further down when the
# token-classification model is loaded.
model_name = "xlm-roberta-base"
# Tokenizer loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)


: 

In [None]:
def read_conll(filepath):
    """Load a two-column CoNLL file into a ``Dataset``.

    Every non-blank line must contain exactly two whitespace-separated
    fields, ``token tag``. One or more blank lines separate sentences.

    Args:
        filepath: Path to a UTF-8 encoded CoNLL file.

    Returns:
        ``Dataset.from_list`` applied to one record per sentence, each record
        shaped ``{"tokens": [...], "ner_tags": [...]}``.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            fields. The message names the file and the 1-based line number.
    """
    examples = []
    tokens, labels = [], []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Blank line = sentence boundary; consecutive blanks are ignored.
                if tokens:
                    examples.append({"tokens": tokens, "ner_tags": labels})
                    tokens, labels = [], []
                continue

            fields = line.split()
            if len(fields) != 2:
                raise ValueError(
                    f"{filepath}:{line_no}: expected 'token tag' "
                    f"(2 fields), got {len(fields)}: {line!r}"
                )
            token, tag = fields
            tokens.append(token)
            labels.append(tag)

    # Flush the last sentence: files that do not end with a blank line would
    # otherwise lose their final example.
    if tokens:
        examples.append({"tokens": tokens, "ner_tags": labels})

    return Dataset.from_list(examples)


: 

In [None]:
# Parse the annotated corpus into records with "tokens" and "ner_tags".
dataset = read_conll("../labeled_data/ner_labels.conll")  # Adjust path if needed

# Tag vocabulary: every distinct tag in the corpus, in sorted order, so the
# integer ids are stable across runs on the same data.
label_list = sorted({tag for ex in dataset for tag in ex['ner_tags']})
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}


: 

In [None]:
def tokenize_and_align_labels(example, label_all_tokens=False):
    """Tokenize one pre-split example and align word-level tags to sub-word tokens.

    Reads the notebook-level ``tokenizer`` and ``label2id``.

    Args:
        example: Mapping with ``"tokens"`` (list of words) and ``"ner_tags"``
            (one tag string per word, each a key of ``label2id``).
        label_all_tokens: When ``False`` (default) only the first sub-word
            token of each word carries the word's label and the remaining
            pieces are set to ``-100``. When ``True`` every piece of a word
            repeats the word's label.

    Returns:
        The tokenizer output with an added ``"labels"`` list holding one
        integer per sub-word token.
    """
    tokenized = tokenizer(example['tokens'], truncation=True, is_split_into_words=True)
    word_tags = example["ner_tags"]

    labels = []
    previous_word_idx = None
    for word_idx in tokenized.word_ids():
        if word_idx is None:
            # Special tokens have no source word; -100 is the index that
            # PyTorch's cross-entropy loss ignores by default.
            labels.append(-100)
        elif word_idx != previous_word_idx or label_all_tokens:
            labels.append(label2id[word_tags[word_idx]])
        else:
            # Continuation piece of the same word: mask it so a single word
            # is not counted several times (and a B- tag is not repeated
            # in the middle of a word).
            labels.append(-100)
        previous_word_idx = word_idx

    tokenized["labels"] = labels
    return tokenized

# Tokenize every example and attach its aligned "labels" column.
# No `batched=True` is passed, so the function receives one example per call.
encoded_dataset = dataset.map(tokenize_and_align_labels)


: 

In [None]:
# Load the pretrained checkpoint with a token-classification head sized to
# the tag vocabulary. The id<->label maps are passed along so predictions can
# be reported as tag strings rather than bare integers.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)


: 

In [None]:
# Training configuration.
# No validation split exists yet, so evaluation is left at its default ("no").
# The keyword is omitted on purpose: `evaluation_strategy` was renamed to
# `eval_strategy` in newer transformers releases, and with an unpinned install
# the old spelling can fail with an unexpected-keyword TypeError.
args = TrainingArguments(
    output_dir="./models",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10
)


: 

In [None]:
# Build the trainer and run fine-tuning.
# The tokenizer is not handed to Trainer: its `tokenizer=` keyword is
# deprecated in recent transformers releases (replaced by `processing_class`).
# It is not needed here because padding is done by the explicit collator below
# and the tokenizer is saved explicitly after training.
# NOTE: intermediate checkpoints under output_dir will therefore not contain
# tokenizer files.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset,
    # Pads inputs and labels to the longest sequence in each batch.
    data_collator=DataCollatorForTokenClassification(tokenizer)
)

trainer.train()


: 

In [None]:
# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be loaded as a single self-contained model.
final_model_dir = "./models/final_ner_model"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
