In [None]:
!pip install -U transformers accelerate

In [None]:
import torch
import transformers
from transformers import AutoModelForTokenClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
import torch.quantization

In [None]:
# Load the CoNLL-2003 NER dataset via the `datasets` library.
# Later cells read its "train" and "validation" splits and the
# "tokens" / "ner_tags" columns.
dataset = load_dataset("conll2003")

In [None]:
# Checkpoint name shared by the tokenizer here and the model loaded later.
model_checkpoint = "distilbert-base-cased"
# tokenize_and_align_labels calls .word_ids() on this tokenizer's output, so the
# tokenizer loaded here must support that method.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Tokenization function
def tokenize_and_align_labels(examples, label_all_tokens=True, max_length=128):
    """Tokenize a batch of pre-split sentences and align NER tags to the tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of integer tag ids per sentence, parallel
            to the words).
        label_all_tokens: If True (default), every sub-token of a word receives
            that word's tag. If False, only the first sub-token of each word is
            tagged and the remaining sub-tokens are labelled -100.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The tokenizer output with an added "labels" entry holding, for each
        sentence, one label id per entry returned by ``word_ids()``.
        Positions whose word id is None are labelled -100, the default
        ``ignore_index`` of ``torch.nn.CrossEntropyLoss``.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",  # Ensures uniform length
        max_length=max_length,
        is_split_into_words=True,
    )

    labels = []
    for i, word_labels in enumerate(examples["ner_tags"]):
        label_ids = []
        previous_word_idx = None
        for word_idx in tokenized_inputs.word_ids(batch_index=i):
            if word_idx is None:
                # Token has no source word (e.g. special or padding tokens).
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First sub-token of a word, or a continuation sub-token when
                # every sub-token should carry the word's tag.
                label_ids.append(word_labels[word_idx])
            else:
                # Continuation sub-token that should not contribute a label.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Tokenize the dataset
# batched=True hands tokenize_and_align_labels a batch of examples per call; the
# function returns the tokenizer outputs plus a "labels" entry. The result is
# indexed by split name ("train", "validation") when the Trainer is built.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
# Load model
# Tag names come from the dataset schema, so the size of the classification head
# and the id <-> name mappings stored in the model config always match the data.
label_names = dataset["train"].features["ner_tags"].feature.names
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_names),
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./ner_results",
    # `eval_strategy` is the current name of this option; recent transformers
    # releases reject the older `evaluation_strategy` spelling.
    eval_strategy="epoch",
    # Kept equal to the evaluation strategy so each checkpoint has a matching
    # evaluation.
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
)

In [None]:
# Train the model
# Uses the Trainer built from training_args: 3 epochs, with evaluation on the
# "validation" split and a checkpoint save configured per epoch.
trainer.train()