In [23]:
pip install datasets



In [24]:
pip install transformers datasets evaluate seqeval



In [25]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import classification_report
import numpy as np

In [26]:
# Load the Broad Twitter Corpus (BTC) dataset
dataset = load_dataset(path="tner/btc")

# The tag vocabulary is read from the 'tags' feature of the train split;
# its length is the number of classes the classification head must predict.
tags_feature = dataset["train"].features["tags"]
label_list = tags_feature.feature.names
num_labels = len(label_list)

In [27]:
# Load the tokenizer that matches the cased BERT checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-cased",
)

 Token-Tag Alignment

In [28]:
def tokenize_and_align_labels(example):
    """Tokenize one pre-split example and attach a label to every token.

    Args:
        example: Mapping with "tokens" (the words of one sentence) and
            "tags" (one tag id per word).

    Returns:
        The tokenizer output with an added "labels" list, one entry per
        produced token. Tokens that map to no word get -100 (the value
        `compute_metrics` filters out); every other token receives the tag
        of the word it came from, so all pieces of a split word share a tag.
    """
    encoding = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)

    # word_ids() gives, per token, the index of its source word or None.
    encoding["labels"] = [
        -100 if word_idx is None else example["tags"][word_idx]
        for word_idx in encoding.word_ids()
    ]
    return encoding

In [29]:
# Keep a random 50% of the train and validation splits (fixed shuffle seed).
# NOTE: this overwrites `dataset` in place, so re-running the cell shrinks the
# splits again; reload the dataset first if the cell has to be re-executed.
SAMPLE_FRACTION = 0.5
for split in ("train", "validation"):
    shuffled = dataset[split].shuffle(seed=42)
    dataset[split] = shuffled.select(range(int(SAMPLE_FRACTION * len(shuffled))))


In [30]:
# Tokenize every split of the sampled dataset, one example per call
tokenized_dataset = dataset.map(
    function=tokenize_and_align_labels,
    batched=False,
)

Map:   0%|          | 0/3169 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [31]:
# Load pre-trained BERT model with token classification head.
# The id <-> tag-name mappings are stored in the model config so that saved
# checkpoints and downstream predictions report the dataset's tag names
# instead of generic LABEL_<n> placeholders.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=num_labels,
    id2label=dict(enumerate(label_list)),
    label2id={label: idx for idx, label in enumerate(label_list)},
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# Collator that batches examples for token classification, padding the
# inputs and their labels together
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


In [33]:
# Metric computation using seqeval for named entity recognition
def compute_metrics(p):
    """Return the macro-averaged F1 reported by seqeval.

    Args:
        p: Evaluation output exposing `predictions` (class scores, reduced
            with argmax over axis 2) and `label_ids` (gold label ids, where
            -100 marks positions that must be ignored).

    Returns:
        dict with a single key "f1": the "f1-score" of the report's
        "macro avg" entry, or 0.0 when that entry is absent.
    """
    predicted_ids = np.argmax(p.predictions, axis=2)
    gold_ids = p.label_ids

    true_predictions, true_labels = [], []
    for pred_row, label_row in zip(predicted_ids, gold_ids):
        sentence_preds, sentence_golds = [], []
        for pred, lab in zip(pred_row, label_row):
            if lab == -100:
                # Position carries no gold label; drop it from both sides.
                continue
            sentence_preds.append(label_list[pred])
            sentence_golds.append(label_list[lab])
        true_predictions.append(sentence_preds)
        true_labels.append(sentence_golds)

    report = classification_report(true_labels, true_predictions, output_dict=True, zero_division=0)
    macro_scores = report.get("macro avg", {})
    return {"f1": macro_scores.get("f1-score", 0.0)}

In [35]:
!pip install transformers --upgrade



In [None]:
# Metrics function to compute precision, recall, and F1 score
def compute_metrics(p):
    """Compute per-entity and macro-averaged precision, recall and F1.

    Scores come from seqeval's `classification_report`, which is already
    imported by this notebook. The entity types are taken from the report
    itself rather than from a hard-coded list, and every returned value is a
    plain float.

    Args:
        p: Evaluation output exposing `predictions` (class scores, reduced
            with argmax over axis 2) and `label_ids` (gold label ids, where
            -100 marks positions that must be ignored).

    Returns:
        dict with scalar "precision", "recall" and "f1", each taken from the
        report's "macro avg" entry (0.0 when that entry is absent).
    """
    predictions = np.argmax(p.predictions, axis=2)  # Get the predicted labels
    labels = p.label_ids  # Ground truth labels

    # Convert ids to tag names sentence by sentence, skipping ignored positions
    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(predictions, labels):
        kept = [(pred, lab) for pred, lab in zip(pred_row, label_row) if lab != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[lab] for _, lab in kept])

    report = classification_report(
        true_labels, true_predictions, output_dict=True, zero_division=0
    )

    # Print performance by entity type (the "... avg" rows are summaries)
    print("\nPerformance by Entity Type:")
    for entity, scores in report.items():
        if entity.endswith(" avg"):
            continue
        print(
            f"Entity: {entity} | Precision: {scores['precision']:.2f} | "
            f"Recall: {scores['recall']:.2f} | F1 Score: {scores['f1-score']:.2f}"
        )

    # Calculate and print the overall F1 score (macro avg)
    macro = report.get("macro avg", {})
    overall_f1 = float(macro.get("f1-score", 0.0))
    print(f"\nOverall F1 Score (macro avg): {overall_f1:.2f}")

    # Return the metrics dictionary (scalars only)
    return {
        "precision": float(macro.get("precision", 0.0)),
        "recall": float(macro.get("recall", 0.0)),
        "f1": overall_f1,
    }

In [37]:
# Define training arguments
training_args = TrainingArguments(
    # Where results are written
    output_dir="./btc-ner-model",
    logging_dir="./logs",
    # Optimisation schedule: four passes over the training split
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device (lower these if GPU memory is tight)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

In [38]:
# Wire model, data, collator and metrics into the HuggingFace Trainer.
# NOTE(review): the captured output below suggests this call emitted a
# warning; recent transformers releases deprecate `tokenizer=` in favour of
# `processing_class=` -- confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Start training the model: fine-tunes `model` on the train split given to
# the Trainer, using the settings in `training_args`. The call's return value
# is the last expression of the cell, so it is shown as the cell output.
trainer.train()

Step,Training Loss
500,0.203


In [42]:
# Evaluate the model on the validation set (the `eval_dataset` passed to the
# Trainer). The result is a dict of metrics; values returned by
# `compute_metrics` appear under an "eval_" prefix (e.g. "eval_f1").
eval_results = trainer.evaluate()
print("Validation Results:", eval_results)

Validation Results: {'eval_loss': 0.33263036608695984, 'eval_f1': 0.5244726094248141, 'eval_runtime': 101.234, 'eval_samples_per_second': 4.939, 'eval_steps_per_second': 0.316, 'epoch': 4.0}
