In [None]:
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForTokenClassification
from datasets import load_dataset, load_metric
from seqeval.metrics import f1_score

In [None]:
import torch
from transformers import BertForTokenClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset
from sklearn.metrics import f1_score
import numpy as np

# Load the CoNLL-2003 dataset
dataset = load_dataset("conll2003")

device = "mps"

# Initialize BERT model and tokenizer
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(dataset['train'].features['ner_tags'].feature.names)).to(device)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize and preprocess the dataset
def tokenize_and_preprocess_batch(batch):
    inputs = tokenizer(batch["tokens"], padding=True, truncation=True, return_tensors="pt", is_split_into_words=True).to(device)
    inputs["labels"] = batch["ner_tags"]
    return inputs

train_dataset = dataset["train"].map(tokenize_and_preprocess_batch, batched=True)
eval_dataset = dataset["validation"].map(tokenize_and_preprocess_batch, batched=True)

# Create data loaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=64)

# Set up optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 3)

# # Training loop
# num_epochs = 3
# model.to(device)
# model.train()

# for epoch in range(num_epochs):
#     total_loss = 0
#     predictions = []
#     true_labels = []

#     for batch in train_dataloader:
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["labels"].to(device)

#         optimizer.zero_grad()

#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
#         scheduler.step()

#         total_loss += loss.item()

#     print(f"Epoch {epoch + 1} - Average Loss: {total_loss / len(train_dataloader)}")

# Evaluation



In [8]:
import torch.nn.utils.rnn as rnn_utils

# ... (previous code)

# Evaluation loop
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in eval_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Sort sequences in the batch by length and pad them to have equal length
        input_ids, sorted_lengths = rnn_utils.pad_packed_sequence(
            rnn_utils.pack_padded_sequence(input_ids, batch_first=True, enforce_sorted=False)
        )
        attention_mask, _ = rnn_utils.pad_packed_sequence(
            rnn_utils.pack_padded_sequence(attention_mask, batch_first=True, enforce_sorted=False)
        )
        labels, _ = rnn_utils.pad_packed_sequence(
            rnn_utils.pack_padded_sequence(labels, batch_first=True, enforce_sorted=False)
        )

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        predictions.extend(torch.argmax(logits, dim=2).tolist())
        true_labels.extend(labels.tolist())

predictions = np.array(predictions).flatten()
true_labels = np.array(true_labels).flatten()

# Calculate F1 score for NER task
f1 = f1_score(true_labels, predictions, average="micro")
print(f"F1 Score: {f1}")


RuntimeError: each element in list of batch should be of equal size

In [3]:


# Load pre-trained model tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 

# Load dataset and tokenize 
dataset = load_dataset("conll2003")
train_encodings = tokenizer(dataset['train'], 
                            is_split_into_words=True,
                            return_offsets_mapping=True,
                            padding=True,
                            truncation=True)

# Convert to PyTorch tensors                        
train_ids = torch.tensor(train_encodings.input_ids)
train_mask = torch.tensor(train_encodings.attention_mask)  
train_tags = torch.tensor(encode_tags(train_labels))

# Create PyTorch DataLoader
train_dataset = TensorDataset(train_ids, train_mask, train_tags)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, 
                              sampler=train_sampler, 
                              batch_size=32)
                              
# Load pretrained BERT for token classification                               
model = BertForTokenClassification.from_pretrained(
    'bert-base-cased',
    num_labels=len(tag_values))  

# Freeze BERT layers
for param in model.bert.parameters():
    param.requires_grade = False
    
# Training loop    
optimizer = AdamW(model.parameters(), 
                  lr = 5e-5, 
                  eps = 1e-8)
                      
for epoch in range(3):
    for batch in train_dataloader:
        model.train()
        loss = model(batch)[0] 
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
    print(f1_score(true_tags, pred_tags))

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).