In [1]:
from datasets import load_dataset
import torch

In [2]:
# Choose the compute device: Apple-silicon MPS when present, otherwise the
# first CUDA GPU, otherwise the CPU.
if not torch.backends.mps.is_available():
    print("MPS device not found.")
    fallback_name = "cuda:0" if torch.cuda.is_available() else "cpu"
    device = torch.device(fallback_name)
else:
    device = torch.device("mps")

In [3]:
# device = "cpu"

In [4]:

from transformers import BertTokenizerFast

def _align_labels_with_tokens(word_ids, word_labels, ignore_index=-100):
    """Expand word-level labels to one label per sub-token.

    Args:
        word_ids: per-token word indices as returned by the fast tokenizer's
            ``word_ids()``; ``None`` marks tokens that belong to no word.
        word_labels: one label id per original word.
        ignore_index: label written for tokens that must not contribute to
            the loss.

    Returns:
        A list as long as ``word_ids``. Only the first sub-token of each word
        carries the word's label; tokens with no word and continuation
        sub-tokens get ``ignore_index``.
    """
    aligned = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None or word_idx == previous_word_idx:
            aligned.append(ignore_index)
        else:
            aligned.append(word_labels[word_idx])
        previous_word_idx = word_idx
    return aligned


def load_and_preprocess_data(dataset_name='conll2003', tokenizer_name='bert-base-uncased'):
    """Load a token-classification dataset and tokenize it for BERT.

    Every split is tokenized from its pre-split ``tokens`` column and gains a
    ``labels`` column aligned to the sub-tokens (see
    ``_align_labels_with_tokens``).

    Sequences are truncated but NOT padded here: padding to the model maximum
    would make every example full-length and defeat the per-batch padding
    that ``collate_fn`` performs.

    Args:
        dataset_name: name passed to ``load_dataset``.
        tokenizer_name: checkpoint name passed to
            ``BertTokenizerFast.from_pretrained``.

    Returns:
        The mapped dataset with the tokenizer outputs and ``labels`` added to
        each split.
    """
    dataset = load_dataset(dataset_name)
    # Tip: for quick experiments, shrink a split before mapping, e.g.
    # dataset["train"] = dataset["train"].select(range(10000)).
    tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name)

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples['tokens'],
            truncation=True,
            is_split_into_words=True,
        )
        tokenized_inputs["labels"] = [
            _align_labels_with_tokens(tokenized_inputs.word_ids(batch_index=i), word_labels)
            for i, word_labels in enumerate(examples['ner_tags'])
        ]
        return tokenized_inputs

    # Apply the tokenization to every split in batches.
    return dataset.map(tokenize_and_align_labels, batched=True)




In [5]:
from torch.nn.utils.rnn import pad_sequence
import torch

# Pad-token ids keyed by tokenizer name, so the tokenizer is loaded at most
# once per process instead of once per batch.
_PAD_TOKEN_ID_CACHE = {}


def _default_pad_token_id(tokenizer_name='bert-base-uncased'):
    """Return the pad-token id of ``tokenizer_name``, loading the tokenizer only on first use."""
    if tokenizer_name not in _PAD_TOKEN_ID_CACHE:
        _PAD_TOKEN_ID_CACHE[tokenizer_name] = (
            BertTokenizerFast.from_pretrained(tokenizer_name).pad_token_id
        )
    return _PAD_TOKEN_ID_CACHE[tokenizer_name]


def collate_fn(batch, pad_token_id=None):
    """Collate examples into a padded batch of tensors.

    Args:
        batch: list of examples, each providing ``'input_ids'`` and
            ``'labels'`` sequences of equal length.
        pad_token_id: id used to pad ``input_ids``. When ``None`` (the
            default, and what ``DataLoader`` uses), the pad id of the
            ``bert-base-uncased`` tokenizer is looked up once and cached.

    Returns:
        Dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``
        tensors, each padded to the longest sequence in the batch.
    """
    if pad_token_id is None:
        pad_token_id = _default_pad_token_id()

    # Pad the sequences to the longest one in this batch.
    input_ids = pad_sequence(
        [torch.as_tensor(item['input_ids']) for item in batch],
        batch_first=True,
        padding_value=pad_token_id,
    )
    # -100 is the default ignore index of PyTorch's cross-entropy loss.
    labels = pad_sequence(
        [torch.as_tensor(item['labels']) for item in batch],
        batch_first=True,
        padding_value=-100,
    )

    # Attention mask: 1 wherever the token is not padding.
    attention_masks = (input_ids != pad_token_id).long()

    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'labels': labels
    }


In [6]:
# Download the corpus and tokenize all of its splits.
dataset = load_and_preprocess_data(dataset_name='conll2003')

In [7]:
# device = "cuda:0" if torch.cuda.is_available() else "cpu"

# inputs    = tokenizer(sentence, return_tensors="pt").to(device)
# model     = model.to(device)
# outputs   = model(**inputs)

In [8]:
from transformers import BertForTokenClassification

# Token-classification head sized to the number of NER tag classes declared
# by the training split, then moved to the selected device.
model = BertForTokenClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=dataset['train'].features['ner_tags'].feature.num_classes,
)
model = model.to(device)



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Sanity check: show the device the model reports after the `.to(device)` move.
print(model.device)

mps:0


In [10]:
from torch.utils.data import DataLoader
from transformers import AdamW

In [11]:
# train_loader = DataLoader(dataset['train'][:14000])

In [12]:
# Display the tokenized dataset: its splits, their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [13]:


def train(model, train_loader, learning_rate=0.001, epochs=3, batch_size=16):
    """Fine-tune ``model`` with AdamW, printing the loss of every step.

    Args:
        model: module exposing a ``.device`` attribute that can be called as
            ``model(**batch)`` and returns an object with a ``.loss`` tensor.
        train_loader: a ``DataLoader`` to iterate over as-is. For backward
            compatibility a dataset may be passed instead: a dict of splits
            (its ``'train'`` entry is used) or a single dataset. It is then
            wrapped in a shuffled ``DataLoader`` that uses ``collate_fn``.
        learning_rate: AdamW step size. NOTE: the default is kept for
            backward compatibility; BERT fine-tuning normally uses values
            around 2e-5 to 5e-5, so callers should pass one explicitly.
        epochs: number of passes over the training data.
        batch_size: batch size used only when the loader is built here.

    Returns:
        None. The model is updated in place.
    """
    if not isinstance(train_loader, DataLoader):
        # Callers historically passed the whole dataset dict in this slot.
        train_split = train_loader['train'] if isinstance(train_loader, dict) else train_loader
        train_loader = DataLoader(
            train_split,
            batch_size=batch_size,
            shuffle=True,
            collate_fn=collate_fn,
        )

    # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
    # weight_decay are set to that implementation's defaults.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0
    )

    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        model.train()
        for batch in train_loader:
            # Move batch to the same device as model
            batch = {k: v.to(model.device) for k, v in batch.items()}

            optimizer.zero_grad()
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()
            print(f'Loss: {loss.item()}')


# Pass an explicit fine-tuning learning rate: the function default (1e-3) is
# far above the 2e-5 to 5e-5 range recommended for fine-tuning BERT.
train(model, dataset, learning_rate=2e-5)




Epoch 1/3
Loss: 2.2275941371917725


KeyboardInterrupt: 