In [None]:
# %pip install seqeval

In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW
from datasets import load_dataset, load_metric
from transformers import TrainingArguments, Trainer
import numpy as np


In [2]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    print ("MPS device not found.")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
# device = "cpu"

In [4]:
dataset = load_dataset("conll2003")
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=False, is_split_into_words=True, padding="max_length", max_length=256)
    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [5]:
label_list = [
    "O",
	"B-MISC", "I-MISC",
	"B-PER", "I-PER",
	"B-ORG", "I-ORG",
	"B-LOC", "I-LOC"
]

In [6]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(dataset['train'].features['ner_tags'].feature.names)).to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1,
    learning_rate=1e-3,
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
metric = load_metric("seqeval")

# def compute_metrics(p):
#     predictions, labels = p
#     predictions = np.argmax(predictions, axis=2)
#     true_predictions = [
#         [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]
#     true_labels = [
#         [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]
#     results = metric.compute(predictions=true_predictions, references=true_labels)
#     return {
#         "precision": results["overall_precision"],
#         "recall": results["overall_recall"],
#         "f1": results["overall_f1"],
#         "accuracy": results["overall_accuracy"],
#     }

# Evaluate the model
# trainer.evaluate()


  metric = load_metric("seqeval")


In [12]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    # Use 'macro' or 'weighted' for multi-class tasks
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
    acc = accuracy_score(labels, predictions)
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'accuracy': acc
    }



In [8]:
# from transformers import DataCollatorForTokenClassification

# # Create a data collator that will dynamically pad the inputs and labels
# data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# # Select a very small subset for the zero-shot experiment
# small_train_dataset = tokenized_datasets["train"].select(range(5))  # Select 5 samples for training

# # Trainer for zero-shot
# trainer_zero_shot = Trainer(
#     model=model,
#     args=training_args,
#     data_collator=data_collator,
#     train_dataset=small_train_dataset,  # Use the small dataset
#     eval_dataset=tokenized_datasets["validation"]
# )

# # Set learning rate to zero
# for param_group in trainer_zero_shot.optimizer.param_groups:
#     param_group['lr'] = 0.0

# # Train and evaluate
# trainer_zero_shot.train()
# results_zero_shot = trainer_zero_shot.evaluate()


In [None]:
# import matplotlib.pyplot as plt
# from torch.utils.tensorboard import SummaryWriter

# # Initialize TensorBoard writer
# writer = SummaryWriter()

# # Add scalars for TensorBoard
# for epoch in range(num_epochs):
#     # Perform training and evaluation steps
#     # ...

#     # Log metrics
#     writer.add_scalar('Loss/train', train_loss, epoch)
#     writer.add_scalar('Accuracy/train', train_accuracy, epoch)
#     writer.add_scalar('Loss/eval', eval_loss, epoch)
#     writer.add_scalar('Accuracy/eval', eval_accuracy, epoch)

# writer.close()

# # Plotting with matplotlib
# plt.plot(train_losses, label='Training loss')
# plt.plot(eval_losses, label='Validation loss')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()
# plt.show()

In [13]:
# Few-shot
trainer_few_shot = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].select(range(100)),  # Select 100 samples for training
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    
)
trainer_few_shot.train()
trainer_few_shot.evaluate()

  0%|          | 0/12 [00:00<?, ?it/s]

{'loss': 0.5054, 'learning_rate': 1e-05, 'epoch': 0.25}


KeyboardInterrupt: 

In [10]:
trainer_few_shot.save_model("./models/few_shot")

In [None]:
# Fine-tuning
trainer_fine_tuning = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],  # Use the full training set
    eval_dataset=tokenized_datasets["validation"]
)
trainer_fine_tuning.train()
trainer_fine_tuning.evaluate()

## Inference

In [11]:
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.nn.functional import softmax
import torch

# Load pre-trained model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name, num_labels=len(dataset['train'].features['ner_tags'].feature.names)).to(device)

# Function for NER Inference
def predict_ner(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", is_split_into_words=True).to(device)
    print(inputs)

    # Predict
    with torch.no_grad():
        outputs = model(**inputs)

    predictions = torch.argmax(outputs.logits, dim=2)
    print(predictions.cpu().numpy())
    preds  = [ [dataset['train'].features['ner_tags'].feature.names[p] for p in prediction] for prediction in predictions.cpu().numpy() ]

    return preds

# Sample text
sample_text = "Hello, my name is John and I live in New York"
sample_text = [sample_text]

# Perform inference
ner_predictions = predict_ner(sample_text, model, tokenizer)

print(ner_predictions)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'input_ids': tensor([[ 101, 7592, 1010, 2026, 2171, 2003, 2198, 1998, 1045, 2444, 1999, 2047,
         2259,  102]], device='mps:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='mps:0')}
[[1 0 4 4 5 5 5 4 1 5 3 4 4 8]]
[['B-PER', 'O', 'I-ORG', 'I-ORG', 'B-LOC', 'B-LOC', 'B-LOC', 'I-ORG', 'B-PER', 'B-LOC', 'B-ORG', 'I-ORG', 'I-ORG', 'I-MISC']]


# Modify Architecture

In [None]:
from transformers import BertConfig

# Modify the number of layers, attention heads, and hidden units
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=len(dataset['train'].features['ner_tags'].feature.names),
                                    num_hidden_layers=6,  # Change number of layers
                                    num_attention_heads=8,  # Change number of attention heads
                                    hidden_size=512)  # Change hidden units size

model_modified = BertForTokenClassification(config)


# Diff Loss Functions and optimizers

In [None]:
from torch.optim import Adam, SGD

# Modify optimizers
optimizer = AdamW(model.parameters(), lr=5e-5)  # You can switch to Adam, SGD, etc.

# Modify training arguments, including loss function if needed
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=5e-5,  # You can experiment with different learning rates
    weight_decay=0.01,
    # Add more parameters as needed
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    optimizers=(optimizer, None)  # (Optimizer, Scheduler)
)


# Layer Freezing

In [None]:
# Freeze layers of the model
for param in model.bert.encoder.layer[:6].parameters():  # Freeze the first 6 layers
    param.requires_grad = False


In [None]:
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter

# Initialize TensorBoard writer
writer = SummaryWriter()

# Add scalars for TensorBoard
for epoch in range(num_epochs):
    # Perform training and evaluation steps
    # ...

    # Log metrics
    writer.add_scalar('Loss/train', train_loss, epoch)
    writer.add_scalar('Accuracy/train', train_accuracy, epoch)
    writer.add_scalar('Loss/eval', eval_loss, epoch)
    writer.add_scalar('Accuracy/eval', eval_accuracy, epoch)

writer.close()

# Plotting with matplotlib
plt.plot(train_losses, label='Training loss')
plt.plot(eval_losses, label='Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()


# Old Code

In [4]:

from transformers import BertTokenizerFast

def load_and_preprocess_data(dataset_name='conll2003'):
    # Load dataset
    dataset = load_dataset(dataset_name)
    # dataset["train"] = dataset["train"].select(range(10000))
    # dataset["validation"] = dataset["validation"].select(range(3000))
    # dataset["test"] = dataset["test"].select(range(3000))
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(examples['tokens'], truncation=True, padding='max_length', is_split_into_words=True)
        labels = []
        
        for i, label in enumerate(examples['ner_tags']):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                if word_idx is None:
                    label_ids.append(-100)  # Special token
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)  # Word continuation
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    # Apply function to each split
    dataset = dataset.map(tokenize_and_align_labels, batched=True)
    return dataset




In [5]:
from torch.nn.utils.rnn import pad_sequence
import torch

def collate_fn(batch):
    # Separate the input ids and labels
    input_ids = [torch.tensor(item['input_ids']) for item in batch]
    labels = [torch.tensor(item['labels']) for item in batch]
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
    # Pad the sequences
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # -100 is the default ignore index in PyTorch cross-entropy loss

    # Create attention masks
    attention_masks = (input_ids != tokenizer.pad_token_id).long()

    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'labels': labels
    }


In [6]:
dataset = load_and_preprocess_data()

In [7]:
# device = "cuda:0" if torch.cuda.is_available() else "cpu"

# inputs    = tokenizer(sentence, return_tensors="pt").to(device)
# model     = model.to(device)
# outputs   = model(**inputs)

In [8]:
from transformers import BertForTokenClassification

model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=dataset['train'].features['ner_tags'].feature.num_classes).to(device)



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
print(model.device)

mps:0


In [10]:
from torch.utils.data import DataLoader
from transformers import AdamW

In [11]:
# train_loader = DataLoader(dataset['train'][:14000])

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [13]:


def train(model, train_loader, learning_rate=0.001, epochs=3):
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    train_loader = DataLoader(dataset['train'], batch_size=16, shuffle=True, collate_fn=collate_fn)

    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        model.train()
        for batch in train_loader:
            # Move batch to the same device as model
            batch = {k: v.to(model.device) for k, v in batch.items()}
            
            optimizer.zero_grad()
            output = model(**batch)
            loss = output.loss
            loss.backward()
            optimizer.step()
            print(f'Loss: {loss.item()}')


train(model, dataset)




Epoch 1/3
Loss: 2.2275941371917725


KeyboardInterrupt: 