# NLP Assignment 1

## Import Libraries

In [2]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW
from datasets import load_dataset, load_metric
from transformers import TrainingArguments, Trainer
import numpy as np


Setting torch backend (MPS for Mac, Cuda for Linux)

In [3]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    print ("MPS device not found.")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Loading Data

Load the dataset using `load_dataset` function from datasets library

In [4]:
dataset = load_dataset("conll2003")

In [6]:
label_list = dataset["train"].features[f"ner_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

initialize the tokenizer using `BertTokenizerFast.from_pretrained` function from transformers library

In [4]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

define the `tokenize_and_align_labels` function to encode the data using the tokenizer

In [5]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=False, is_split_into_words=True, padding="max_length", max_length=256)
    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

call the `tokenize_and_align_labels` function to encode the data

In [6]:
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_datasets, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(tokenized_datasets, batch_size=8)

define label_list

In [7]:
label2id = {'O':0, 'B-PER':1, 'I-PER':2, 'B-ORG':3, 'I-ORG':4, 'B-LOC':5, 'I-LOC':6, 'B-MISC':7, 'I-MISC':8}
id2label = {0:'O', 1:'B-PER', 2:'I-PER', 3:'B-ORG', 4:'I-ORG', 5:'B-LOC', 6:'I-LOC', 7:'B-MISC', 8:'I-MISC'}

## Model Architecture

define the base `BERT` model using `BertForTokenClassification.from_pretrained` function from transformers library

In [8]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased', 
                                                   num_labels=len(dataset['train'].features['ner_tags'].feature.names), 
                                                   id2label=id2label,
                                                   label2id=label2id).to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


define the `compute_metrics` function to compute the metrics

In [None]:
metric = load_metric("seqeval")
def compute_metrics(p):
	predictions, labels = p
	predictions = np.argmax(predictions, axis=2)
	true_predictions = [
		[label_list[p] for (p, l) in zip(prediction, label) if l != -100]
		for prediction, label in zip(predictions, labels)
	]
	true_labels = [
		[label_list[l] for (p, l) in zip(prediction, label) if l != -100]
		for prediction, label in zip(predictions, labels)
	]
	results = metric.compute(predictions=true_predictions, references=true_labels)
	return {
		"precision": results["overall_precision"],
		"recall": results["overall_recall"],
		"f1": results["overall_f1"],
		"accuracy": results["overall_accuracy"],
	}

In [None]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

### Zero Shot Learning

In [10]:
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.nn.functional import softmax
import torch

# Load pre-trained model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name, num_labels=len(dataset['train'].features['ner_tags'].feature.names)).to(device)

# Function for NER Inference
def predict_ner(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", is_split_into_words=True).to(device)
    print(inputs)

    # Predict
    with torch.no_grad():
        outputs = model(**inputs)

    predictions = torch.argmax(outputs.logits, dim=2)
    print(predictions.cpu().numpy())
    preds  = [ [dataset['train'].features['ner_tags'].feature.names[p] for p in prediction] for prediction in predictions.cpu().numpy() ]

    return preds

# Sample text
sample_text = "Hello, my name is John and I live in New York"
sample_text = [sample_text]

# Perform inference
ner_predictions = predict_ner(sample_text, model, tokenizer)

print(ner_predictions)


### Few Shot Learning

In [None]:
# Few-shot
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results_few_shot',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs_few_shot',
    logging_steps=1,
    learning_rate=1e-3,
)


trainer_few_shot = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].select(range(100)),  # Select 100 samples for training
    eval_dataset=tokenized_datasets["test"],
    learning_rate=1e-3,
    compute_metrics=compute_metrics,
    
)
trainer_few_shot.train()
trainer_few_shot.evaluate()

In [None]:
trainer_few_shot.save_model("./models/few_shot")

### Fine Tuning

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

In [None]:
# Fine-tuning

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results_fine_tuning',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs_fine_tuning',
    logging_steps=1,
    learning_rate=1e-3,
)

trainer_fine_tuning = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],  # Use the full training set
    eval_dataset=tokenized_datasets["test"]
)
trainer_fine_tuning.train()
trainer_fine_tuning.evaluate()

In [None]:
trainer_few_shot.save_model("./models/fine_tuning")

## Inference

In [None]:
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.nn.functional import softmax
import torch

# Load pre-trained model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name, num_labels=len(dataset['train'].features['ner_tags'].feature.names)).to(device)

# Function for NER Inference
def predict_ner(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", is_split_into_words=True).to(device)
    print(inputs)

    # Predict
    with torch.no_grad():
        outputs = model(**inputs)

    predictions = torch.argmax(outputs.logits, dim=2)
    print(predictions.cpu().numpy())
    preds  = [ [dataset['train'].features['ner_tags'].feature.names[p] for p in prediction] for prediction in predictions.cpu().numpy() ]

    return preds

# Sample text
sample_text = "Hello, my name is John and I live in New York"
sample_text = [sample_text]

# Perform inference
ner_predictions = predict_ner(sample_text, model, tokenizer)

print(ner_predictions)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'input_ids': tensor([[ 101, 7592, 1010, 2026, 2171, 2003, 2198, 1998, 1045, 2444, 1999, 2047,
         2259,  102]], device='mps:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='mps:0')}
[[1 0 4 4 5 5 5 4 1 5 3 4 4 8]]
[['B-PER', 'O', 'I-ORG', 'I-ORG', 'B-LOC', 'B-LOC', 'B-LOC', 'I-ORG', 'B-PER', 'B-LOC', 'B-ORG', 'I-ORG', 'I-ORG', 'I-MISC']]


# Modify Architecture

In [None]:
from transformers import BertConfig

# Modify the number of layers, attention heads, and hidden units
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=len(dataset['train'].features['ner_tags'].feature.names),
                                    num_hidden_layers=6,  # Change number of layers
                                    num_attention_heads=8,  # Change number of attention heads
                                    hidden_size=512)  # Change hidden units size

model_modified = BertForTokenClassification(config)


# Diff Loss Functions and optimizers

In [None]:
from torch.optim import Adam, SGD

# Modify optimizers
optimizer = AdamW(model.parameters(), lr=5e-5)  # You can switch to Adam, SGD, etc.

# Modify training arguments, including loss function if needed
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=5e-5,  # You can experiment with different learning rates
    weight_decay=0.01,
    # Add more parameters as needed
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    optimizers=(optimizer, None)  # (Optimizer, Scheduler)
)


# Layer Freezing

In [None]:
# Freeze layers of the model
for param in model.bert.encoder.layer[:6].parameters():  # Freeze the first 6 layers
    param.requires_grad = False


# Old Code

In [None]:

from transformers import BertTokenizerFast

def load_and_preprocess_data(dataset_name='conll2003'):
    # Load dataset
    dataset = load_dataset(dataset_name)
    # dataset["train"] = dataset["train"].select(range(10000))
    # dataset["validation"] = dataset["validation"].select(range(3000))
    # dataset["test"] = dataset["test"].select(range(3000))
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(examples['tokens'], truncation=True, padding='max_length', is_split_into_words=True)
        labels = []
        
        for i, label in enumerate(examples['ner_tags']):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                if word_idx is None:
                    label_ids.append(-100)  # Special token
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)  # Word continuation
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    # Apply function to each split
    dataset = dataset.map(tokenize_and_align_labels, batched=True)
    return dataset




In [None]:
from torch.nn.utils.rnn import pad_sequence
import torch

def collate_fn(batch):
    # Separate the input ids and labels
    input_ids = [torch.tensor(item['input_ids']) for item in batch]
    labels = [torch.tensor(item['labels']) for item in batch]
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
    # Pad the sequences
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # -100 is the default ignore index in PyTorch cross-entropy loss

    # Create attention masks
    attention_masks = (input_ids != tokenizer.pad_token_id).long()

    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'labels': labels
    }


In [None]:
dataset = load_and_preprocess_data()

In [None]:
# device = "cuda:0" if torch.cuda.is_available() else "cpu"

# inputs    = tokenizer(sentence, return_tensors="pt").to(device)
# model     = model.to(device)
# outputs   = model(**inputs)

In [None]:
from transformers import BertForTokenClassification

model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=dataset['train'].features['ner_tags'].feature.num_classes).to(device)



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
print(model.device)

mps:0


In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW

In [None]:
# train_loader = DataLoader(dataset['train'][:14000])

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [None]:


def train(model, train_loader, learning_rate=0.001, epochs=3):
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    train_loader = DataLoader(dataset['train'], batch_size=16, shuffle=True, collate_fn=collate_fn)

    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        model.train()
        for batch in train_loader:
            # Move batch to the same device as model
            batch = {k: v.to(model.device) for k, v in batch.items()}
            
            optimizer.zero_grad()
            output = model(**batch)
            loss = output.loss
            loss.backward()
            optimizer.step()
            print(f'Loss: {loss.item()}')


train(model, dataset)




Epoch 1/3
Loss: 2.2275941371917725


KeyboardInterrupt: 