## CS310 Natural Language Processing
## Assignment 3 (part 2). Named Entity Recognition with Bi-LSTM

**Total points**: 30 + 20 bonus points

In this assignment, you will train a bidirectional LSTM model on the CoNLL2003 English named entity recognition task set and evaluate its performance.

For the bonus questions, submit them as separate notebook files.

### 0. Import Necessary Libraries

In [55]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score

### 2. Build the Model

In [56]:
# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def read_ner_data(path_to_file):
    words = []
    tags = []
    with open(path_to_file, 'r', encoding='utf-8') as file:
        for line in file:
            splitted = line.split()
            if len(splitted) == 0:
                continue
            word = splitted[0]
            if word == '-DOCSTART-':
                continue
            entity = splitted[-1]
            words.append(word)
            tags.append(entity)
        return words, tags

TRAIN_PATH = 'data/train.txt'
DEV_PATH = 'data/dev.txt'
TEST_PATH = 'data/test.txt'


train_words, train_tags = read_ner_data(TRAIN_PATH)
dev_words, dev_tags = read_ner_data(DEV_PATH)
test_words, test_tags = read_ner_data(TEST_PATH)


# Convert all words to lowercase
train_words = [word.lower() for word in train_words]
dev_words = [word.lower() for word in dev_words]
test_words = [word.lower() for word in test_words]

# Build vocabularies for words and labels
word_vocab = set(train_words + dev_words + test_words)
label_vocab = set(train_tags + dev_tags + test_tags)

print('Word vocabulary size:', len(word_vocab))
print('Tag vocabulary size:', len(label_vocab))

# Define mappings from words and labels to indices
word2idx = {word: idx for idx, word in enumerate(word_vocab)}
label2idx = {label: idx for idx, label in enumerate(label_vocab)}

# Define a data loader that returns batches
def collate_fn(batch):
    sentences, labels = zip(*batch)
    sentence_lengths = [len(sentence) for sentence in sentences]
    max_length = max(sentence_lengths)
    padded_sentences = []
    for sentence in sentences:
        padded_sentence = [word2idx[word] for word in sentence]
        padded_sentence += [0] * (max_length - len(sentence))
        padded_sentences.append(padded_sentence)
    return torch.LongTensor(padded_sentences), torch.LongTensor(labels), torch.LongTensor(sentence_lengths)

# Create data loaders
train_data = list(zip(train_words, train_tags))
dev_data = list(zip(dev_words, dev_tags))
test_data = list(zip(test_words, test_tags))

train_dataset = CustomDataset(train_data)
dev_dataset = CustomDataset(dev_data)
test_dataset = CustomDataset(test_data)

batch_size = 32  # Set your desired batch size
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [57]:


embedding_dim = 100  # Set the desired embedding dimension
embedding_file = 'glove.6B/glove.6B.100d.txt'  # Path to the pretrained embedding file

# Load the pretrained embeddings
pretrained_embeddings = {}
with open(embedding_file, 'r', encoding='utf-8') as file:
    for line in file:
        values = line.split()
        word = values[0]
        vector = torch.tensor([float(val) for val in values[1:]])
        pretrained_embeddings[word] = vector

# Initialize the embedding layer with pretrained embeddings
num_embeddings = len(word_vocab)
embedding_matrix = torch.zeros(num_embeddings, embedding_dim)
for word, idx in word2idx.items():
    if word in pretrained_embeddings:
        embedding_matrix[idx] = pretrained_embeddings[word]

# Now you can use the embedding_matrix to initialize your embedding layer in the model
embedding_layer = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)

In [58]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_layers, num_classes):
        super(BiLSTMClassifier, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, inputs):
        embedded = self.embedding(inputs)
        lstm_output, _ = self.lstm(embedded)
        # Use the last hidden states from both directions
        last_hidden = torch.cat((lstm_output[:, -1, :hidden_dim], lstm_output[:, 0, hidden_dim:]), dim=1)
        logits = self.fc(last_hidden)
        return logits

# Set hyperparameters
embedding_dim = 100
hidden_dim = 256
num_layers = 2
num_classes = len(label_vocab)

# Instantiate the model
model = BiLSTMClassifier(embedding_dim, hidden_dim, num_layers, num_classes)

### 3. Train and Evaluate

In [59]:


# Set training parameters
learning_rate = 0.001
weight_decay = 0.001  # L2 regularization parameter
num_epochs = 10

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Training loop
model.train()
for epoch in range(num_epochs):
    for inputs, labels, _ in train_loader:
        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

# Evaluation loop
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for inputs, labels, _ in test_loader:
        logits = model(inputs)
        _, predicted = torch.max(logits, dim=1)
        predictions.extend(predicted.tolist())
        true_labels.extend(labels.tolist())

# Calculate F1 score
f1 = f1_score(true_labels, predictions, average='weighted')

# Print F1 score on test set
print("F1 Score on Test Set: {:.4f}".format(f1))

# Save the trained model
torch.save(model.state_dict(), 'model.pth')

ValueError: too many dimensions 'str'