In [1]:
# Assumptions as per post 519:
# 1) conlleval.py present in the directory
# 2) glove.6B.100d.txt present in the directory

In [2]:
!pip3 install torch torchvision torchaudio
!pip install numpy
!pip install -q datasets
!pip install scikit-learn



In [3]:
!pip freeze > requirements.txt

In [4]:
import itertools
from collections import Counter
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam, AdamW
import numpy as np
from conlleval import evaluate
import datasets

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# Load the dataset
dataset = datasets.load_dataset("conll2003")

In [7]:
# Calculating the word frequency
word_frequency = Counter(itertools.chain(*dataset['train']['tokens']))
# Creating a dictionary with words having frequency greater than 2
word_frequency = {
    word: frequency
    for word, frequency in word_frequency.items()
    if frequency >= 3
}

# Adding the index and UNK and PAD to handle padding and unknown tokens:
word2idx = {
    word: index
    for index, word in enumerate(word_frequency.keys(), start=2)
}
word2idx['[PAD]'] = 0
word2idx['[UNK]'] = 1

In [8]:
# Iterating the dataset to replace unknown tokens with [UNK]
dataset = (
    dataset
    .map(lambda x: {
            'input_ids': [
                word2idx.get(word, word2idx['[UNK]'])
                for word in x['tokens']
            ]
        }
    )
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids'],
        num_rows: 3453
    })
})

In [10]:
# Removing columns pos_tags & chunk_tags; Renaming column ner_tags to labels
for split in dataset.keys():
    dataset[split] = dataset[split].remove_columns(['pos_tags', 'chunk_tags'])
    dataset[split] = dataset[split].rename_column('ner_tags', 'labels')

In [11]:
vocab_size = len(word2idx)
print(vocab_size)

8128


In [12]:
# NER tag mapping
ner_tags = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8, '[PAD]': 9}
idx2tag = {idx: tag for tag, idx in ner_tags.items()}

**Task 1**

In [13]:
# Model Architecture:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, lstm_hidden_dim, output_dim):
        dropout_val = 0.33
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm = nn.LSTM(embedding_dim, lstm_hidden_dim, num_layers = 1, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout_val)
        self.linear = nn.Linear(lstm_hidden_dim*2, output_dim)
        self.elu = nn.ELU()
        self.classifier = nn.Linear(output_dim, len(ner_tags)-1)

    def forward(self, x, labels=None):
        embed = self.embedding(x)
        lstm_out, _ = self.bilstm(embed)
        drop = self.dropout(lstm_out)
        linear = self.linear(drop)
        elu_out = self.elu(linear)
        logits = self.classifier(elu_out)
        loss = None
        if labels is not None:
            logits_flatten = logits.view(-1, logits.shape[-1])
            labels_flatten = labels.view(-1)
            loss = nn.functional.cross_entropy(logits_flatten, labels_flatten,ignore_index=9)

        return logits, loss

In [14]:
train_data = dataset['train']
validation_data = dataset['validation']
test_data = dataset['test']

In [15]:
def collate_fun(batch):

    input_ids = [torch.tensor(item['input_ids'], device=device) for item in batch]
    labels = [torch.tensor(item['labels'], device=device) for item in batch]
    lengths = [len(label) for label in labels]

    input_ids_padded = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=0)
    labels_padded = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=9)
    return {'input_ids': input_ids_padded, 'labels': labels_padded, 'lengths': lengths}

batch_size = 32
train_loader = DataLoader(train_data, batch_size= batch_size, collate_fn = collate_fun)
test_loader = DataLoader(test_data, batch_size= batch_size, collate_fn = collate_fun)
validation_loader = DataLoader(validation_data, batch_size= batch_size, collate_fn = collate_fun)

In [16]:
model = BiLSTM(vocab_size=vocab_size, embedding_dim=100, lstm_hidden_dim=256, output_dim=128).to(device)
optimizer = AdamW(model.parameters(), lr=0.001)
best_val_f1 = 0.0

In [17]:
num_epochs = 20  # Number of epochs
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_f1 = 0
    for data in train_loader:
        input_ids, labels, lengths = data['input_ids'], data['labels'], data['lengths']
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        logits, loss = model(input_ids, labels)

        if loss is not None:
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        predictions = torch.argmax(logits, dim=-1).view(-1)
        labels_flat = labels.view(-1)

    model.eval()
    valid_loss, valid_f1 = 0, 0
    all_val_predictions, all_val_labels = [], []

    with torch.no_grad():
        for data in validation_loader:

            input_ids, labels, lengths = data['input_ids'], data['labels'], data['lengths']
            # print(input_ids, '\n', '----------------------------', labels)
            logits, loss = model(input_ids, labels)
            valid_loss += loss.item()

            val_predictions = torch.argmax(logits, dim=-1).tolist()

            for i, length in enumerate(lengths):
              seq_preds = val_predictions[i][:length]
              seq_labels = labels[i, :length].tolist()
              mapped_seq_preds = [idx2tag[p] for p in seq_preds]
              mapped_seq_labels = [idx2tag[l] for l in seq_labels]

              all_val_predictions.extend(mapped_seq_preds)
              all_val_labels.extend(mapped_seq_labels)

        flat_preds = list(itertools.chain(*all_val_predictions))
        flat_labels = list(itertools.chain(*all_val_labels))
        precision, recall, f1 = evaluate(all_val_labels, all_val_predictions)


        print(f"Epoch: {epoch}, Train Loss: {total_loss / len(train_loader)}")
        print(f"Epoch: {epoch}, Validation Loss: {valid_loss / len(validation_loader)}")

        # Saving best model based on best F1 score
        if f1 > best_val_f1:
            print(f'Validation F1 increased ({best_val_f1:.4f} --> {f1:.4f}). Saving model...')
            torch.save(model.state_dict(), 'model.pt')
            best_val_f1 = f1

processed 51362 tokens with 5942 phrases; found: 3177 phrases; correct: 1923.
accuracy:  34.77%; (non-O)
accuracy:  88.70%; precision:  60.53%; recall:  32.36%; FB1:  42.18
              LOC: precision:  72.06%; recall:  49.70%; FB1:  58.83  1267
             MISC: precision:  53.37%; recall:  10.30%; FB1:  17.27  178
              ORG: precision:  41.58%; recall:  30.20%; FB1:  34.99  974
              PER: precision:  67.28%; recall:  27.69%; FB1:  39.23  758
Epoch: 0, Train Loss: 0.6636327108991988
Epoch: 0, Validation Loss: 0.3902442083493167
Validation F1 increased (0.0000 --> 42.1757). Saving model...
processed 51362 tokens with 5942 phrases; found: 4331 phrases; correct: 3154.
accuracy:  56.18%; (non-O)
accuracy:  92.38%; precision:  72.82%; recall:  53.08%; FB1:  61.40
              LOC: precision:  83.06%; recall:  64.34%; FB1:  72.52  1423
             MISC: precision:  78.18%; recall:  49.35%; FB1:  60.51  582
              ORG: precision:  55.31%; recall:  48.92%; FB1:  51.

In [18]:
# Model architechture:
print(model)

BiLSTM(
  (embedding): Embedding(8128, 100)
  (bilstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (linear): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=1.0)
  (classifier): Linear(in_features=128, out_features=9, bias=True)
)


Hyperparameters:

1) Number Of Epochs: 20

2) Optimizer: AdamW

3) Learning rate: 0.001

4) Best Model saved based on F1 score

5) Dropout: 0.33

6) Vocab size: 8128

### **Results on Validation data**

In [19]:
# Model on validation set
model.load_state_dict(torch.load('model.pt'))
model.eval()
preds = []
label_list = []
with torch.no_grad():
    for data in validation_loader:
        input_ids, labels, lengths = data['input_ids'], data['labels'], data['lengths']
        logits, loss = model(input_ids, labels)
        predictions = torch.argmax(logits, dim=2)

        for pred, label, length in zip(predictions.tolist(), labels.tolist(), lengths):
            decoded_label = [idx2tag[l] for l in label]
            label_list.extend([decoded_label[:length]])
            trimmed_pred = pred[:length]
            decoded_pred = [idx2tag[p] for p in trimmed_pred]
            preds.extend([decoded_pred])

flat_preds = list(itertools.chain(*preds))
flat_labels = list(itertools.chain(*label_list))
precision, recall, f1 = evaluate(flat_labels, flat_preds)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

processed 51362 tokens with 5942 phrases; found: 5431 phrases; correct: 4481.
accuracy:  78.07%; (non-O)
accuracy:  95.85%; precision:  82.51%; recall:  75.41%; FB1:  78.80
              LOC: precision:  90.90%; recall:  82.14%; FB1:  86.30  1660
             MISC: precision:  83.31%; recall:  74.73%; FB1:  78.79  827
              ORG: precision:  74.74%; recall:  69.50%; FB1:  72.02  1247
              PER: precision:  79.61%; recall:  73.34%; FB1:  76.35  1697
Precision: 82.50782544651078
Recall: 75.41231908448334
F1 Score: 78.80066824936252


In [20]:
# What are the precision, recall, and F1 score on the validation data?.
# precision:  82.51%; recall:  75.41%; FB1:  78.80

### **Results on test data**

In [21]:
# Model on test set
model.load_state_dict(torch.load('model.pt'))
model.eval()
preds = []
label_list = []
with torch.no_grad():
    for data in test_loader:
        input_ids, labels, lengths = data['input_ids'], data['labels'], data['lengths']
        logits, loss = model(input_ids, labels)
        predictions = torch.argmax(logits, dim=2)

        for pred, label, length in zip(predictions.tolist(), labels.tolist(), lengths):
            decoded_label = [idx2tag[l] for l in label]
            label_list.extend([decoded_label[:length]])
            trimmed_pred = pred[:length]
            decoded_pred = [idx2tag[p] for p in trimmed_pred]
            preds.extend([decoded_pred])

flat_preds = list(itertools.chain(*preds))
flat_labels = list(itertools.chain(*label_list))
precision, recall, f1 = evaluate(flat_labels, flat_preds)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

processed 46435 tokens with 5648 phrases; found: 4981 phrases; correct: 3709.
accuracy:  70.38%; (non-O)
accuracy:  93.86%; precision:  74.46%; recall:  65.67%; FB1:  69.79
              LOC: precision:  85.47%; recall:  74.04%; FB1:  79.34  1445
             MISC: precision:  65.77%; recall:  62.68%; FB1:  64.19  669
              ORG: precision:  70.23%; recall:  59.66%; FB1:  64.52  1411
              PER: precision:  71.63%; recall:  64.50%; FB1:  67.88  1456
Precision: 74.4629592451315
Recall: 65.66926345609065
F1 Score: 69.79019663185625


In [22]:
# What are the precision, recall, and F1 score on the test data?.
# precision:  74.46%; recall:  65.67%; FB1:  69.79

**Task 2**

In [23]:
vocab, embeddings = [], []
with open('glove.6B.100d.txt', 'rt', encoding='utf-8') as fi:
    full_content = fi.read().strip().split('\n')

for line in full_content:
    parts = line.split(' ')
    word = parts[0]
    embedding = [float(val) for val in parts[1:]]
    vocab.append(word)
    embeddings.append(embedding)

vocab = ['[PAD]', '[UNK]'] + vocab
pad_emb_npa = np.zeros((1, 100))  # embedding for '<pad>' token
unk_emb_npa = np.mean(embeddings, axis=0, keepdims=True)  # embedding for '<unk>' token

# Insert embeddings for pad and unk tokens at the top of embs_npa.
embs_npa = np.vstack((pad_emb_npa, unk_emb_npa, embeddings))

vocab_npa = np.array(vocab)
embs_npa = np.array(embs_npa)

print(len(embs_npa))
print(len(vocab_npa))

400002
400002


In [24]:
vocab_size = len(vocab_npa)

In [25]:
vocab_npa

array(['[PAD]', '[UNK]', 'the', ..., 'rolonda', 'zsombor', 'sandberger'],
      dtype='<U68')

In [26]:
dataset_glove = datasets.load_dataset("conll2003")

word_frequency = Counter()

word2idx_glove = {
    word: index
    for index, word in enumerate(vocab_npa)
}

# Iterating the dataset to replace unknown tokens with [UNK]
dataset_glove = (
    dataset_glove
    .map(lambda x: {
            'input_ids': [
                word2idx_glove.get(word.lower(), word2idx_glove['[UNK]'])
                for word in x['tokens']
            ]
        }
    )
)

# Removing columns pos_tags & chunk_tags; Renaming column ner_tags to labels
for split in dataset_glove.keys():
    dataset_glove[split] = dataset_glove[split].remove_columns(['pos_tags', 'chunk_tags'])
    dataset_glove[split] = dataset_glove[split].rename_column('ner_tags', 'labels')

train_data = dataset_glove['train']
validation_data = dataset_glove['validation']
test_data = dataset_glove['test']

In [27]:
# Model Architecture
class BiLSTMGlove(nn.Module):
    def __init__(self, vocab_size, embedding_dim, lstm_hidden_dim, output_dim):
        super(BiLSTMGlove, self).__init__()
        droupout_val = 0.33

        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embs_npa).float(), freeze=True)
        self.upper_embedding = nn.Embedding(2,10)
        self.lower_embedding = nn.Embedding(2,10)
        self.title_embedding = nn.Embedding(2,10)

        self.bilstm = nn.LSTM(embedding_dim+30, lstm_hidden_dim, num_layers = 1, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(droupout_val)
        self.linear = nn.Linear(lstm_hidden_dim*2, output_dim)
        self.elu = nn.ELU()
        self.classifier = nn.Linear(output_dim, len(ner_tags)-1)

    def forward(self, x, is_upper, lower_case, title_case, labels=None):
        embed = self.embedding(x)
        upper = self.upper_embedding(is_upper)
        lower = self.lower_embedding(lower_case)
        title = self.lower_embedding(title_case)
        features = torch.cat((embed, upper, lower, title), dim=-1)
        lstm_out, _ = self.bilstm(features)
        drop = self.dropout(lstm_out)
        linear = self.linear(drop)
        elu_out = self.elu(linear)
        logits = self.classifier(elu_out)

        loss = None
        if labels is not None:
            logits_flatten = logits.view(-1, logits.shape[-1])
            labels_flatten = labels.view(-1)
            loss = nn.functional.cross_entropy(logits_flatten, labels_flatten,ignore_index=9)

        return logits, loss

In [28]:
def collate_fun_glove(batch):

    input_ids = [torch.tensor(item['input_ids'], device=device) for item in batch]
    labels = [torch.tensor(item['labels'], device=device) for item in batch]
    lengths = [len(label) for label in labels]

    # Additional features
    upper_case = [torch.tensor([1 if word.isupper() else 0 for word in item['tokens']], dtype=torch.long,device=device) for item in batch]
    lower_case = [torch.tensor([1 if word.islower() else 0 for word in item['tokens']], dtype=torch.long,device=device) for item in batch]
    title_case = [torch.tensor([1 if word.istitle() else 0 for word in item['tokens']], dtype=torch.long,device=device) for item in batch]

    upper_case_padded = torch.nn.utils.rnn.pad_sequence(upper_case, batch_first=True, padding_value=0)
    lower_case_padded = torch.nn.utils.rnn.pad_sequence(lower_case, batch_first=True, padding_value=0)
    title_case_padded = torch.nn.utils.rnn.pad_sequence(title_case, batch_first=True, padding_value=0)

    input_ids_padded = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=0)
    labels_padded = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=9)

    return {'input_ids': input_ids_padded, 'labels': labels_padded, 'lengths': lengths, 'upper_case': upper_case_padded, 'lower_case': lower_case_padded,
            'title_case': title_case_padded}

batch_size = 32
train_loader = DataLoader(train_data, batch_size= batch_size, collate_fn = collate_fun_glove)
test_loader = DataLoader(test_data, batch_size= batch_size, collate_fn = collate_fun_glove)
validation_loader = DataLoader(validation_data, batch_size= batch_size, collate_fn = collate_fun_glove)

In [29]:
model_glove = BiLSTMGlove(vocab_size=vocab_size, embedding_dim=100, lstm_hidden_dim=256, output_dim=128).to(device)
optimizer = AdamW(model_glove.parameters(),lr=0.001)
best_val_f1 = 0.0

In [30]:
num_epochs = 30  # Number of epochs
for epoch in range(num_epochs):
    model_glove.train()
    total_loss = 0
    total_f1 = 0
    for data in train_loader:
        input_ids, labels, lengths, upper_case, lower_case, title_case = data['input_ids'], data['labels'], data['lengths'], data['upper_case'], data['lower_case'], data['title_case']
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        logits, loss = model_glove(input_ids, upper_case, lower_case, title_case, labels)

        if loss is not None:
            loss.backward()
            # perform a single optimization step (parameter update)
            optimizer.step()
            total_loss += loss.item()

        predictions = torch.argmax(logits, dim=-1).view(-1)
        labels_flat = labels.view(-1)

    model_glove.eval()
    valid_loss, valid_f1 = 0, 0
    all_val_predictions, all_val_labels = [], []

    with torch.no_grad():
        for data in validation_loader:
            input_ids, labels, lengths, upper_case, lower_case, title_case = data['input_ids'], data['labels'], data['lengths'], data['upper_case'], data['lower_case'], data['title_case']
            logits, loss = model_glove(input_ids,  upper_case,  lower_case, title_case, labels)
            valid_loss += loss.item()

            val_predictions = torch.argmax(logits, dim=-1).tolist()

            for i, length in enumerate(lengths):
                seq_preds = val_predictions[i][:length]
                seq_labels = labels[i, :length].tolist()
                mapped_seq_preds = [idx2tag[p] for p in seq_preds]
                mapped_seq_labels = [idx2tag[l] for l in seq_labels]

                all_val_predictions.extend(mapped_seq_preds)
                all_val_labels.extend(mapped_seq_labels)

        flat_preds = list(itertools.chain(*all_val_predictions))
        flat_labels = list(itertools.chain(*all_val_labels))
        precision, recall, f1 = evaluate(all_val_labels, all_val_predictions)


        print(f"Epoch: {epoch}, Train Loss: {total_loss / len(train_loader)}")
        print(f"Epoch: {epoch}, Validation Loss: {valid_loss / len(validation_loader)}")

        if f1 > best_val_f1:
            print(f'Validation F1 increased ({best_val_f1:.4f} --> {f1:.4f}). Saving model...')
            torch.save(model_glove.state_dict(), 'model_glove.pt')
            best_val_f1 = f1

processed 51362 tokens with 5942 phrases; found: 6050 phrases; correct: 4851.
accuracy:  82.66%; (non-O)
accuracy:  96.85%; precision:  80.18%; recall:  81.64%; FB1:  80.90
              LOC: precision:  84.45%; recall:  86.34%; FB1:  85.38  1878
             MISC: precision:  69.18%; recall:  71.58%; FB1:  70.36  954
              ORG: precision:  67.93%; recall:  68.38%; FB1:  68.15  1350
              PER: precision:  90.36%; recall:  91.64%; FB1:  91.00  1868
Epoch: 0, Train Loss: 0.2715232573366875
Epoch: 0, Validation Loss: 0.10504391064922161
Validation F1 increased (0.0000 --> 80.9039). Saving model...
processed 51362 tokens with 5942 phrases; found: 6041 phrases; correct: 5154.
accuracy:  87.25%; (non-O)
accuracy:  97.67%; precision:  85.32%; recall:  86.74%; FB1:  86.02
              LOC: precision:  88.01%; recall:  93.47%; FB1:  90.65  1951
             MISC: precision:  73.92%; recall:  79.93%; FB1:  76.81  997
              ORG: precision:  78.57%; recall:  74.65%; FB1:  

In [31]:
# Model Architecture:
print(model_glove)

BiLSTMGlove(
  (embedding): Embedding(400002, 100)
  (upper_embedding): Embedding(2, 10)
  (lower_embedding): Embedding(2, 10)
  (title_embedding): Embedding(2, 10)
  (bilstm): LSTM(130, 256, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (linear): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=1.0)
  (classifier): Linear(in_features=128, out_features=9, bias=True)
)


Hyperparameters

1) Number Of Epochs: 30

2) Optimizer: AdamW

3) Learning rate: 0.001

4) Best Model saved based on F1 score

5) Dropout: 0.33

6) Vocab size: 23625

7) Additional features used: isUpper, isTitle, isLower

Model with Validation data

In [32]:
# Model on validation set
model_glove.load_state_dict(torch.load('model_glove.pt'))
model_glove.eval()
preds = []
label_list = []
with torch.no_grad():
    for data in validation_loader:
        input_ids, labels, lengths, upper_case, lower_case, title_case = data['input_ids'], data['labels'], data['lengths'], data['upper_case'], data['lower_case'], data['title_case']
        logits, loss = model_glove(input_ids,  upper_case,  lower_case, title_case, labels)
        predictions = torch.argmax(logits, dim=2)

        for pred, label, length in zip(predictions.tolist(), labels.tolist(), lengths):
            decoded_label = [idx2tag[l] for l in label]
            label_list.extend([decoded_label[:length]])
            trimmed_pred = pred[:length]
            decoded_pred = [idx2tag[p] for p in trimmed_pred]
            preds.extend([decoded_pred])

flat_preds = list(itertools.chain(*preds))
flat_labels = list(itertools.chain(*label_list))
precision, recall, f1 = evaluate(flat_labels, flat_preds)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

processed 51362 tokens with 5942 phrases; found: 6023 phrases; correct: 5550.
accuracy:  93.72%; (non-O)
accuracy:  98.77%; precision:  92.15%; recall:  93.40%; FB1:  92.77
              LOC: precision:  95.46%; recall:  96.08%; FB1:  95.77  1849
             MISC: precision:  85.25%; recall:  85.25%; FB1:  85.25  922
              ORG: precision:  88.17%; recall:  90.60%; FB1:  89.37  1378
              PER: precision:  95.20%; recall:  96.85%; FB1:  96.02  1874
Precision: 92.14677071226963
Recall: 93.40289464826658
F1 Score: 92.77058086084413


In [37]:
# What are the precision, recall, and F1 score on the validation data?.
# 98.77%; precision:  92.15%; recall:  93.40%; FB1:  92.77

Model with Test Data

In [34]:
# Model on test set
model_glove.load_state_dict(torch.load('model_glove.pt',map_location=torch.device(device)))
model_glove.eval()
preds = []
label_list = []
with torch.no_grad():
    for data in test_loader:
        input_ids, labels, lengths, upper_case, lower_case, title_case = data['input_ids'], data['labels'], data['lengths'], data['upper_case'], data['lower_case'], data['title_case']
        logits, loss = model_glove(input_ids,  upper_case,  lower_case, title_case, labels)
        predictions = torch.argmax(logits, dim=2)

        for pred, label, length in zip(predictions.tolist(), labels.tolist(), lengths):
            decoded_label = [idx2tag[l] for l in label]
            label_list.extend([decoded_label[:length]])
            trimmed_pred = pred[:length]
            decoded_pred = [idx2tag[p] for p in trimmed_pred]
            preds.extend([decoded_pred])

flat_preds = list(itertools.chain(*preds))
flat_labels = list(itertools.chain(*label_list))
precision, recall, f1 = evaluate(flat_labels, flat_preds)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

processed 46435 tokens with 5648 phrases; found: 5785 phrases; correct: 5045.
accuracy:  90.62%; (non-O)
accuracy:  97.70%; precision:  87.21%; recall:  89.32%; FB1:  88.25
              LOC: precision:  89.53%; recall:  92.75%; FB1:  91.11  1728
             MISC: precision:  72.11%; recall:  77.35%; FB1:  74.64  753
              ORG: precision:  84.44%; recall:  86.57%; FB1:  85.49  1703
              PER: precision:  94.75%; recall:  93.82%; FB1:  94.28  1601
Precision: 87.20829732065687
Recall: 89.32365439093485
F1 Score: 88.25330184553486


In [36]:
# What are the precision, recall, and F1 score on the test data?.
# precision:  87.21%; recall:  89.32%; FB1:  88.25

BiLSTM with Glove Embeddings outperforms the model without. Can you
provide a rationale for this?


Glove Embedding is a pretrained word embedding. It is trained on large datasets and captures the semantic and syntactic meaning of the word. Learning word embeddings from scratch is challenging. While learning from scratch on our dataset we will face issue of sparsity on the training data. The vocabulary of our training dataset might not be rich enough. This is overcome by using Glove embeddings. Due to these reasons BiLSTM with Glove Embeddings outperforms the model without Glove Embeddings.