In [None]:
#Importing the required libraries 
import numpy as np
import pandas as pd
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR
import random
import json
# Seed PyTorch's and Python's random number generators so repeated runs start
# from the same RNG state.
torch.manual_seed(0)
random.seed(0)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def prep_train(dataset):
    """Group token rows into sentences of words and NER tags.

    A new sentence starts at every row whose ``id`` equals the string ``'1'``
    (except for the very first row, which simply opens the first sentence).

    Args:
        dataset: DataFrame with ``id``, ``word`` and ``NER`` columns, one row
            per token, in reading order.

    Returns:
        A ``(sentences, tag_sequences)`` tuple of parallel lists of lists.
        The sentence being built is always appended at the end, so an empty
        dataset yields ``([[]], [[]])``.
    """
    sentences, tag_sequences = [], []
    words, tags = [], []
    for position, row in enumerate(dataset.itertuples()):
        # An id of '1' on any row but the first closes the current sentence.
        if position > 0 and row.id == '1':
            sentences.append(words)
            tag_sequences.append(tags)
            words, tags = [], []
        words.append(row.word)
        tags.append(row.NER)

    # Flush the sentence that was still open when the rows ran out.
    sentences.append(words)
    tag_sequences.append(tags)
    return sentences, tag_sequences


def read_file(path):
    """Read a labelled, space-separated token file into sentences and tags.

    Every line longer than two characters must hold exactly three
    space-separated fields (id, word, NER tag); shorter lines are skipped.

    Args:
        path: Path of the text file to read.

    Returns:
        The ``(sentences, tag_sequences)`` pair produced by ``prep_train``.

    Raises:
        ValueError: If a kept line does not split into exactly three fields.
    """
    records = []
    with open(path, 'r') as handle:
        for raw_line in handle.readlines():
            if len(raw_line) <= 2:
                # Too short to hold a token record (e.g. a separator line).
                continue
            token_id, token, tag = raw_line.strip().split(" ")
            records.append([token_id, token, tag])

    frame = pd.DataFrame(records, columns=['id', 'word', 'NER']).dropna()
    return prep_train(frame)

In [None]:
def prep_test(dataset):
    """Group unlabelled token rows into sentences of words.

    A new sentence starts at every row whose ``id`` equals the string ``'1'``
    (except for the very first row).

    Args:
        dataset: DataFrame with ``id`` and ``word`` columns, one row per
            token, in reading order.

    Returns:
        A list of sentences, each a list of words. The sentence being built is
        always appended at the end, so an empty dataset yields ``[[]]``.
    """
    sentences = []
    words = []
    for position, row in enumerate(dataset.itertuples()):
        # An id of '1' on any row but the first closes the current sentence.
        if position > 0 and row.id == '1':
            sentences.append(words)
            words = []
        words.append(row.word)

    # Flush the sentence that was still open when the rows ran out.
    sentences.append(words)
    return sentences


def read_test(path):
    """Read an unlabelled, space-separated token file into sentences.

    Every line longer than one character must hold exactly two
    space-separated fields (id, word); shorter lines are skipped.

    Args:
        path: Path of the text file to read.

    Returns:
        The list of sentences produced by ``prep_test``.

    Raises:
        ValueError: If a kept line does not split into exactly two fields.
    """
    records = []
    with open(path, 'r') as handle:
        for raw_line in handle.readlines():
            if len(raw_line) <= 1:
                # Too short to hold a token record (e.g. a separator line).
                continue
            token_id, token = raw_line.strip().split(" ")
            records.append([token_id, token])

    frame = pd.DataFrame(records, columns=['id', 'word']).dropna()
    return prep_test(frame)

In [None]:
# Load the three splits: train and dev carry NER tags (words + tags per
# sentence); the test split holds words only.
train_x, train_y = read_file('./data/train')
val_x, val_y = read_file('./data/dev')
test_x = read_test('./data/test')

## BiLSTM: <br>
The model takes as input a batch of token sequences represented as integers, together with their lengths. It passes each sequence through an embedding layer to obtain a dense representation for every token, and then through a BiLSTM layer to capture the contextual information of each token. Dropout is applied to the BiLSTM output, followed by a linear layer, an ELU activation, and a final linear classifier that produces one unnormalized score (logit) per tag for every token. The softmax that turns these scores into a probability distribution over the set of tags is applied inside the cross-entropy loss.

In [None]:
class BiLSTM(nn.Module):
    """Token tagger: embedding -> LSTM -> dropout -> linear -> ELU -> linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        output_dim: Width of the intermediate linear layer.
        hidden_dim: LSTM hidden size per direction.
        lstm_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM also reads the sequence backwards.
        dropout_val: Dropout probability applied to the LSTM output.
        num_classes: Number of possible tags in the output.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim, hidden_dim, lstm_layers,
                 bidirectional, dropout_val, num_classes):
        super(BiLSTM, self).__init__()
        # Hyper-parameters kept on the instance for init_hidden and inspection.
        self.hidden_dim = hidden_dim
        self.lstm_layers = lstm_layers
        self.embedding_dim = embedding_dim
        self.output_dim = output_dim
        self.num_classes = num_classes
        self.num_directions = 2 if bidirectional else 1

        # Embedding table, initialised uniformly in [-1, 1].
        self.embedding = nn.Embedding(
            vocab_size, embedding_dim)
        self.embedding.weight.data.uniform_(-1, 1)
        # The LSTM must honour the `bidirectional` flag: `fc` and `init_hidden`
        # are sized from `num_directions`, so a hard-coded direction count here
        # would break the model whenever bidirectional=False is requested.
        self.LSTM = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=lstm_layers,
                            batch_first=True,
                            bidirectional=bool(bidirectional))
        self.fc = nn.Linear(hidden_dim*self.num_directions,
                            output_dim)
        self.dropout = nn.Dropout(dropout_val)
        self.elu = nn.ELU(alpha=0.01)
        self.classifier = nn.Linear(output_dim, self.num_classes)

    def init_hidden(self, batch_size, device=None):
        """Return zero-filled initial hidden and cell states.

        Args:
            batch_size: Number of sequences in the batch.
            device: Device for the states; defaults to the device that holds
                the embedding weights, so no global variable is required.

        Returns:
            A ``(h, c)`` tuple, each of shape
            ``(lstm_layers * num_directions, batch_size, hidden_dim)``.
        """
        if device is None:
            device = self.embedding.weight.device
        shape = (self.lstm_layers * self.num_directions,
                 batch_size, self.hidden_dim)
        h = torch.zeros(shape, device=device)
        c = torch.zeros(shape, device=device)
        return h, c

    def forward(self, input_seq, seq_len):
        """Compute per-token tag scores.

        Args:
            input_seq: Integer tensor of token ids, batch first (the batch
                size is read from dimension 0).
            seq_len: Unpadded length of every sequence in the batch.

        Returns:
            Tensor of unnormalized tag scores, one vector of ``num_classes``
            values per token position.
        """
        # Initial states live on the same device as the input batch.
        batch_size = input_seq.shape[0]
        h_0, c_0 = self.init_hidden(batch_size, input_seq.device)
        # Embedding layer
        embedded = self.embedding(input_seq).float()
        # Pack so the LSTM skips padded positions; batches need not be sorted.
        packed_embedded = pack_padded_sequence(
            embedded, seq_len, batch_first=True, enforce_sorted=False)
        output, _ = self.LSTM(packed_embedded, (h_0, c_0))
        output_unpacked, _ = pad_packed_sequence(output, batch_first=True)
        # Output layers
        dropout = self.dropout(output_unpacked)
        lin = self.fc(dropout)
        pred = self.elu(lin)
        pred = self.classifier(pred)
        return pred

## BiLSTM Glove: <br>
The network consists of an embedding layer, initialized from the supplied pre-trained embedding matrix, that converts the integer representations of the input tokens to dense vectors; a bidirectional LSTM layer that processes the embedded sequence; a dropout layer that randomly sets some of the LSTM activations to zero during training to prevent overfitting; a fully connected layer that applies a linear transformation to the result; an ELU (Exponential Linear Unit) activation layer that applies the element-wise function f(x) = alpha * (exp(x) - 1) for x < 0 and f(x) = x for x >= 0; and a final linear classifier layer that maps the output to the number of classes in the classification task. <br>
The LSTM layer is initialized with the specified number of layers, hidden size, and bidirectional flag. The input sequences are expected to be of variable length and are handled using PyTorch's pack_padded_sequence and pad_packed_sequence functions.

In [None]:
class BiLSTM_Glove(nn.Module):
    """Token tagger whose embedding table starts from a pre-trained matrix.

    Pipeline: embedding -> LSTM -> dropout -> linear -> ELU -> linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        output_dim: Width of the intermediate linear layer.
        hidden_dim: LSTM hidden size per direction.
        lstm_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM also reads the sequence backwards.
        dropout_val: Dropout probability applied to the LSTM output.
        num_classes: Number of possible tags in the output.
        emb_matrix: Array-like of initial embedding weights, one row per
            token id. The weights remain trainable.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim, hidden_dim, lstm_layers,
                 bidirectional, dropout_val, num_classes, emb_matrix):
        super(BiLSTM_Glove, self).__init__()
        # Hyper-parameters kept on the instance for init_hidden and inspection.
        self.hidden_dim = hidden_dim
        self.lstm_layers = lstm_layers
        self.embedding_dim = embedding_dim
        self.output_dim = output_dim
        self.num_classes = num_classes
        self.emb_matrix = emb_matrix
        self.num_directions = 2 if bidirectional else 1

        # Embedding table seeded with a float32 copy of the pre-trained matrix.
        # torch.tensor copies, so training never mutates the caller's array,
        # and float32 matches the dtype of the other layers.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(
            torch.tensor(emb_matrix, dtype=torch.float32))
        # The LSTM must honour the `bidirectional` flag: `fc` and `init_hidden`
        # are sized from `num_directions`, so a hard-coded direction count here
        # would break the model whenever bidirectional=False is requested.
        self.LSTM = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=lstm_layers,
                            batch_first=True,
                            bidirectional=bool(bidirectional))
        self.fc = nn.Linear(hidden_dim*self.num_directions, output_dim)
        self.dropout = nn.Dropout(dropout_val)
        self.elu = nn.ELU(alpha=0.01)
        self.classifier = nn.Linear(output_dim, self.num_classes)

    def init_hidden(self, batch_size, device=None):
        """Return zero-filled initial hidden and cell states.

        Args:
            batch_size: Number of sequences in the batch.
            device: Device for the states; defaults to the device that holds
                the embedding weights, so no global variable is required.

        Returns:
            A ``(h, c)`` tuple, each of shape
            ``(lstm_layers * num_directions, batch_size, hidden_dim)``.
        """
        if device is None:
            device = self.embedding.weight.device
        shape = (self.lstm_layers * self.num_directions,
                 batch_size, self.hidden_dim)
        h = torch.zeros(shape, device=device)
        c = torch.zeros(shape, device=device)
        return h, c

    def forward(self, input_seq, seq_len):
        """Compute per-token tag scores.

        Args:
            input_seq: Integer tensor of token ids, batch first (the batch
                size is read from dimension 0).
            seq_len: Unpadded length of every sequence in the batch.

        Returns:
            Tensor of unnormalized tag scores, one vector of ``num_classes``
            values per token position.
        """
        # Initial states live on the same device as the input batch.
        batch_size = input_seq.shape[0]
        h_0, c_0 = self.init_hidden(batch_size, input_seq.device)

        # Embedding layer
        embedded = self.embedding(input_seq).float()
        # Pack so the LSTM skips padded positions; batches need not be sorted.
        packed_embedded = pack_padded_sequence(embedded, seq_len, batch_first=True, enforce_sorted=False)
        output, _ = self.LSTM(packed_embedded, (h_0, c_0))
        output_unpacked, _ = pad_packed_sequence(output, batch_first=True)
        # Output layers
        dropout = self.dropout(output_unpacked)
        lin = self.fc(dropout)
        pred = self.elu(lin)
        pred = self.classifier(pred)
        return pred

### BiLSTM_DataLoader <br>
Despite its name, this class is a PyTorch `Dataset`. Here, inputs is a list of vectorized sentences (lists of token indices) and targets is a list of the label-index sequences corresponding to each input sentence. <br>
The `__getitem__` method returns a tuple of two tensors, where input_instance is a tensor of token indices representing one input sentence and target_instance is a tensor of the corresponding label indices. <br>
### BiLSTM_TestLoader <br>
BiLSTM_TestLoader, also a `Dataset`, takes only one argument, inputs, which is a list of vectorized sentences. <br>
The `__getitem__` method returns a single tensor input_instance of token indices representing one input sentence. The purpose of this class is to serve the test data during inference, where labels are not available.

In [None]:
class BiLSTM_DataLoader(Dataset):
    """Dataset pairing each vectorized sentence with its label sequence."""

    def __init__(self, inputs, targets):
        # Parallel sequences: inputs[i] belongs with targets[i].
        self.inputs, self.targets = inputs, targets

    def __len__(self):
        """Number of sentences, taken from the inputs."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the ``(input, target)`` tensors stored at ``index``."""
        features = self.inputs[index]
        labels = self.targets[index]
        return torch.tensor(features), torch.tensor(labels)

class BiLSTM_TestLoader(Dataset):
    """Dataset over vectorized sentences that have no labels."""

    def __init__(self, inputs):
        self.inputs = inputs

    def __len__(self):
        """Number of sentences."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the sentence stored at ``index`` as a tensor."""
        sentence = self.inputs[index]
        return torch.tensor(sentence)


In [None]:
class data_collator(object):
    """Collate ``(tokens, tags)`` pairs into padded batch tensors.

    Args:
        vocab: Word-to-index mapping; its ``'<PAD>'`` entry fills token padding.
        label: Label-to-index mapping (stored, not used while collating).
        pad_label: Value written to padded label positions. The default of
            -100 equals the default ``ignore_index`` of
            ``torch.nn.CrossEntropyLoss``, so padding does not contribute to
            the loss.
    """

    def __init__(self, vocab, label, pad_label=-100):
        self.params = vocab
        self.label = label
        self.pad_label = pad_label

    def __call__(self, batch):
        """Pad one batch to the length of its longest sentence.

        Args:
            batch: Sequence of ``(tokens, tags)`` pairs, each a 1-D sequence
                of integer ids.

        Returns:
            ``(batch_data, batch_labels, input_len, target_len)``: two int64
            tensors of shape ``(batch, max_len)`` and the unpadded lengths of
            the token and tag sequences.
        """
        (input_data, target_data) = zip(*batch)
        input_len = [len(x) for x in input_data]
        target_len = [len(y) for y in target_data]
        batch_max_len = max(input_len)

        # Start from all-padding tensors, then copy each sentence in.
        # Label padding must not be a valid class index: padding with 0 would
        # train the model to predict class 0 on padded positions.
        batch_data = torch.full((len(input_data), batch_max_len),
                                self.params['<PAD>'], dtype=torch.long)
        batch_labels = torch.full((len(input_data), batch_max_len),
                                  self.pad_label, dtype=torch.long)
        for j, (tokens, tags) in enumerate(zip(input_data, target_data)):
            cur_len = input_len[j]
            batch_data[j, :cur_len] = torch.as_tensor(tokens, dtype=torch.long)
            batch_labels[j, :cur_len] = torch.as_tensor(tags, dtype=torch.long)

        return batch_data, batch_labels, input_len, target_len

class test_data_collator(object):
    """Collate unlabelled sentences into one padded batch tensor."""

    def __init__(self, vocab, label):
        # `vocab` supplies the '<PAD>' fill value; `label` is only stored.
        self.params = vocab
        self.label = label

    def __call__(self, batch):
        """Pad a batch to its longest sentence.

        Args:
            batch: Sequence of 1-D token-id sequences.

        Returns:
            ``(batch_data, input_len)``: a LongTensor of shape
            ``(batch, max_len)`` and the list of unpadded lengths.
        """
        lengths = [len(sentence) for sentence in batch]
        widest = max(lengths)

        # Float buffer pre-filled with the padding id, then overwritten
        # row by row with the real token ids.
        padded = np.full((len(batch), widest), self.params['<PAD>'],
                         dtype=np.float64)
        for row, sentence in enumerate(batch):
            padded[row][:lengths[row]] = sentence

        return torch.LongTensor(padded), lengths

The prepare_vocabulary function takes a dataset (list of sentences) as input and returns the set of unique words in the dataset.

In [None]:
def prepare_vocabulary(dataset):
    """Return the set of distinct items found across all sequences.

    Args:
        dataset: Iterable of sequences (e.g. sentences of words or tags).

    Returns:
        A set holding every item that occurs in any sequence.
    """
    return {item for sequence in dataset for item in sequence}

prepare_word_index takes three lists of sentences (training, validation, and test) and creates a word-to-index dictionary that maps each unique word to a unique index.

In [None]:
def prepare_word_index(train_data, val_data, test_data):
    """Build a word-to-index mapping over the three data splits.

    Ids 0 and 1 are reserved for ``"<PAD>"`` and ``"<UNK>"``. Every other word
    receives the next free id in order of first appearance, scanning the
    training split first, then validation, then test.

    Args:
        train_data: Training sentences (lists of words).
        val_data: Validation sentences.
        test_data: Test sentences.

    Returns:
        Dict mapping each word to its integer id.
    """
    word_index = {"<PAD>": 0, "<UNK>": 1}
    for split in (train_data, val_data, test_data):
        for sentence in split:
            for token in sentence:
                # Ids are contiguous from 0, so the current size of the
                # mapping is always the next unused id.
                word_index.setdefault(token, len(word_index))
    return word_index


The vectorize_sentences function takes the training dataset and the word-to-index dictionary word_index as input and converts each sentence to a list of word indices based on the mapping in word_index.

In [None]:
def vectorize_sentences(train_data, word_index):
    """Convert sentences of words into sentences of word ids.

    Words missing from ``word_index`` map to the id of ``"<UNK>"`` when the
    index defines that entry, so unseen text can still be vectorized.

    Args:
        train_data: Iterable of sentences (iterables of words).
        word_index: Dict mapping words to integer ids.

    Returns:
        List of lists of ids, parallel to ``train_data``.

    Raises:
        KeyError: If a word is missing and ``word_index`` has no ``"<UNK>"``.
    """
    unk_index = word_index.get("<UNK>")
    vectorized = []
    for words in train_data:
        indices = []
        for word in words:
            if word in word_index:
                indices.append(word_index[word])
            elif unk_index is not None:
                indices.append(unk_index)
            else:
                raise KeyError(word)
        vectorized.append(indices)
    return vectorized


prepare_label_dict takes the training and validation target datasets as input and returns a dictionary that maps each unique tag in the datasets to a unique index.

In [None]:
def prepare_label_dict(train_labels, val_labels):
    """Map every tag seen in the training or validation labels to an index.

    Tags are numbered in sorted order. Numbering them in set-iteration order
    would not be stable between interpreter sessions, and checkpoints saved in
    one session would then disagree with the label map built in another.

    Args:
        train_labels: Training tag sequences (lists of tags).
        val_labels: Validation tag sequences.

    Returns:
        Dict mapping each distinct tag to a unique index in ``0..n-1``.
    """
    labels = set()
    for dataset in (train_labels, val_labels):
        for tags in dataset:
            labels.update(tags)
    return {tag: index for index, tag in enumerate(sorted(labels))}

vectorize_labels takes the training target dataset and the label dictionary as input and converts each target sequence to a list of tag indices based on the mapping in label_dict.

In [None]:
def vectorize_labels(train_labels, label_dict):
    """Convert tag sequences into sequences of tag indices.

    Args:
        train_labels: Iterable of tag sequences.
        label_dict: Dict mapping each tag to its integer index.

    Returns:
        List of lists of indices, parallel to ``train_labels``.

    Raises:
        KeyError: If a tag is missing from ``label_dict``.
    """
    return [[label_dict[tag] for tag in sequence] for sequence in train_labels]

In [None]:
# Build the word index over all three splits, then turn every sentence into a
# list of word ids.
word_idx = prepare_word_index(train_x, val_x, test_x)
train_x_vec = vectorize_sentences(train_x, word_idx)
test_x_vec = vectorize_sentences(test_x, word_idx)
val_x_vec = vectorize_sentences(val_x, word_idx)
# Build the tag index from the labelled splits and vectorize their tags.
label_dict = prepare_label_dict(train_y, val_y)
train_y_vec = vectorize_labels(train_y, label_dict)
val_y_vec = vectorize_labels(val_y, label_dict)

The create_embedding_matrix function takes a word-to-index dictionary, a dictionary mapping words to their corresponding GloVe embeddings, and an embedding dimension as input. It returns a matrix where each row corresponds to a word in word_index and contains that word's GloVe embedding. If a word is not present in embedding_dict, its embedding is set to the embedding of its lowercase version plus a small constant offset of 5e-3 (if the lowercase version is present in embedding_dict), or otherwise to the `<UNK>` embedding.

In [None]:
def create_embedding_matrix(word_index, embedding_dict, dimension):
    """Assemble the embedding matrix for the vocabulary.

    Row ``word_index[w]`` receives, in order of preference: the vector stored
    for ``w``; the vector stored for ``w.lower()`` shifted by 5e-3; or the
    vector stored under ``"<UNK>"``.

    Args:
        word_index: Dict mapping words to row indices.
        embedding_dict: Dict mapping words to embedding vectors; it must hold
            ``"<UNK>"`` whenever some word has no exact or lowercase match.
        dimension: Length of each embedding vector.

    Returns:
        Array of shape ``(len(word_index), dimension)``.
    """
    matrix = np.zeros((len(word_index), dimension))
    for token, row in word_index.items():
        if token in embedding_dict:
            vector = embedding_dict[token]
        elif token.lower() in embedding_dict:
            # Offset the borrowed lowercase vector by a small constant.
            vector = embedding_dict[token.lower()] + 5e-3
        else:
            vector = embedding_dict["<UNK>"]
        matrix[row] = vector
    return matrix


Here, we are assigning class weights to handle class imbalance in the training data. By assigning higher weights to under-represented classes, the model is encouraged to pay more attention to these classes during training. The class weights are calculated using the formula:

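class_weight = max(1.0, round(ln(0.35 * total_number_of_tags / frequency_of_label), 2))

That is, the logarithmic score is rounded to two decimals and floored at 1.0, so no class is ever down-weighted; a label that never occurs also receives weight 1.0.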

In [None]:
def initialize_class_weights(label_dict, train_y, val_y):
    """Compute per-class loss weights from tag frequencies.

    A tag that occurs ``count`` times among ``total`` tags receives
    ``round(ln(0.35 * total / count), 2)``, floored at 1.0; a tag that never
    occurs receives 1.0.

    Args:
        label_dict: Dict mapping each tag to its class index.
        train_y: Training tag sequences.
        val_y: Validation tag sequences.

    Returns:
        1-D tensor of weights ordered by class index, so position ``k`` is the
        weight of the tag whose index is ``k`` regardless of the order in
        which ``label_dict`` happens to store its keys.

    Raises:
        KeyError: If a tag in the data is missing from ``label_dict``.
    """
    tag_counts = {tag: 0 for tag in label_dict}
    total_nm_tags = 0
    for data in (train_y, val_y):
        for tags in data:
            for tag in tags:
                total_nm_tags += 1
                tag_counts[tag] += 1

    class_wt = []
    # Walk the tags by class index, not by dict order, so that the weight
    # vector lines up with the model's output columns.
    for tag in sorted(label_dict, key=label_dict.get):
        count = tag_counts[tag]
        if count:
            score = round(math.log(0.35 * total_nm_tags / count), 2)
            weight = score if score > 1.0 else 1.0
        else:
            weight = 1.0
        class_wt.append(weight)
    return torch.tensor(class_wt)


# Per-class loss weights computed from the tag counts of the training and
# validation labels.
class_wt = initialize_class_weights(label_dict, train_y, val_y)

In [None]:
# Task 1: train the BiLSTM tagger with randomly initialised embeddings.
BiLSTM_model = BiLSTM(vocab_size=len(word_idx),
                      embedding_dim=100,
                      output_dim=128,
                      hidden_dim=256,
                      lstm_layers=1,
                      bidirectional=True,
                      dropout_val=0.33,
                      num_classes=len(label_dict))
BiLSTM_model.to(device)
print(BiLSTM_model)

BiLSTM_train = BiLSTM_DataLoader(train_x_vec, train_y_vec)
custom_collator = data_collator(word_idx, label_dict)
dataloader = DataLoader(dataset=BiLSTM_train,
                        batch_size=4,
                        drop_last=True,
                        collate_fn=custom_collator)

# Class-weighted cross-entropy; moving the criterion also moves its weights.
criterion = nn.CrossEntropyLoss(weight=class_wt)
criterion = criterion.to(device)
optimizer = torch.optim.SGD(BiLSTM_model.parameters(), lr=0.1, momentum=0.9)
epochs = 200
#epochs = 5

for i in range(1, epochs+1):
    # Make sure dropout is active even if the model was switched to eval mode.
    BiLSTM_model.train()
    train_loss = 0.0
    num_sentences = 0
    for batch_inputs, batch_labels, input_len, label_len in dataloader:
        optimizer.zero_grad()
        output = BiLSTM_model(batch_inputs.to(device), input_len)
        # Flatten to (batch * seq_len, classes) / (batch * seq_len,) for the loss.
        output = output.view(-1, len(label_dict))
        batch_labels = batch_labels.view(-1)
        loss = criterion(output, batch_labels.to(device))
        loss.backward()
        optimizer.step()
        # Weight each batch's mean loss by its batch size (dimension 0), and
        # average over the sentences actually seen (drop_last may skip some).
        train_loss += loss.item() * batch_inputs.size(0)
        num_sentences += batch_inputs.size(0)

    train_loss = train_loss / max(num_sentences, 1)
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(i, train_loss))
    # Keep one checkpoint per epoch.
    torch.save(BiLSTM_model.state_dict(),
               'BiLSTM_b1_epoch_' + str(i) + '.pt')


BiLSTM(
  (embedding): Embedding(30292, 100)
  (LSTM): LSTM(100, 256, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=128, bias=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (elu): ELU(alpha=0.01)
  (classifier): Linear(in_features=128, out_features=9, bias=True)
)
Epoch: 1 	Training Loss: 2.669663
Epoch: 2 	Training Loss: 1.857107
Epoch: 3 	Training Loss: 1.366157
Epoch: 4 	Training Loss: 1.046358
Epoch: 5 	Training Loss: 0.784568
Epoch: 6 	Training Loss: 0.601561
Epoch: 7 	Training Loss: 0.470704
Epoch: 8 	Training Loss: 0.370218
Epoch: 9 	Training Loss: 0.288217
Epoch: 10 	Training Loss: 0.231052
Epoch: 11 	Training Loss: 0.205535
Epoch: 12 	Training Loss: 0.187001
Epoch: 13 	Training Loss: 0.154262
Epoch: 14 	Training Loss: 0.138245
Epoch: 15 	Training Loss: 0.107758
Epoch: 16 	Training Loss: 0.104288
Epoch: 17 	Training Loss: 0.087305
Epoch: 18 	Training Loss: 0.081432
Epoch: 19 	Training Loss: 0.061895
Epoch: 20 	Training Loss: 0.058798
Ep

In [None]:
# Task 1: write dev-set predictions (index, word, gold tag, predicted tag).
BiLSTM_dev = BiLSTM_DataLoader(val_x_vec, val_y_vec)
custom_collator = data_collator(word_idx, label_dict)
# drop_last=False: no evaluation sentence may ever be skipped.
dataloader_dev = DataLoader(dataset=BiLSTM_dev,
                            batch_size=1,
                            shuffle=False,
                            drop_last=False,
                            collate_fn=custom_collator)

# Reverse lookups from ids back to tags and words.
rev_label_dict = {v: k for k, v in label_dict.items()}
rev_vocab_dict = {v: k for k, v in word_idx.items()}
pad_id = word_idx['<PAD>']

# Every checkpoint is evaluated in turn; dev1.out is overwritten each time,
# so it ends up holding the predictions of the last epoch.
for e in range(1,epochs + 1):
    BiLSTM_model.load_state_dict(
        torch.load("./BiLSTM_b1_epoch_"+str(e)+".pt", map_location=device))#125
    BiLSTM_model.to(device)
    # Inference mode: without eval() dropout would keep perturbing predictions.
    BiLSTM_model.eval()

    with open("./dev1.out", 'w') as file, torch.no_grad():
        for dev_data, label, dev_data_len, label_data_len in dataloader_dev:

            pred = BiLSTM_model(dev_data.to(device), dev_data_len)
            pred = pred.cpu().numpy()
            label = label.numpy()
            dev_data = dev_data.numpy()
            # Most likely tag per token.
            pred = np.argmax(pred, axis=2)
            pred = pred.reshape((len(label), -1))

            for i in range(len(dev_data)):
                for j in range(len(dev_data[i])):
                    # Skip padded positions.
                    if dev_data[i][j] != pad_id:
                        word = rev_vocab_dict[dev_data[i][j]]
                        gold = rev_label_dict[label[i][j]]
                        op = rev_label_dict[pred[i][j]]
                        file.write(" ".join([str(j+1), word, gold, op]))
                        file.write("\n")
                # Blank line separates sentences.
                file.write("\n")
    #!perl conll03eval.txt < dev1.out


In [None]:
!perl conll03eval.txt < dev1.out

processed 51578 tokens with 5942 phrases; found: 5712 phrases; correct: 4418.
accuracy:  95.10%; precision:  77.35%; recall:  74.35%; FB1:  75.82
              LOC: precision:  84.42%; recall:  81.44%; FB1:  82.90  1772
             MISC: precision:  81.43%; recall:  75.60%; FB1:  78.40  856
              ORG: precision:  66.25%; recall:  71.29%; FB1:  68.68  1443
              PER: precision:  77.33%; recall:  68.89%; FB1:  72.87  1641


In [None]:
# Task 1: write test-set predictions (index, word, predicted tag).
BiLSTM_test = BiLSTM_TestLoader(test_x_vec)
custom_test_collator = test_data_collator(word_idx, label_dict)
# drop_last=False: no test sentence may ever be skipped.
dataloader_test = DataLoader(dataset=BiLSTM_test,
                                batch_size=1,
                                shuffle=False,
                                drop_last=False,
                                collate_fn=custom_test_collator)

# Reverse lookups from ids back to tags and words.
rev_label_dict = {v: k for k, v in label_dict.items()}
rev_vocab_dict = {v: k for k, v in word_idx.items()}
pad_id = word_idx['<PAD>']

# Every checkpoint is run in turn; test1.out is overwritten each time, so it
# ends up holding the predictions of the last epoch.
for e in range(1,epochs + 1):
    BiLSTM_model.load_state_dict(
        torch.load("./BiLSTM_b1_epoch_"+str(e)+".pt", map_location=device))
    BiLSTM_model.to(device)
    # Inference mode: without eval() dropout would keep perturbing predictions.
    BiLSTM_model.eval()

    with open("test1.out", 'w') as file, torch.no_grad():
        for test_data, test_data_len in dataloader_test:

            pred = BiLSTM_model(test_data.to(device), test_data_len)
            pred = pred.cpu().numpy()
            test_data = test_data.numpy()
            # Most likely tag per token.
            pred = np.argmax(pred, axis=2)
            pred = pred.reshape((len(test_data), -1))

            for i in range(len(test_data)):
                for j in range(len(test_data[i])):
                    # Skip padded positions.
                    if test_data[i][j] != pad_id:
                        word = rev_vocab_dict[test_data[i][j]]
                        op = rev_label_dict[pred[i][j]]
                        file.write(" ".join([str(j+1), word, op]))
                        file.write("\n")

                # Blank line separates sentences.
                file.write("\n")

### Task 2

Here, we read the pre-trained GloVe word embeddings from the given file, and create an embedding matrix for the words in the vocabulary of the training, validation, and test data.

In [None]:

# Read the GloVe file: column 0 holds the token, the rest its vector.
# keep_default_na=False stops pandas from turning tokens such as "nan" or
# "null" into missing values, which would lose them as dictionary keys.
glove = pd.read_csv('./glove.6B.100d.txt', sep=" ",
                    quoting=3, header=None, index_col=0,
                    keep_default_na=False)
# One row per token; pairing the index with the rows avoids transposing and
# iterating the whole frame column by column.
glove_vec = glove.to_numpy()
glove_emb = dict(zip(glove.index, glove_vec))
embedding_dim = glove_vec.shape[1]

word_idx = prepare_word_index(train_x, val_x, test_x)
# Special tokens: zeros for padding, the mean GloVe vector for unknown words.
glove_emb["<PAD>"] = np.zeros((embedding_dim,), dtype="float64")
glove_emb["<UNK>"] = glove_vec.mean(axis=0)
emb_matrix = create_embedding_matrix(word_idx, glove_emb, embedding_dim)

vocab_size = emb_matrix.shape[0]
vector_size = emb_matrix.shape[1]
print(vocab_size, vector_size)

30292 100


In [None]:
# Task 2: train the BiLSTM tagger with GloVe-initialised embeddings.
BiLSTM_model = BiLSTM_Glove(vocab_size=len(word_idx),
                      embedding_dim=100,
                      output_dim=128,
                      hidden_dim=256,
                      lstm_layers=1,
                      bidirectional=True,
                      dropout_val=0.33,
                      num_classes=len(label_dict),
                      emb_matrix=emb_matrix)
BiLSTM_model.to(device)
print(BiLSTM_model)

BiLSTM_train = BiLSTM_DataLoader(train_x_vec, train_y_vec)
custom_collator = data_collator(word_idx, label_dict)
dataloader = DataLoader(dataset=BiLSTM_train,
                        batch_size= 8,
                        drop_last=True,
                        collate_fn=custom_collator)

# Class-weighted cross-entropy; moving the criterion also moves its weights.
criterion = nn.CrossEntropyLoss(weight=class_wt)
criterion = criterion.to(device)
optimizer = torch.optim.SGD(BiLSTM_model.parameters(), lr=0.1, momentum=0.9)
# Multiply the learning rate by 0.9 every 15 epochs.
scheduler = StepLR(optimizer, step_size=15, gamma=0.9)
epochs = 50

for i in range(1, epochs+1):
    # Make sure dropout is active even if the model was switched to eval mode.
    BiLSTM_model.train()
    train_loss = 0.0
    num_sentences = 0
    for batch_inputs, batch_labels, input_len, label_len in dataloader:
        optimizer.zero_grad()
        output = BiLSTM_model(batch_inputs.to(device), input_len)
        # Flatten to (batch * seq_len, classes) / (batch * seq_len,) for the loss.
        output = output.view(-1, len(label_dict))
        batch_labels = batch_labels.view(-1)
        loss = criterion(output, batch_labels.to(device))
        loss.backward()
        optimizer.step()
        # Weight each batch's mean loss by its batch size (dimension 0), and
        # average over the sentences actually seen (drop_last may skip some).
        train_loss += loss.item() * batch_inputs.size(0)
        num_sentences += batch_inputs.size(0)

    # The schedule only takes effect when stepped once per epoch.
    scheduler.step()

    train_loss = train_loss / max(num_sentences, 1)
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(i, train_loss))
    # Keep one checkpoint per epoch.
    torch.save(BiLSTM_model.state_dict(),
               'BiLSTM_glove_' + str(i) + '.pt')

BiLSTM_Glove(
  (embedding): Embedding(30292, 100)
  (LSTM): LSTM(100, 256, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=128, bias=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (elu): ELU(alpha=0.01)
  (classifier): Linear(in_features=128, out_features=9, bias=True)
)
Epoch: 1 	Training Loss: 0.722542
Epoch: 2 	Training Loss: 0.338636
Epoch: 3 	Training Loss: 0.225984
Epoch: 4 	Training Loss: 0.174482
Epoch: 5 	Training Loss: 0.137744
Epoch: 6 	Training Loss: 0.110661
Epoch: 7 	Training Loss: 0.090044
Epoch: 8 	Training Loss: 0.075623
Epoch: 9 	Training Loss: 0.062208
Epoch: 10 	Training Loss: 0.052554
Epoch: 11 	Training Loss: 0.044381
Epoch: 12 	Training Loss: 0.036774
Epoch: 13 	Training Loss: 0.031368
Epoch: 14 	Training Loss: 0.027189
Epoch: 15 	Training Loss: 0.023500
Epoch: 16 	Training Loss: 0.020209
Epoch: 17 	Training Loss: 0.018036
Epoch: 18 	Training Loss: 0.018845
Epoch: 19 	Training Loss: 0.015091
Epoch: 20 	Training Loss: 0.012

In [None]:
# Task 2: write dev-set predictions (index, word, gold tag, predicted tag).
BiLSTM_dev = BiLSTM_DataLoader(val_x_vec, val_y_vec)
custom_collator = data_collator(word_idx, label_dict)
# drop_last=False: no evaluation sentence may ever be skipped.
dataloader_dev = DataLoader(dataset=BiLSTM_dev,
                            batch_size=1,
                            shuffle=False,
                            drop_last=False,
                            collate_fn=custom_collator)

# Build the model once; each checkpoint only replaces its weights.
BiLSTM_model = BiLSTM_Glove(vocab_size=len(word_idx),
                    embedding_dim=100,
                    output_dim=128,
                    hidden_dim=256,
                    lstm_layers=1,
                    bidirectional=True,
                    dropout_val=0.33,
                    num_classes=len(label_dict),
                    emb_matrix = emb_matrix)

# Reverse lookups from ids back to tags and words.
rev_label_dict = {v: k for k, v in label_dict.items()}
rev_vocab_dict = {v: k for k, v in word_idx.items()}
pad_id = word_idx['<PAD>']

# Checkpoints 1..50 are evaluated in turn; dev2.out is overwritten each time,
# so it ends up holding the predictions of the last checkpoint.
for e in range(1, 51):
    BiLSTM_model.load_state_dict(
        torch.load("./BiLSTM_glove_"+str(e)+".pt", map_location=device))
    BiLSTM_model.to(device)
    # Inference mode: without eval() dropout would keep perturbing predictions.
    BiLSTM_model.eval()

    with open("dev2.out", 'w') as file, torch.no_grad():
        for dev_data, label, dev_data_len, label_data_len in dataloader_dev:

            pred = BiLSTM_model(dev_data.to(device), dev_data_len)
            pred = pred.cpu().numpy()
            label = label.numpy()
            dev_data = dev_data.numpy()
            # Most likely tag per token.
            pred = np.argmax(pred, axis=2)
            pred = pred.reshape((len(label), -1))

            for i in range(len(dev_data)):
                for j in range(len(dev_data[i])):
                    # Skip padded positions.
                    if dev_data[i][j] != pad_id:
                        word = rev_vocab_dict[dev_data[i][j]]
                        gold = rev_label_dict[label[i][j]]
                        op = rev_label_dict[pred[i][j]]
                        file.write(" ".join([str(j+1), word, gold, op]))
                        file.write("\n")
                # Blank line separates sentences.
                file.write("\n")

    #!perl conll03eval.txt < dev2.out

In [None]:
!perl conll03eval.txt < dev2.out

processed 51578 tokens with 5942 phrases; found: 6054 phrases; correct: 5357.
accuracy:  98.04%; precision:  88.49%; recall:  90.15%; FB1:  89.31
              LOC: precision:  93.32%; recall:  94.28%; FB1:  93.80  1856
             MISC: precision:  81.82%; recall:  83.95%; FB1:  82.87  946
              ORG: precision:  82.52%; recall:  84.86%; FB1:  83.68  1379
              PER: precision:  91.46%; recall:  93.00%; FB1:  92.22  1873


In [None]:
# Task 2: write test-set predictions (index, word, predicted tag).
BiLSTM_test = BiLSTM_TestLoader(test_x_vec)
custom_test_collator = test_data_collator(word_idx, label_dict)
# drop_last=False: no test sentence may ever be skipped.
dataloader_test = DataLoader(dataset=BiLSTM_test,
                                batch_size=1,
                                shuffle=False,
                                drop_last=False,
                                collate_fn=custom_test_collator)

# Reverse lookups from ids back to tags and words.
rev_label_dict = {v: k for k, v in label_dict.items()}
rev_vocab_dict = {v: k for k, v in word_idx.items()}
pad_id = word_idx['<PAD>']

# Every checkpoint is run in turn; test2.out is overwritten each time, so it
# ends up holding the predictions of the last epoch.
epochs = 50
for e in range(1,epochs+1 ):
    BiLSTM_model.load_state_dict(
        torch.load("./BiLSTM_glove_"+str(e)+".pt", map_location=device))
    BiLSTM_model.to(device)
    # Inference mode: without eval() dropout would keep perturbing predictions.
    BiLSTM_model.eval()

    with open("test2.out", 'w') as file, torch.no_grad():
        for test_data, test_data_len in dataloader_test:

            pred = BiLSTM_model(test_data.to(device), test_data_len)
            pred = pred.cpu().numpy()
            test_data = test_data.numpy()
            # Most likely tag per token.
            pred = np.argmax(pred, axis=2)
            pred = pred.reshape((len(test_data), -1))

            for i in range(len(test_data)):
                for j in range(len(test_data[i])):
                    # Skip padded positions.
                    if test_data[i][j] != pad_id:
                        word = rev_vocab_dict[test_data[i][j]]
                        op = rev_label_dict[pred[i][j]]
                        file.write(" ".join([str(j+1), word, op]))
                        file.write("\n")

                # Blank line separates sentences.
                file.write("\n")