# Importing libraries

In [1]:
import numpy as np
import pandas as pd
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR
import random
import json
torch.manual_seed(0)
random.seed(0)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Preparing Data

In [2]:
# df_train = pd.read_csv('./data/train', sep=' ', names = ['id', 'word', 'NER'])
df_train = list()
with open('./data/train', 'r') as f:
    for line in f.readlines():
        if len(line) > 1:
            id, word, ner= line.strip().split(" ")
            df_train.append([id, word, ner])

df_train = pd.DataFrame(df_train, columns=['id', 'word', 'NER'])
df_train = df_train.dropna()


In [3]:
train_x, train_y = [], []
x, y = [], []
first=1

for row in df_train.itertuples():
    if(row.id == '1' and first == 0):
        train_x.append(x)
        train_y.append(y)
        x=[]
        y=[]
    first=0
    x.append(row.word)
    y.append(row.NER)


In [4]:
# df_dev = pd.read_csv('./data/dev', sep=' ', names = ['id', 'word', 'NER'])
df_dev = list()
with open('./data/dev', 'r') as f:
    for line in f.readlines():
        if len(line) > 1:
            id, word, ner = line.strip().split(" ")
            df_dev.append([id, word, ner])

df_dev = pd.DataFrame(df_dev, columns=['id', 'word', 'NER'])
df_dev = df_dev.dropna()


In [5]:
dev_x, dev_y = [], []
x, y = [], []
first=1

for row in df_dev.itertuples():
    if(row.id == '1' and first == 0):
        dev_x.append(x)
        dev_y.append(y)
        x=[]
        y=[]
    first=0
    x.append(row.word)
    y.append(row.NER)


In [6]:
df_test = list()
with open('./data/test', 'r') as f:
    for line in f.readlines():
        if len(line) > 1:
            id, word = line.strip().split(" ")
            df_test.append([id, word])

df_test = pd.DataFrame(df_test, columns=['id', 'word'])
df_test = df_test.dropna()


In [7]:
test_x = []
x = []
first=1

for row in df_test.itertuples():
    if(row.id == '1' and first == 0):
        test_x.append(x)
        x=[]
    first=0
    x.append(row.word)
    

In [8]:
train_x

[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 ['Peter', 'Blackburn'],
 ['BRUSSELS', '1996-08-22'],
 ['The',
  'European',
  'Commission',
  'said',
  'on',
  'Thursday',
  'it',
  'disagreed',
  'with',
  'German',
  'advice',
  'to',
  'consumers',
  'to',
  'shun',
  'British',
  'lamb',
  'until',
  'scientists',
  'determine',
  'whether',
  'mad',
  'cow',
  'disease',
  'can',
  'be',
  'transmitted',
  'to',
  'sheep',
  '.'],
 ['Germany',
  "'s",
  'representative',
  'to',
  'the',
  'European',
  'Union',
  "'s",
  'veterinary',
  'committee',
  'Werner',
  'Zwingmann',
  'said',
  'on',
  'Wednesday',
  'consumers',
  'should',
  'buy',
  'sheepmeat',
  'from',
  'countries',
  'other',
  'than',
  'Britain',
  'until',
  'the',
  'scientific',
  'advice',
  'was',
  'clearer',
  '.'],
 ['"',
  'We',
  'do',
  "n't",
  'support',
  'any',
  'such',
  'recommendation',
  'because',
  'we',
  'do',
  "n't",
  'see',
  'any',
  'grounds',
  'fo

# Creating vocabulary and labels

In [9]:
index=1
word2idx={'<pad>': 0}
for x in [train_x, dev_x, test_x]:
    for sentence in x:
        for word in sentence:
            if word not in word2idx:
                word2idx[word] = index
                index+=1
                

In [10]:
len(word2idx)

30291

In [11]:
labels = set()
label_dict = {}
index=0

for x in [train_y, dev_y]:
    for sentence in x:
        for label in sentence:
            labels.add(label)
            if label not in label_dict:
                label_dict[label] = index
                index+=1
                

In [12]:
label_dict

{'B-ORG': 0,
 'O': 1,
 'B-MISC': 2,
 'B-PER': 3,
 'I-PER': 4,
 'B-LOC': 5,
 'I-ORG': 6,
 'I-MISC': 7,
 'I-LOC': 8}

# Vectorizing sentences and labels

In [13]:
train_x_vec = []
x = []

for words in train_x:
    for word in words:
        x.append(word2idx[word])
    train_x_vec.append(x)
    x = []


In [14]:
dev_x_vec = []
x = []

for words in dev_x:
    for word in words:
        x.append(word2idx[word])
    dev_x_vec.append(x)
    x = []


In [15]:
test_x_vec = []
x = []

for words in test_x:
    for word in words:
        x.append(word2idx[word])
    test_x_vec.append(x)
    x = []


In [16]:
train_y_vec = []

for tags in train_y:
    y = []
    for label in tags:
        y.append(label_dict[label])
    train_y_vec.append(y)


In [17]:
dev_y_vec = []

for tags in dev_y:
    y = []
    for label in tags:
        y.append(label_dict[label])
    dev_y_vec.append(y)


# Bidirectional LSTM 

In [18]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, linear_out_dim, hidden_dim, lstm_layers,
                 bidirectional, dropout_val, tag_size):
        super(BiLSTM, self).__init__()
        """ Hyper Parameters """
        self.hidden_dim = hidden_dim  # hidden_dim = 256
        self.lstm_layers = lstm_layers  # LSTM Layers = 1
        self.embedding_dim = embedding_dim  # Embedding Dimension = 100
        self.linear_out_dim = linear_out_dim  # Linear Ouput Dimension = 128
        self.tag_size = tag_size  # Tag Size = 9
        self.num_directions = 2 if bidirectional else 1

        """ Initializing Network """
        self.embedding = nn.Embedding(
            vocab_size, embedding_dim)  # Embedding Layer
        self.embedding.weight.data.uniform_(-1,1)
        self.LSTM = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=lstm_layers,
                            batch_first=True,
                            bidirectional=True)
        self.fc = nn.Linear(hidden_dim*self.num_directions,
                            linear_out_dim)  # 2 for bidirection
        self.dropout = nn.Dropout(dropout_val)
        self.elu = nn.ELU(alpha=0.01)
        self.classifier = nn.Linear(linear_out_dim, self.tag_size)

    def init_hidden(self, batch_size):
        h, c = (torch.zeros(self.lstm_layers * self.num_directions,
                            batch_size, self.hidden_dim).to(device),
                torch.zeros(self.lstm_layers * self.num_directions,
                            batch_size, self.hidden_dim).to(device))
        return h, c

    def forward(self, sen, sen_len):  # sen_len
        # Set initial states
        batch_size = sen.shape[0]
        h_0, c_0 = self.init_hidden(batch_size)

        # Forward propagate LSTM
        embedded = self.embedding(sen).float()
        packed_embedded = pack_padded_sequence(
            embedded, sen_len, batch_first=True, enforce_sorted=False)
        output, _ = self.LSTM(packed_embedded, (h_0, c_0))
        output_unpacked, _ = pad_packed_sequence(output, batch_first=True)
        dropout = self.dropout(output_unpacked)
        lin = self.fc(dropout)
        pred = self.elu(lin)
        pred = self.classifier(pred)
        return pred
    

In [19]:
class BiLSTM_DataLoader(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        x_instance = torch.tensor(self.x[index])  # , dtype=torch.long
        y_instance = torch.tensor(self.y[index])  # , dtype=torch.float
        return x_instance, y_instance
    

In [20]:
class CustomCollator(object):

    def __init__(self, vocab, label):
        self.params = vocab
        self.label = label

    def __call__(self, batch):
        (xx, yy) = zip(*batch)
        x_len = [len(x) for x in xx]
        y_len = [len(y) for y in yy]
        batch_max_len = max([len(s) for s in xx])
        batch_data = self.params['<pad>']*np.ones((len(xx), batch_max_len))
        batch_labels = -1*np.zeros((len(xx), batch_max_len))
        for j in range(len(xx)):
            cur_len = len(xx[j])
            batch_data[j][:cur_len] = xx[j]
            batch_labels[j][:cur_len] = yy[j]

        batch_data, batch_labels = torch.LongTensor(batch_data), torch.LongTensor(batch_labels)
        batch_data, batch_labels = Variable(batch_data), Variable(batch_labels)

        return batch_data, batch_labels, x_len, y_len
    

In [21]:
def initialize_class_weights(label_dict, train_y, dev_y):
    class_weights = dict()
    for key in label_dict:
        class_weights[key] = 0
    total_nm_tags = 0
    for data in [train_y, dev_y]:
        for tags in data:
            for tag in tags:
                total_nm_tags += 1
                class_weights[tag] += 1

    class_wt = list()
    for key in class_weights.keys():
        if class_weights[key]:
            score = round(math.log(0.35*total_nm_tags / class_weights[key]), 2)
            class_weights[key] = score if score > 1.0 else 1.0
        else:
            class_weights[key] = 1.0
        class_wt.append(class_weights[key])
    class_wt = torch.tensor(class_wt)
    return class_wt


class_wt = initialize_class_weights(label_dict, train_y, dev_y)
print(class_wt)


tensor([2.4600, 1.0000, 3.0200, 2.3600, 2.7300, 2.3000, 3.0000, 4.0900, 4.1500])


In [22]:
BiLSTM_model = BiLSTM(vocab_size=len(word2idx),
                      embedding_dim=100,
                      linear_out_dim=128,
                      hidden_dim=256,
                      lstm_layers=1,
                      bidirectional=True,
                      dropout_val=0.33,
                      tag_size=len(label_dict))
# BiLSTM_model.load_state_dict(torch.load("./BiLSTM_epoch_10.pt"))
BiLSTM_model.to(device)
print(BiLSTM_model)

BiLSTM_train = BiLSTM_DataLoader(train_x_vec, train_y_vec)
custom_collator = CustomCollator(word2idx, label_dict)
dataloader = DataLoader(dataset=BiLSTM_train,
                        batch_size=4,
                        drop_last=True,
                        collate_fn=custom_collator)

criterion = nn.CrossEntropyLoss(weight=class_wt)
# criterion = nn.NLLLoss(weight=class_wt)
# criterion = loss_fn
criterion = criterion.to(device)
criterion.requres_grad = True
optimizer = torch.optim.SGD(BiLSTM_model.parameters(), lr=0.1, momentum=0.9)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min")
#scheduler = StepLR(optimizer, step_size=15, gamma=0.9)
epochs = 200

for i in range(1, epochs+1):
    train_loss = 0.0
    # scheduler.step(train_loss)
    for input, label, input_len, label_len in dataloader:
        optimizer.zero_grad()
        output = BiLSTM_model(input.to(device), input_len)  # input_len
        output = output.view(-1, len(label_dict))
        label = label.view(-1)
        loss = criterion(output, label.to(device))
        # print(loss)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * input.size(1)

    train_loss = train_loss / len(dataloader.dataset)
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(i, train_loss))
    torch.save(BiLSTM_model.state_dict(),
               'BiLSTM_epoch_' + str(i) + '.pt')


BiLSTM(
  (embedding): Embedding(30291, 100)
  (LSTM): LSTM(100, 256, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=128, bias=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (elu): ELU(alpha=0.01)
  (classifier): Linear(in_features=128, out_features=9, bias=True)
)
Epoch: 1 	Training Loss: 2.944907
Epoch: 2 	Training Loss: 2.057616
Epoch: 3 	Training Loss: 1.515170
Epoch: 4 	Training Loss: 1.148587
Epoch: 5 	Training Loss: 0.881196


In [23]:
#tesing on validation data
BiLSTM_dev = BiLSTM_DataLoader(dev_x_vec, dev_y_vec)
custom_collator = CustomCollator(word2idx, label_dict)
dataloader_dev = DataLoader(dataset=BiLSTM_dev,
                            batch_size=1,
                            shuffle=False,
                            drop_last=True,
                            collate_fn=custom_collator)

# Reverse vocab and label Dictionary                            
rev_label_dict = {v: k for k, v in label_dict.items()}
rev_vocab_dict = {v: k for k, v in word2idx.items()}


file = open("dev1_train.out", 'w')
for dev_data, label, dev_data_len, label_data_len in dataloader_dev:

    pred = BiLSTM_model(dev_data.to(device), dev_data_len)
    pred = pred.cpu()
    pred = pred.detach().numpy()
    label = label.detach().numpy()
    dev_data = dev_data.detach().numpy()
    pred = np.argmax(pred, axis=2)
    pred = pred.reshape((len(label), -1))

    for i in range(len(dev_data)):
        for j in range(len(dev_data[i])):
            if dev_data[i][j] != 0:
                word = rev_vocab_dict[dev_data[i][j]]
                gold = rev_label_dict[label[i][j]]
                op = rev_label_dict[pred[i][j]]
                file.write(" ".join([str(j+1), str(word), gold, op]))
                file.write("\n")
        file.write("\n")

file.close()


In [24]:
!perl conll03eval.txt < dev1_train.out

processed 51577 tokens with 5942 phrases; found: 7459 phrases; correct: 3870.
accuracy:  90.30%; precision:  51.88%; recall:  65.13%; FB1:  57.76
              LOC: precision:  69.97%; recall:  72.56%; FB1:  71.25  1905
             MISC: precision:  58.85%; recall:  59.87%; FB1:  59.35  938
              ORG: precision:  44.34%; recall:  53.77%; FB1:  48.60  1626
              PER: precision:  42.27%; recall:  68.62%; FB1:  52.32  2990


# GloVe Word Embeddings

In [25]:
def create_emb_matrix(word_idx, emb_dict, dimension):

    emb_matrix = np.zeros((len(word_idx), dimension))
    for word, idx in word_idx.items():
        if word in emb_dict:
            emb_matrix[idx] = emb_dict[word]
        else:
            if word.lower() in emb_dict:
                emb_matrix[idx] = emb_dict[word.lower()] + 5e-3
            else:
                pass
                #emb_matrix[idx] = emb_dict["<unk>"]
    return emb_matrix

In [26]:
glove = pd.read_csv('./glove.6B.100d', sep=" ",quoting=3, header=None, index_col=0)
glove_emb = {key: val.values for key, val in glove.T.items()}

# word_idx = prep_word_index(train_x, dev_x, test_x)
glove_vec = np.array([glove_emb[key] for key in glove_emb])
glove_emb["<pad>"] = np.zeros((100,), dtype="float64")
# glove_emb["<unk>"] = np.mean(glove_vec, axis=0, keepdims=True).reshape(100,)
emb_matrix = create_emb_matrix(word_idx=word2idx, emb_dict=glove_emb, dimension=100)

# emb_matrix = np.load('emb_matrix.npy')

vocab_size = emb_matrix.shape[0]
vector_size = emb_matrix.shape[1]
print(vocab_size, vector_size)

30291 100


In [27]:
class BiLSTM_glove(nn.Module):
    def __init__(self, vocab_size, embedding_dim, linear_out_dim, hidden_dim, lstm_layers,
                 bidirectional, dropout_val, tag_size, emb_matrix):
        super(BiLSTM_glove, self).__init__()
        """ Hyper Parameters """
        self.hidden_dim = hidden_dim  # hidden_dim = 256
        self.lstm_layers = lstm_layers  # LSTM Layers = 1
        self.embedding_dim = embedding_dim  # Embedding Dimension = 100
        self.linear_out_dim = linear_out_dim  # Linear Ouput Dimension = 128
        self.tag_size = tag_size  # Tag Size = 9
        self.emb_matrix = emb_matrix
        self.num_directions = 2 if bidirectional else 1

        """ Initializing Network """
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # Embedding Layer
        self.embedding.weight = nn.Parameter(torch.tensor(emb_matrix))
        self.LSTM = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=lstm_layers,
                            batch_first=True,
                            bidirectional=True)
        self.fc = nn.Linear(hidden_dim*self.num_directions, linear_out_dim)  # 2 for bidirection
        self.dropout = nn.Dropout(dropout_val)
        self.elu = nn.ELU(alpha=0.01)
        self.classifier = nn.Linear(linear_out_dim, self.tag_size)

    def init_hidden(self, batch_size):
        h, c = (torch.zeros(self.lstm_layers * self.num_directions,
                            batch_size, self.hidden_dim).to(device),
                torch.zeros(self.lstm_layers * self.num_directions,
                            batch_size, self.hidden_dim).to(device))
        return h, c

    def forward(self, sen, sen_len):  # sen_len
        # Set initial states
        batch_size = sen.shape[0]
        h_0, c_0 = self.init_hidden(batch_size)

        # Forward propagate LSTM
        embedded = self.embedding(sen).float()
        packed_embedded = pack_padded_sequence(embedded, sen_len, batch_first=True, enforce_sorted=False)
        output, _ = self.LSTM(packed_embedded, (h_0, c_0))
        output_unpacked, _ = pad_packed_sequence(output, batch_first=True)
        dropout = self.dropout(output_unpacked)
        lin = self.fc(dropout)
        pred = self.elu(lin)
        pred = self.classifier(pred)
        return pred

In [28]:
BiLSTM_model = BiLSTM_glove(vocab_size=len(word2idx),
                      embedding_dim=100,
                      linear_out_dim=128,
                      hidden_dim=256,
                      lstm_layers=1,
                      bidirectional=True,
                      dropout_val=0.33,
                      tag_size=len(label_dict),
                      emb_matrix=emb_matrix)
# BiLSTM_model.load_state_dict(torch.load("./BiLSTM_glove_20.pt"))
BiLSTM_model.to(device)
print(BiLSTM_model)

BiLSTM_train = BiLSTM_DataLoader(train_x_vec, train_y_vec)
custom_collator = CustomCollator(word2idx, label_dict)
dataloader = DataLoader(dataset=BiLSTM_train,
                        batch_size=8,
                        drop_last=True,
                        collate_fn=custom_collator)

criterion = nn.CrossEntropyLoss(weight=class_wt)
# criterion = nn.NLLLoss(weight=class_wt)
# criterion = loss_fn
criterion = criterion.to(device)
criterion.requres_grad = True
optimizer = torch.optim.SGD(BiLSTM_model.parameters(), lr=0.1, momentum=0.9)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min")
scheduler = StepLR(optimizer, step_size=15, gamma=0.9)
epochs = 50

for i in range(1, epochs+1):
    train_loss = 0.0
    # scheduler.step(train_loss)
    for input, label, input_len, label_len in dataloader:
        optimizer.zero_grad()
        output = BiLSTM_model(input.to(device), input_len)  # input_len
        output = output.view(-1, len(label_dict))
        label = label.view(-1)
        loss = criterion(output, label.to(device))
        # print(loss)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * input.size(1)

    train_loss = train_loss / len(dataloader.dataset)
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(i, train_loss))
    torch.save(BiLSTM_model.state_dict(),
               'BiLSTM_glove_' + str(i) + '.pt')



BiLSTM_glove(
  (embedding): Embedding(30291, 100)
  (LSTM): LSTM(100, 256, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=128, bias=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (elu): ELU(alpha=0.01)
  (classifier): Linear(in_features=128, out_features=9, bias=True)
)
Epoch: 1 	Training Loss: 0.768211
Epoch: 2 	Training Loss: 0.362263
Epoch: 3 	Training Loss: 0.249146
Epoch: 4 	Training Loss: 0.192236
Epoch: 5 	Training Loss: 0.152660


In [29]:
#predicting for validation dataset
BiLSTM_dev = BiLSTM_DataLoader(dev_x_vec, dev_y_vec)
custom_collator = CustomCollator(word2idx, label_dict)
dataloader_dev = DataLoader(dataset=BiLSTM_dev,
                            batch_size=8,
                            shuffle=False,
                            drop_last=True,
                            collate_fn=custom_collator)
print(label_dict)
rev_label_dict = {v: k for k, v in label_dict.items()}
rev_vocab_dict = {v: k for k, v in word2idx.items()}

res = []
file = open("dev2_train.out", 'w')
for dev_data, label, dev_data_len, label_data_len in dataloader_dev:

    pred = BiLSTM_model(dev_data.to(device), dev_data_len)
    pred = pred.cpu()
    pred = pred.detach().numpy()
    label = label.detach().numpy()
    dev_data = dev_data.detach().numpy()
    pred = np.argmax(pred, axis=2)
    pred = pred.reshape((len(label), -1))

    for i in range(len(dev_data)):
        for j in range(len(dev_data[i])):
            if dev_data[i][j] != 0:
                word = rev_vocab_dict[dev_data[i][j]]
                gold = rev_label_dict[label[i][j]]
                op = rev_label_dict[pred[i][j]]
                res.append((word, gold, op))
                file.write(" ".join([str(j + 1), str(word), gold, op]))
                file.write("\n")
        file.write("\n")
file.close()


{'B-ORG': 0, 'O': 1, 'B-MISC': 2, 'B-PER': 3, 'I-PER': 4, 'B-LOC': 5, 'I-ORG': 6, 'I-MISC': 7, 'I-LOC': 8}


In [30]:
res = []
file = open("dev2_train.out", 'w')
for dev_data, label, dev_data_len, label_data_len in dataloader_dev:

    pred = BiLSTM_model(dev_data.to(device), dev_data_len)
    pred = pred.cpu()
    pred = pred.detach().numpy()
    label = label.detach().numpy()
    dev_data = dev_data.detach().numpy()
    pred = np.argmax(pred, axis=2)
    pred = pred.reshape((len(label), -1))

    for i in range(len(dev_data)):
        for j in range(len(dev_data[i])):
            if dev_data[i][j] != 0:
#                 print(dev_data[i][j])
#                 print(rev_vocab_dict[dev_data[i][j]])
                word = rev_vocab_dict[dev_data[i][j]]
                gold = rev_label_dict[label[i][j]]
                op = rev_label_dict[pred[i][j]]
                res.append((word, gold, op))
                
                file.write(" ".join([str(j + 1), word, gold, op]))
                file.write("\n")
        file.write("\n")
        
file.close()


In [31]:
!perl conll03eval.txt < dev2_train.out

processed 51573 tokens with 5941 phrases; found: 6284 phrases; correct: 5087.
accuracy:  96.82%; precision:  80.95%; recall:  85.63%; FB1:  83.22
              LOC: precision:  88.91%; recall:  91.24%; FB1:  90.06  1885
             MISC: precision:  66.27%; recall:  73.32%; FB1:  69.62  1020
              ORG: precision:  68.67%; recall:  77.54%; FB1:  72.84  1513
              PER: precision:  90.89%; recall:  92.07%; FB1:  91.48  1866
