## TASK 1

In [1]:
import pickle
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
import string
from torch.utils.data import TensorDataset, DataLoader
from torchtext import data
from torchtext import datasets
import time

In [2]:
# Suffix inventories used to guess a coarse word class for out-of-vocabulary words.
# NOTE(review): noun_list is never consulted by processUnknowns below.
noun_list = ["tion", "ity", "er", "ness", "ism", "ment", "ant", "ship", "age", "ery"]
verb_list = ["ate", "ify", "ize", "ise"]
# "ly" appears here as well as in adv_list; because adjectives are tested first,
# words ending in "ly" are always classified as <unk_adj>.
adj_list = ["able", "ible", 'ant', 'ent', 'ive', "al","ial","an","ian","ish", "ern", "ese", "ful", 'ar', 'ary','ly','less','ic','ive','ous', "i", "ic"]
# NOTE(review): "lng" looks like a typo -- confirm the intended suffix before changing it,
# since the trained vocabulary depends on the current behaviour.
adv_list = ["ly","lng","ward", "wards", "way", "ways", "wise"]

def processUnknowns(word):
    """Map a rare or unseen word to one of the ``<unk...>`` pseudo-tokens.

    The checks are applied in this order and the first match wins:

    1. all digits                      -> ``<unk_num>``
    2. more than half digits           -> ``<unk_mainly_num>``
    3. ends with a ``verb_list`` suffix -> ``<unk_verb>``
    4. ends with an ``adj_list`` suffix -> ``<unk_adj>``
    5. ends with an ``adv_list`` suffix -> ``<unk_adv>``
    6. ``str.islower()``               -> ``<unk_all_lower>``
    7. ``str.isupper()``               -> ``<unk_all_upper>``
    8. first character is upper case   -> ``<unk_initial_upper>``
    9. contains at least one digit     -> ``<unk_contain_num>``
    10. anything else                  -> ``<unk>``

    Args:
        word: the surface form to classify.

    Returns:
        The pseudo-token string for ``word``. An empty string yields ``<unk>``
        instead of raising ``ZeroDivisionError``.
    """
    # Guard: the digit fraction and word[0] below are undefined for "".
    if not word:
        return "<unk>"

    digit_count = sum(1 for char in word if char.isdigit())
    fraction = digit_count / float(len(word))

    if word.isdigit():
        return "<unk_num>"
    if fraction > 0.5:
        return "<unk_mainly_num>"
    # str.endswith accepts a tuple of candidate suffixes.
    if word.endswith(tuple(verb_list)):
        return "<unk_verb>"
    if word.endswith(tuple(adj_list)):
        return "<unk_adj>"
    if word.endswith(tuple(adv_list)):
        return "<unk_adv>"
    if word.islower():
        return "<unk_all_lower>"
    if word.isupper():
        return "<unk_all_upper>"
    if word[0].isupper():
        return "<unk_initial_upper>"
    if digit_count > 0:
        return "<unk_contain_num>"
    return "<unk>"

In [3]:
def prepareVocabulary(file, min_count=2):
    """Build the word-count vocabulary, tag set and sentence list from a training file.

    Every non-blank line is split on single spaces and read as
    ``<index> <word> <tag>``; a blank line closes the current sentence.

    Words seen fewer than ``min_count`` times are removed from the vocabulary
    and their counts are added to the pseudo-token chosen by
    ``processUnknowns``. The ten ``<unk...>`` pseudo-tokens themselves are
    never folded or removed, so every value ``processUnknowns`` can return is
    guaranteed to have a vocabulary entry (possibly with count 0).

    Args:
        file: path of the training file.
        min_count: minimum number of occurrences for a word to be kept.

    Returns:
        ``(vocab, NER_set, sentences)`` where ``vocab`` maps word -> count,
        ``NER_set`` is the set of tag strings and ``sentences`` is a list of
        sentences, each a list of ``[word, tag]`` pairs.
    """
    # Insertion order matters: callers sort by count with a stable sort.
    unk_tokens = (
        '<unk>', '<unk_mainly_num>',
        '<unk_num>', '<unk_contain_num>',
        '<unk_verb>', '<unk_adj>',
        '<unk_adv>', '<unk_all_lower>',
        '<unk_all_upper>', '<unk_initial_upper>',
    )

    vocab, NER_set, sentence, sentences = {}, set(), [], []
    with open(file, "r") as train:
        for line in train:
            if not line.split():
                # Blank line: sentence boundary.
                sentences.append(sentence)
                sentence = []
                continue
            fields = line.split(" ")
            word_type, NER_type = fields[1], fields[2].strip('\n')
            vocab[word_type] = vocab.get(word_type, 0) + 1
            sentence.append([word_type, NER_type])
            NER_set.add(NER_type)
        # Flush whatever followed the last blank line.
        sentences.append(sentence)

    for token in unk_tokens:
        vocab[token] = 0

    # Snapshot the rare words first so the dict is not resized while iterating,
    # and skip the pseudo-tokens: previously a pseudo-token whose accumulated
    # count stayed below min_count was itself treated as a rare word and deleted.
    rare_words = [
        word for word, occurrences in vocab.items()
        if occurrences < min_count and word not in unk_tokens
    ]
    for word in rare_words:
        vocab[processUnknowns(word)] += vocab.pop(word)

    return vocab, NER_set, sentences

In [4]:
# Read the training split and count words; rare words are already folded into <unk...> tokens.
vocab, NER_set, sentences = prepareVocabulary('/content/drive/MyDrive/Colab Notebooks/data/train')

# Rank words by descending frequency; rank 1 is the most frequent word.
sortedVocabulary = sorted(vocab.items(), key=lambda entry: entry[1], reverse=True)
Word_to_Index = {
    word: rank
    for rank, (word, _count) in enumerate(sortedVocabulary, start=1)
}
# Index 0 is reserved for padding.
Word_to_Index['PAD'] = 0
print(len(Word_to_Index))

11994


In [5]:
# Tag -> id. NER_set is a set of strings, whose iteration order varies with the
# interpreter's hash seed; sorting makes the ids identical on every run so that
# saved weights stay compatible with a freshly rebuilt mapping.
NER_to_Index = {ner: idx for idx, ner in enumerate(sorted(NER_set))}

# Reverse lookups: id -> word and id -> tag.
Index_to_Word = {idx: word for word, idx in Word_to_Index.items()}
Index_to_Ner = {idx: ner for ner, idx in NER_to_Index.items()}

print(len(Word_to_Index), len(NER_to_Index))

11994 9


In [6]:
# Encode every training sentence as word ids; words missing from the vocabulary
# are replaced by the id of their <unk...> pseudo-token.
data_X = [
    [
        Word_to_Index[w] if w in Word_to_Index else Word_to_Index[processUnknowns(w)]
        for w, _label in s
    ]
    for s in sentences
]

# Parallel tag ids (None for a tag that is not in NER_to_Index).
data_y = [
    [NER_to_Index.get(label) for _w, label in s]
    for s in sentences
]

In [7]:
def padding_for_words(dataset, max_len):
    """Force every row of ``dataset`` to exactly ``max_len`` word ids, in place.

    Longer rows are cut to their first ``max_len`` entries, shorter rows are
    right-padded with 0, and rows that already have the right length are left
    untouched. The same (mutated) ``dataset`` list is returned.
    """
    for idx in range(len(dataset)):
        row = dataset[idx]
        missing = max_len - len(row)
        if missing < 0:
            dataset[idx] = row[:max_len]
        elif missing > 0:
            dataset[idx] = row[:] + [0] * missing

    return dataset

def padding_for_NER(dataset, max_len):
    """Force every row of ``dataset`` to exactly ``max_len`` tag ids, in place.

    Longer rows are cut to their first ``max_len`` entries, shorter rows are
    right-padded with -100, and rows that already have the right length are
    left untouched. The same (mutated) ``dataset`` list is returned.
    """
    for idx in range(len(dataset)):
        row = dataset[idx]
        missing = max_len - len(row)
        if missing < 0:
            dataset[idx] = row[:max_len]
        elif missing > 0:
            dataset[idx] = row[:] + [-100] * missing

    return dataset

# Bring every sentence to 130 positions: words padded with 0, tags with -100.
data_X = padding_for_words(data_X, max_len=130)
data_y = padding_for_NER(data_y, max_len=130)

X_train, Y_train = (torch.LongTensor(rows) for rows in (data_X, data_y))
ds_train = TensorDataset(X_train, Y_train)
# Batches of 10 sentences, served in file order.
loader_train = DataLoader(dataset=ds_train, batch_size=10, shuffle=False)

print(len(Word_to_Index), len(NER_to_Index))

11994 9


In [8]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
isCuda = torch.cuda.is_available()
device = torch.device("cuda" if isCuda else "cpu")
print("-- cuda --" if isCuda else "-- cpu --")

-- cuda --


In [9]:
class BLSTM(nn.Module):
    """Embedding -> (bi)LSTM -> Linear -> ELU -> Linear token tagger.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each word embedding.
        hidden_dim: LSTM hidden size per direction.
        first_output_dim: width of the intermediate linear layer.
        output_dim: number of tag classes scored per token.
        num_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        drop_out: dropout probability applied to the embeddings and to the
            LSTM outputs.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, first_output_dim, output_dim, num_layers, bidirectional, drop_out):
        super().__init__()

        # Row 0 is the padding embedding.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.blstm = nn.LSTM(embedding_dim, hidden_dim, num_layers = num_layers, bidirectional = bidirectional, batch_first=True)
        # The LSTM emits hidden_dim features per direction, so the next layer's
        # input width must follow the `bidirectional` flag rather than assume 2.
        lstm_output_dim = hidden_dim * (2 if bidirectional else 1)
        self.fc1 = nn.Linear(lstm_output_dim, first_output_dim)
        self.dropout = nn.Dropout(drop_out)
        self.activation = nn.ELU()
        self.fc2 = nn.Linear(first_output_dim, output_dim)

    def forward(self, text):
        """Score every token of ``text``.

        Args:
            text: integer tensor of word ids; the LSTM is built with
                ``batch_first=True``, so the batch dimension comes first.

        Returns:
            Tensor of per-token class scores whose last dimension is
            ``output_dim``.
        """
        embedded = self.dropout(self.embedding(text))
        outputs, (hidden, cell) = self.blstm(embedded)
        outputs = self.dropout(outputs)
        outputs = self.activation(self.fc1(outputs))
        predictions = self.fc2(outputs)
        return predictions

In [10]:
# Model hyper-parameters.
INPUT_DIM = len(Word_to_Index)      # one embedding row per vocabulary entry (incl. PAD)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
FIRST_OUTPUT_DIM = 128
OUTPUT_DIM = len(NER_to_Index)      # one score per tag
N_LAYERS = 1
BIDIRECTIONAL = True
DROPOUT = 0.33

model = BLSTM(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    first_output_dim=FIRST_OUTPUT_DIM,
    output_dim=OUTPUT_DIM,
    num_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    drop_out=DROPOUT,
)

model.to(device)
print(len(Word_to_Index), len(NER_to_Index))

11994 9


In [11]:
def categoricalAccuracy(preds, y, tag_pad_idx, text, predict_table):
    """Count correct tag predictions on non-padding positions and log them.

    Args:
        preds: 2-D tensor of class scores, one row per token.
        y: 1-D tensor of gold tag ids, aligned with ``preds``.
        tag_pad_idx: tag id that marks padding; such positions are skipped.
        text: 1-D tensor of word ids, aligned with ``preds``.
        predict_table: list that receives one ``(word_id, predicted_tag,
            gold_tag)`` tuple of Python ints per non-padding position.

    Returns:
        ``(tot, correct, predict_table)``: number of non-padding positions,
        how many of them were predicted correctly, and the same list object
        that was passed in.
    """
    max_preds = preds.argmax(dim = 1)

    # Keep the truncate-to-shortest behaviour of zip() over the three inputs.
    n = min(len(max_preds), len(y), len(text))
    max_preds, y, text = max_preds[:n], y[:n], text[:n]

    # Select the real tokens once and convert each tensor with a single
    # tolist() call instead of three .item() calls per token.
    keep = y != tag_pad_idx
    words = text[keep].tolist()
    predicted = max_preds[keep].tolist()
    real = y[keep].tolist()

    predict_table.extend(zip(words, predicted, real))
    tot = len(real)
    correct = sum(1 for p, r in zip(predicted, real) if p == r)
    return tot, correct, predict_table

In [12]:
def trainModel(model, dataloader, predict_table):
    """Run one optimisation pass over ``dataloader``.

    Uses the module-level ``optimizer``, ``criterion``, ``device`` and
    ``tag_pad_idx``.

    Args:
        model: the tagger to train; switched to training mode.
        dataloader: iterable of ``(text, tags)`` batches.
        predict_table: list extended by ``categoricalAccuracy`` with one entry
            per non-padding token.

    Returns:
        ``(mean batch loss, token accuracy, predict_table)``.
    """
    model.train()
    running_loss, n_correct, n_tokens = 0, 0, 0

    for batch_text, batch_tags in dataloader:
        optimizer.zero_grad()
        batch_tags = batch_tags.to(device)
        batch_text = batch_text.to(device)

        # Flatten so every token is one row of scores with one target.
        logits = model(batch_text)
        flat_logits = logits.view(-1, logits.shape[-1])
        flat_tags = batch_tags.view(-1)

        loss = criterion(flat_logits, flat_tags)

        seen, hits, predict_table = categoricalAccuracy(
            flat_logits, flat_tags, tag_pad_idx, batch_text.view(-1), predict_table
        )

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        n_correct += hits
        n_tokens += seen

    mean_loss = running_loss / len(dataloader)
    accuracy = n_correct / n_tokens
    return mean_loss, accuracy, predict_table


In [13]:
def evaluateModel(model, dataloader, predict_table):
    """Score ``model`` on labelled batches without updating its weights.

    Uses the module-level ``criterion``, ``device`` and ``tag_pad_idx``.

    NOTE(review): a later cell defines another ``evaluateModel`` that takes
    unlabelled batches and returns only a prediction list; once that cell has
    run, this definition is shadowed.

    Args:
        model: the tagger to evaluate; switched to evaluation mode.
        dataloader: iterable of ``(text, tags)`` batches.
        predict_table: list extended by ``categoricalAccuracy`` with one entry
            per non-padding token.

    Returns:
        ``(mean batch loss, token accuracy, predict_table)``.
    """
    model.eval()
    running_loss, n_correct, n_tokens = 0, 0, 0

    with torch.no_grad():
        for batch_text, batch_tags in dataloader:
            batch_tags = batch_tags.to(device)
            batch_text = batch_text.to(device)

            # Flatten so every token is one row of scores with one target.
            logits = model(batch_text)
            flat_logits = logits.view(-1, logits.shape[-1])
            flat_tags = batch_tags.view(-1)

            loss = criterion(flat_logits, flat_tags)

            seen, hits, predict_table = categoricalAccuracy(
                flat_logits, flat_tags, tag_pad_idx, batch_text.view(-1), predict_table
            )

            running_loss += loss.item()
            n_correct += hits
            n_tokens += seen

    mean_loss = running_loss / len(dataloader)
    accuracy = n_correct / n_tokens
    return mean_loss, accuracy, predict_table

In [14]:
# Read the dev split into a list of sentences, each a list of [word, tag] pairs.
# A blank line ends the current sentence; cnt counts the token lines read.
dev_sentences, sentence, cnt = [], [], 0
with open('/content/drive/MyDrive/Colab Notebooks/data/dev', "r") as dev:
    for line in dev:
        if line.split():
            fields = line.split(" ")
            word_type, NER_type = fields[1], fields[2].strip('\n')
            cnt += 1
            sentence.append([word_type, NER_type])
        else:
            dev_sentences.append(sentence)
            sentence = []
    # Flush whatever followed the last blank line.
    dev_sentences.append(sentence)

In [15]:
# Encode the dev sentences with the training vocabulary; unseen words are
# replaced by the id of their <unk...> pseudo-token.
dev_X = [
    [
        Word_to_Index[w] if w in Word_to_Index else Word_to_Index[processUnknowns(w)]
        for w, _label in s
    ]
    for s in dev_sentences
]

# Parallel tag ids (None for a tag that is not in NER_to_Index).
dev_y = [
    [NER_to_Index.get(label) for _w, label in s]
    for s in dev_sentences
]

# Same fixed length and padding values as the training split.
dev_X = padding_for_words(dev_X, max_len=130)
dev_y = padding_for_NER(dev_y, max_len=130)

X_dev, Y_dev = (torch.LongTensor(rows) for rows in (dev_X, dev_y))
ds_dev = TensorDataset(X_dev, Y_dev)
loader_dev = DataLoader(dataset=ds_dev, batch_size=10, shuffle=False)

In [16]:
# Training configuration. tag_pad_idx is the value padding_for_NER writes into
# padded tag positions; the loss ignores those positions.
epochs = 20
tag_pad_idx = -100
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, nesterov=True)
criterion = nn.CrossEntropyLoss(ignore_index=tag_pad_idx)
best_valid_loss = float('inf')

for epoch in range(epochs):
    # Fresh prediction logs every epoch.
    train_loss, train_acc, train_predict_table = trainModel(model, loader_train, [])
    valid_loss, valid_acc, valid_predict_table = evaluateModel(model, loader_dev, [])

    # Checkpoint whenever the dev loss ties or beats the best seen so far,
    # and remember that epoch's dev predictions.
    if valid_loss <= best_valid_loss:
        best_valid_loss = valid_loss
        best_predict_table = valid_predict_table
        torch.save(model.state_dict(), './blstm1.pt')

    print(
        f'Epoch: {epoch+1:02}',
        f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
        f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%',
        sep='\n',
    )

Epoch: 01
	Train Loss: 0.644 | Train Acc: 85.03%
	 Val. Loss: 0.448 |  Val. Acc: 88.28%
Epoch: 02
	Train Loss: 0.442 | Train Acc: 87.91%
	 Val. Loss: 0.307 |  Val. Acc: 91.27%
Epoch: 03
	Train Loss: 0.348 | Train Acc: 89.77%
	 Val. Loss: 0.244 |  Val. Acc: 93.03%
Epoch: 04
	Train Loss: 0.296 | Train Acc: 90.91%
	 Val. Loss: 0.210 |  Val. Acc: 93.89%
Epoch: 05
	Train Loss: 0.261 | Train Acc: 91.80%
	 Val. Loss: 0.198 |  Val. Acc: 94.19%
Epoch: 06
	Train Loss: 0.234 | Train Acc: 92.50%
	 Val. Loss: 0.179 |  Val. Acc: 94.66%
Epoch: 07
	Train Loss: 0.217 | Train Acc: 92.94%
	 Val. Loss: 0.165 |  Val. Acc: 95.01%
Epoch: 08
	Train Loss: 0.201 | Train Acc: 93.36%
	 Val. Loss: 0.159 |  Val. Acc: 95.22%
Epoch: 09
	Train Loss: 0.190 | Train Acc: 93.70%
	 Val. Loss: 0.151 |  Val. Acc: 95.47%
Epoch: 10
	Train Loss: 0.180 | Train Acc: 93.90%
	 Val. Loss: 0.147 |  Val. Acc: 95.61%
Epoch: 11
	Train Loss: 0.171 | Train Acc: 94.19%
	 Val. Loss: 0.143 |  Val. Acc: 95.69%
Epoch: 12
	Train Loss: 0.165 | T

In [17]:
# best_predict_table holds (word_id, predicted_tag_id, gold_tag_id) for every
# non-padding dev token, in file order, from the best epoch.
term = [int(x[0]) for x in best_predict_table]
y_pred = [int(x[1]) for x in best_predict_table]

dev_path = '/content/drive/MyDrive/Colab Notebooks/data/dev'

# NOTE(review): sentences are truncated to 130 tokens before prediction, so a
# longer dev sentence would leave y_pred shorter than the file and raise
# IndexError below. The `with` blocks make sure the output is still closed.

# dev1.out: "<index> <word> <predicted tag>" per token, blank line between sentences.
i = 0
with open('./dev1.out', "w") as newfile, open(dev_path, "r") as dev_file:
    for line in dev_file:
        if not line.split():
            newfile.write('\n')
            continue
        fields = line.split(" ")
        index, word_type = fields[0], fields[1].strip('\n')
        newfile.write(f"{index} {word_type} {Index_to_Ner[y_pred[i]]}\n")
        i += 1

# dev1_perl.out: "<index> <word> <gold tag> <predicted tag>" per token.
i = 0
with open('./dev1_perl.out', "w") as newfile, open(dev_path, "r") as dev_file:
    for line in dev_file:
        if not line.split():
            newfile.write('\n')
            continue
        fields = line.split(" ")
        index, word_type, NER_type = fields[0], fields[1], fields[2].strip('\n')
        newfile.write(f"{index} {word_type} {NER_type} {Index_to_Ner[y_pred[i]]}\n")
        i += 1

In [18]:
# Feed the "<index> <word> <gold tag> <predicted tag>" file written above to the
# conll03eval.txt Perl script, which must be present in the working directory.
!perl conll03eval.txt < dev1_perl.out

processed 51578 tokens with 5942 phrases; found: 5609 phrases; correct: 4567.
accuracy:  96.19%; precision:  81.42%; recall:  76.86%; FB1:  79.08
              LOC: precision:  87.78%; recall:  84.10%; FB1:  85.90  1760
             MISC: precision:  73.21%; recall:  76.46%; FB1:  74.80  963
              ORG: precision:  72.74%; recall:  64.28%; FB1:  68.25  1185
              PER: precision:  85.54%; recall:  78.99%; FB1:  82.13  1701


In [19]:
def categoricalEvaluate(preds, text, predictTable):
    """Append the predicted class of every non-padding token to ``predictTable``.

    Args:
        preds: 2-D tensor of class scores, one row per token.
        text: sequence of word ids aligned with ``preds``; id 0 marks padding
            and is skipped.
        predictTable: list that receives ``(word_id, predicted_class)`` pairs.

    Returns:
        The same ``predictTable`` list.
    """
    best = preds.argmax(dim = 1, keepdim = True)
    predictTable.extend(
        (token, row[0])
        for row, token in zip(best, text)
        if token != 0
    )
    return predictTable

In [20]:
def evaluateModel(model, loader, predictTable):
    """Predict tags for unlabelled batches of word ids.

    NOTE(review): this reuses the name of the earlier ``evaluateModel`` that
    takes labelled batches and returns loss and accuracy; after this cell
    runs, the training loop can no longer be re-executed as written.

    Uses the module-level ``device``.

    Args:
        model: the tagger; switched to evaluation mode.
        loader: iterable of word-id batches (no labels).
        predictTable: list passed through ``categoricalEvaluate``, which
            appends one ``(word_id, predicted_class)`` pair per non-padding
            token.

    Returns:
        The prediction list returned by the last ``categoricalEvaluate`` call
        (``predictTable`` itself when ``loader`` is empty).
    """
    model.eval()

    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            logits = model(batch)
            # Flatten so every token is one row of scores.
            flat_logits = logits.view(-1, logits.shape[-1])
            predictTable = categoricalEvaluate(flat_logits, batch.view(-1), predictTable)

    return predictTable

In [21]:
# NOTE(review): the outputs are named "test" but the input is the dev split --
# confirm whether this path should point at the test file.
test_path = '/content/drive/MyDrive/Colab Notebooks/data/dev'

# Encode every sentence as word ids, mapping unseen words to their <unk...> token.
test_X = []
sentence = []
cnt=0
with open(test_path, "r") as test:
    for line in test:
        if not line.split():
            test_X.append(sentence)
            sentence = []
            continue
        # Strip the newline so the word is still found when it is the last
        # field on the line (a file without a tag column).
        word_type = line.split(" ")[1].strip('\n')
        if word_type not in Word_to_Index:
            word_type = processUnknowns(word_type)
        # Index rather than .get(): a missing pseudo-token should fail here,
        # not as a None inside the tensor constructor.
        sentence.append(Word_to_Index[word_type])
    test_X.append(sentence)

test_X = padding_for_words(test_X, 130)
X_test = torch.LongTensor(test_X)
loader_test = DataLoader(X_test, batch_size=10, shuffle=False)

# Rebuild the network and restore the best weights saved during training.
evaluate_predict_table2 = []
model = BLSTM(INPUT_DIM,
              EMBEDDING_DIM,
              HIDDEN_DIM,
              FIRST_OUTPUT_DIM,
              OUTPUT_DIM,
              N_LAYERS,
              BIDIRECTIONAL,
              DROPOUT)
model.to(device)
# map_location lets weights saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load('./blstm1.pt', map_location=device))
prediction_table = evaluateModel(model, loader_test, evaluate_predict_table2)

term = [int(x[0]) for x in evaluate_predict_table2]
y_pred = [int(x[1]) for x in evaluate_predict_table2]

# NOTE(review): sentences longer than 130 tokens were truncated above, so
# y_pred can be shorter than the file and raise IndexError below.

# test1.out: "<index> <word> <predicted tag>" per token, blank line between sentences.
i=0
with open('./test1.out', "w") as newfile, open(test_path, "r") as test:
    for line in test:
        if not line.split():
            newfile.write('\n')
            continue
        fields = line.split(" ")
        index, word_type = fields[0], fields[1].strip('\n')
        for_tag = Index_to_Ner[y_pred[i]]
        newfile.write(str(index)+' '+str(word_type)+' '+for_tag+'\n')
        i += 1


In [22]:
import pickle

# Persist the lookup tables and data loaders, one pickle file per object.
_artifacts = {
    './vocab_dictionary.pickle': Word_to_Index,
    './ner_dictionary.pickle': NER_to_Index,
    './int_vocab_dictionary.pickle': Index_to_Word,
    './int_ner_dictionary.pickle': Index_to_Ner,
    './loader_train.pickle': loader_train,
    './loader_dev.pickle': loader_dev,
    './loader_test.pickle': loader_test,
}
for _path, _obj in _artifacts.items():
    with open(_path, 'wb') as _handle:
        pickle.dump(_obj, _handle)

In [25]:
# Bundle the weights with the hyper-parameters needed to rebuild the network.
# The values come from the same constants the model was constructed with, so
# the stored metadata cannot drift from the stored weights.
checkpoint = {'INPUT_DIM': INPUT_DIM,
              'EMBEDDING_DIM': EMBEDDING_DIM,
              'HIDDEN_DIM': HIDDEN_DIM,
              'FIRST_OUTPUT_DIM': FIRST_OUTPUT_DIM,
              'OUTPUT_DIM': OUTPUT_DIM,
              'N_LAYERS': N_LAYERS,
              'BIDIRECTIONAL': BIDIRECTIONAL,
              'DROPOUT': DROPOUT,
              'state_dict': model.state_dict()}

torch.save(checkpoint, './checkpoint.pth')