In [2]:
# Install dependencies
!pip install conllu



In [3]:
# import headers
from conllu import parse,parse_incr
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
from tqdm import tqdm

In [4]:
with open ("data/UD_English-Atis/en_atis-ud-train.conllu", "r", encoding="utf-8") as f:
    data = f.read()
train_sentences = parse(data)
with open ("data/UD_English-Atis/en_atis-ud-dev.conllu", "r", encoding="utf-8") as f:
    data = f.read()
dev_sentences = parse(data)
with open ("data/UD_English-Atis/en_atis-ud-test.conllu", "r", encoding="utf-8") as f:
    data = f.read()
test_sentences = parse(data)
print(train_sentences[0])

TokenList<what, is, the, cost, of, a, round, trip, flight, from, pittsburgh, to, atlanta, beginning, on, april, twenty, fifth, and, returning, on, may, sixth, metadata={sent_id: "0001.train", text: "what is the cost of a round trip flight from pittsburgh to atlanta beginning on april twenty fifth and returning on may sixth"}>


In [5]:
sentence = train_sentences[0]
sentence[1]

{'id': 2,
 'form': 'is',
 'lemma': 'be',
 'upos': 'AUX',
 'xpos': None,
 'feats': {'Mood': 'Ind',
  'Number': 'Sing',
  'Person': '3',
  'Tense': 'Pres',
  'VerbForm': 'Fin'},
 'head': 1,
 'deprel': 'cop',
 'deps': None,
 'misc': None}

In [6]:
def get_uniq_words(sentences):
    words = set()
    for sentence in sentences:
        for token in sentence:
            words.add(token["form"].lower())
    return words

In [7]:
uniq = get_uniq_words(train_sentences)

In [8]:
def read_embeddings(filename, vocab_size=10000,uniq_words=None):
    with open(filename, encoding="utf-8") as file:
        word_embedding_dim = len(file.readline().split(" ")) - 1
    vocab = {}
    embeddings = np.zeros((vocab_size, word_embedding_dim))
    last_idx = 0
    with open(filename, encoding="utf-8") as file:
        for idx, line in enumerate(file):
            if idx + 2 >= vocab_size:
                break
            cols = line.rstrip().split(" ")
            val = np.array(cols[1:])
            word = cols[0]
            embeddings[idx + 2] = val
            vocab[word] = idx + 2
            last_idx = idx + 2
        # global_vocab ={}
        # for idx, line in enumerate(file):
        #     cols = line.rstrip().split(" ")
        #     val = np.array(cols[1:])
        #     word = cols[0]
        #     global_vocab[word] = val
        
        # for word in uniq_words:
        #     if word not in vocab:
        #         if word in global_vocab:
        #             last_idx += 1
        #             vocab[word] = last_idx
        #             embeddings=np.append(embeddings,global_vocab[word])
    vocab["<UNK>"]=1
    vocab["<PAD>"]=0
    return torch.FloatTensor(embeddings), vocab

In [9]:
vocab_size = 50000
embeddings, vocab = read_embeddings('./data/glove.6B.100d.txt', vocab_size)

In [10]:
sentence[0]["form"].lower() in vocab

True

In [11]:
def build_vocab(sentences):
    data =[]
    word_set = set()
    vocab_dict={"<PAD>":0,"<UNK>":1}
    for sent in sentences:
        for token in sent:
            word_set.add(token["form"])
    word_list = sorted(list(word_set))
    for i,word in enumerate(word_list):
        vocab_dict[word]=i+2
    return vocab_dict
#vocab_dict=build_vocab(sentences)

In [12]:
def create_tags(sentences):
    tag_set=set()
    for sent in sentences:
        for token in sent:
            tag_set.add(token["upos"])
    tags=sorted(list(tag_set))
    tag_dict={"PAD":0}
    #tag_dict ={}
    for i,tag in enumerate(tags):
        tag_dict[tag]=i+1
    return tag_dict

total_sentences = train_sentences + dev_sentences + test_sentences    
tags = create_tags(total_sentences) 

In [13]:
tags

{'PAD': 0,
 'ADJ': 1,
 'ADP': 2,
 'ADV': 3,
 'AUX': 4,
 'CCONJ': 5,
 'DET': 6,
 'INTJ': 7,
 'NOUN': 8,
 'NUM': 9,
 'PART': 10,
 'PRON': 11,
 'PROPN': 12,
 'SYM': 13,
 'VERB': 14}

In [14]:
import json
with open('tags.json', 'w') as fp:
    json.dump(tags, fp)

In [15]:
IGNORE_TAG_INDEX = 0
def create_data(sentences,vocab,tags,max_seq_len=50):
    sents_idx=[]
    sent_tags=[]
    for sent in sentences:
        sent_idx=[]
        sent_tag=[]
        for token in sent:
            if (token["form"].lower() in vocab):
                sent_idx.append(vocab[token["form"].lower()])
            else:
                sent_idx.append(vocab["<UNK>"])
            sent_tag.append(tags[token["upos"]])
        sents_idx.append(sent_idx)
        sent_tags.append(sent_tag)
    for i in range(len(sents_idx)):
        if len(sents_idx[i]) < max_seq_len:
            sents_idx[i]=sents_idx[i]+[vocab["<PAD>"] for _ in range(max_seq_len - len(sents_idx[i]))]
            sent_tags[i]=sent_tags[i]+[tags["PAD"] for _ in range(max_seq_len - len(sent_tags[i]))]
    return sents_idx,sent_tags

In [16]:
def sent_to_vector(sentence,max_seq_len=50):
    tokens = sentence.split(" ")
    sent_idx=[]
    for token in tokens:
        if (token.lower() in vocab):
            sent_idx.append(vocab[token.lower()])
        else:
            sent_idx.append(vocab["<UNK>"])
    for i in range(len(sent_idx)):
        if len(sent_idx) < max_seq_len:
            sent_idx=sent_idx+[vocab["<PAD>"] for _ in range(max_seq_len - len(sent_idx))]
    return sent_idx

In [17]:
sent_to_vector("Hi how are you")

[11085,
 199,
 34,
 83,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [18]:
x_train,y_train=create_data(train_sentences,vocab,tags,50)
x_dev,y_dev=create_data(dev_sentences,vocab,tags,50)
x_test,y_test=create_data(test_sentences,vocab,tags,50)
print("Train :"+str(len(x_train)) + " Dev : "+str(len(x_dev))+ " Test : "+str(len(x_test)))

Train :4274 Dev : 572 Test : 586


In [19]:
class POSDataSet(Dataset):
    def __init__(self, x, y):
        self.sent = torch.LongTensor(x)
        self.sent_tags = torch.LongTensor(y)
    
    def __getitem__(self, idx):
        return self.sent[idx], self.sent_tags[idx]
    
    def __len__(self):
        return len(self.sent)

In [20]:
train_dataset = POSDataSet(x_train,y_train)
dev_dataset = POSDataSet(x_dev,y_dev)
test_dataset = POSDataSet(x_test,y_test)

In [21]:
BATCH_SIZE =32
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [22]:
test_dataset.sent

tensor([[104,  34,   2,  ...,   0,   0,   0],
        [ 43, 305,   9,  ...,   0,   0,   0],
        [ 43, 410,   9,  ...,   0,   0,   0],
        ...,
        [ 43, 410,   9,  ...,   0,   0,   0],
        [ 43, 305,   6,  ...,   0,   0,   0],
        [ 54, 457, 287,  ...,   0,   0,   0]])

In [23]:
# The accuracy function has been implemented for you
def accuracy(true, pred):
  """
  Arguments:
  - true:       a list of true label values (integers)
  - pred:       a list of predicted label values (integers)

  Output:
  - accuracy:   the prediction accuracy
  """
  true = np.array(true)
  pred = np.array(pred)

  num_correct = sum(true == pred)
  num_total = len(true)
  return num_correct / num_total

In [24]:
def set_seed(seed):
  """
  Sets random seeds and sets model in deterministic
  training mode. Ensures reproducible results
  """
  torch.manual_seed(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
  np.random.seed(seed)

In [25]:
class POSTagger(nn.Module):
    def __init__(self,max_seq_len,embeddings,hidden_dim,n_layers,tagset_size,device="cuda"):
        super().__init__()
        self.max_seq_len = max_seq_len
        self.num_labels = tagset_size
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding.from_pretrained(embeddings,padding_idx=0)
        self.lstm = nn.LSTM(input_size=embeddings.size()[1], hidden_size= self.hidden_dim , num_layers=n_layers)
        self.hidden2tag = nn.Linear(self.hidden_dim,self.num_labels)
        self.device = device
        self.to(device)
        
    def forward(self,input_seq):
        input_seq = input_seq.to(self.device)
        embed_out =self.embeddings(input_seq)
        lstm_out,_ = self.lstm(embed_out)
        logits = self.hidden2tag(lstm_out)
        return logits
    
    def evaluate(self,loader):
        self.eval()
        true_labels = []
        pred_labels = []
        for i, data in enumerate(loader):
            x,y = data
            logits = self.forward(x)
            pred_label=torch.argmax(logits, dim=-1).cpu().numpy()
            batch_size, _ = x.shape
            for j in range(batch_size):
                tags = y[j]
                pred = pred_label[j]
                for k in range(len(tags)):
                    if tags[k] != 0:
                        true_labels.append(tags[k])
                        pred_labels.append(pred[k])
        acc = accuracy(true_labels, pred_labels)  
        return acc ,true_labels ,pred_labels          

    def run_training(self,train_loader,dev_loader,epochs=100,learning_rate=5e-4,eval_every=5):
        if str(self.device) == 'cpu':
            print("Training only supported in GPU environment")
            return
        torch.cuda.empty_cache()
        self.to(self.device)
        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        loss_function = nn.CrossEntropyLoss(ignore_index=0)
        for epoch in tqdm(range(epochs)):
            self.train()
            total_loss = 0
            for i, data in enumerate(train_loader):
                x,y=data
                self.zero_grad()
                logits = self.forward(x)
                labels = torch.LongTensor(y).to(self.device)
                loss = loss_function(logits.view(-1, self.num_labels), labels.view(-1))
                total_loss += loss
                loss.backward()
                optimizer.step()
            # print("Epoch {} | Loss: {}".format(epoch, total_loss))
            # if epoch % eval_every == eval_every-1:
            #     acc,_,_ = self.evaluate(dev_loader)
            #     print("Epoch {} | Accuracy: {}".format(epoch, acc))
        acc_train,_,_ = self.evaluate(train_loader)
        acc_val,true_labels,pred_labels = self.evaluate(dev_loader)
        print("# Model : Training Accuracy : {} Validation Accuracy: {} #".format(acc_train,acc_val))  
    
    def predict(self,data):
        x = torch.LongTensor(data)
        self.eval()
        predictions = []
        logits = self.forward(x)
        pred_label=torch.argmax(logits, dim=-1).cpu().numpy()
        batch_size, _ = x.shape
        for j in range(batch_size):
            labels=[]
            for k in range(len(x[j])):
                if x[j][k] != 0:
                    labels.append(pred_label[j][k])
            predictions.append(labels)
        return predictions

    
    def save(self,path):
        torch.save(self.state_dict(), path)

    def load(self,path):
        self.load_state_dict(torch.load(path))

In [40]:
set_seed(159)
tagger = POSTagger(50,embeddings,128,2,len(tags))
tagger.run_training(train_dataloader,dev_dataloader,50,0.0005,5)

100%|██████████| 50/50 [00:34<00:00,  1.44it/s]


# Model : Training Accuracy : 0.9678347549069982 Validation Accuracy: 0.9634256472004816 #


In [26]:
tagger.save("pos_tagger.pt")

In [27]:
acc,_,_=tagger.evaluate(test_dataloader)
print(acc)

0.9668693009118541


In [25]:
def predictor(model : POSTagger,sentence,tags):
    tokens = sentence.split(" ")
    inv_map = {v: k for k, v in tags.items()}
    vec = [sent_to_vector(sentence)]
    predictions = model.predict(vec)
    for i in range(len(tokens)):
        print (tokens[i]+"\t"+inv_map[predictions[0][i]])

In [1]:
predictor(tagger,"Mary had a little lamb",tags)

NameError: name 'predictor' is not defined

Hyper Parameter Tuning and model exploration

In [26]:
def tag_analysis(data_loader,model: POSTagger,tags):
    inv_map = {v: k for k, v in tags.items()}
    acc,true_labels,pred_labels=model.evaluate(data_loader)
    print(classification_report(true_labels,pred_labels,labels=list(tags.values()),target_names=list(tags.keys())))

In [36]:
tag_analysis(test_dataloader,tagger,tags)

              precision    recall  f1-score   support

         PAD       0.00      0.00      0.00         0
         ADJ       0.93      0.97      0.95       220
         ADP       0.98      0.99      0.98      1434
         ADV       0.95      0.72      0.82        76
         AUX       0.96      1.00      0.98       256
       CCONJ       1.00      1.00      1.00       109
         DET       0.98      0.87      0.92       512
        INTJ       0.97      1.00      0.99        36
        NOUN       0.99      0.97      0.98      1166
         NUM       0.95      0.96      0.95       127
        PART       0.72      0.52      0.60        56
        PRON       0.85      0.98      0.91       392
       PROPN       0.98      0.99      0.99      1567
         SYM       0.00      0.00      0.00         0
        VERB       0.98      0.97      0.98       629

   micro avg       0.97      0.97      0.97      6580
   macro avg       0.82      0.80      0.80      6580
weighted avg       0.97   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
class ImprovedPOSTagger(POSTagger):
    def __init__(self,max_seq_len,embeddings,hidden_dim,n_layers,tagset_size,dropout=0.8,bidirectional=True,device="cuda"):
        super().__init__(max_seq_len,embeddings,hidden_dim,n_layers,tagset_size,device)
        self.lstm = nn.LSTM(input_size=embeddings.size()[1], hidden_size=hidden_dim, dropout=dropout, num_layers=n_layers, bidirectional=bidirectional)
        self.hidden2tag = nn.Linear(self.hidden_dim*2,self.num_labels)

In [27]:
tagger2 = ImprovedPOSTagger(50,embeddings,128,2,len(tags),0.3)
tagger2.run_training(train_dataloader,dev_dataloader,100,0.0005,5)

NameError: name 'ImprovedPOSTagger' is not defined

In [32]:
tagger2.save('pos_tagger2.pt')

In [33]:
acc,_,_=tagger2.evaluate(test_dataloader)
print("Test accuracy for improved model: "+str(acc))

Test accuracy for improved model: 0.9682370820668693


In [34]:
def analyzer(train_dataloader,dev_dataloader,test_dataloader,tags):
    lrs = [0.0005]
    layers = [2]
    hidden_dims = [128]
    dropouts = [0.2,0.5,0.8]
    best_accuracy = 0
    best_config = {"lr":0,"layers":0,"hidden_dim":0,"dropout":0}
    for lr in lrs:
        for layer in layers:
            for hidden_dim in hidden_dims:
                for dropout in dropouts:
                    set_seed(159)
                    tagger = ImprovedPOSTagger(50,embeddings,hidden_dim,layer,len(tags),dropout=dropout)
                    tagger.run_training(train_dataloader,dev_dataloader,50,lr,5)
                    acc,true_labels,pred_labels=tagger.evaluate(test_dataloader)
                    #score = f1_score(true_labels,pred_labels,len(tags))
                    #cf = confusion_matrix(true_labels,pred_labels,len(tags))
                    print("-------------------------------------------------------------------------------------")
                    print("# Model Parameters | Learning Rate : {} Layers : {} Hidden Dim : {} #".format(lr,layer,hidden_dim))
                    print("# Analysis | Test Accuracy : {} #".format(acc))
                    # print("# Confusion Matrix : # : ")
                    # print(cf)
                    print("-------------------------------------------------------------------------------------")
                    #tag_analysis(test_dataloader,tagger,tags)
                    if acc > best_accuracy:
                        tagger.save("pos_tagger.pt")
                        best_accuracy = acc
                        best_config = {"lr":lr,"layers":layer,"hidden_dim":hidden_dim}
                
    print("Best Accuracy : {} Best Config : {}".format(best_accuracy,best_config))

In [35]:
analyzer(train_dataloader,dev_dataloader,test_dataloader,tags)

100%|██████████| 50/50 [00:50<00:00,  1.01s/it]


# Model : Training Accuracy : 0.9697461720275409 Validation Accuracy: 0.9629741119807345 #


  0%|          | 0/50 [00:00<?, ?it/s]

-------------------------------------------------------------------------------------
# Model Parameters | Learning Rate : 0.0005 Layers : 2 Hidden Dim : 128 #
# Analysis | Test Accuracy : 0.967629179331307 #
-------------------------------------------------------------------------------------


100%|██████████| 50/50 [00:48<00:00,  1.04it/s]


# Model : Training Accuracy : 0.969705066283013 Validation Accuracy: 0.9641782059000602 #


  0%|          | 0/50 [00:00<?, ?it/s]

-------------------------------------------------------------------------------------
# Model Parameters | Learning Rate : 0.0005 Layers : 2 Hidden Dim : 128 #
# Analysis | Test Accuracy : 0.967629179331307 #
-------------------------------------------------------------------------------------


100%|██████████| 50/50 [00:55<00:00,  1.12s/it]


# Model : Training Accuracy : 0.9686157640530264 Validation Accuracy: 0.963275135460566 #
-------------------------------------------------------------------------------------
# Model Parameters | Learning Rate : 0.0005 Layers : 2 Hidden Dim : 128 #
# Analysis | Test Accuracy : 0.9665653495440729 #
-------------------------------------------------------------------------------------
Best Accuracy : 0.967629179331307 Best Config : {'lr': 0.0005, 'layers': 2, 'hidden_dim': 128}
