In [10]:
# Install dependencies
!pip install conllu



In [11]:
# import headers
from conllu import parse,parse_incr
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
from tqdm import tqdm

In [12]:
with open ("data/UD_English-Atis/en_atis-ud-train.conllu", "r", encoding="utf-8") as f:
    data = f.read()
train_sentences = parse(data)
with open ("data/UD_English-Atis/en_atis-ud-test.conllu", "r", encoding="utf-8") as f:
    data = f.read()
dev_sentences = parse(data)
with open ("data/UD_English-Atis/en_atis-ud-test.conllu", "r", encoding="utf-8") as f:
    data = f.read()
test_sentences = parse(data)
print(train_sentences[0])

TokenList<what, is, the, cost, of, a, round, trip, flight, from, pittsburgh, to, atlanta, beginning, on, april, twenty, fifth, and, returning, on, may, sixth, metadata={sent_id: "0001.train", text: "what is the cost of a round trip flight from pittsburgh to atlanta beginning on april twenty fifth and returning on may sixth"}>


In [13]:
sentence = train_sentences[0]
sentence[1]

{'id': 2,
 'form': 'is',
 'lemma': 'be',
 'upos': 'AUX',
 'xpos': None,
 'feats': {'Mood': 'Ind',
  'Number': 'Sing',
  'Person': '3',
  'Tense': 'Pres',
  'VerbForm': 'Fin'},
 'head': 1,
 'deprel': 'cop',
 'deps': None,
 'misc': None}

In [14]:
def read_embeddings(filename, vocab_size=10000):
    with open(filename, encoding="utf-8") as file:
        word_embedding_dim = len(file.readline().split(" ")) - 1
    vocab = {}
    embeddings = np.zeros((vocab_size, word_embedding_dim))
    with open(filename, encoding="utf-8") as file:
        for idx, line in enumerate(file):
            if idx + 2 >= vocab_size:
                break
            cols = line.rstrip().split(" ")
            val = np.array(cols[1:])
            word = cols[0]
            embeddings[idx + 2] = val
            vocab[word] = idx + 2
    vocab["<UNK>"]=1
    vocab["<PAD>"]=0
  # a FloatTensor is a multidimensional matrix
  # that contains 32-bit floats in every entry
  # https://pytorch.org/docs/stable/tensors.html
    return torch.FloatTensor(embeddings), vocab

In [15]:
vocab_size = 50000
embeddings, vocab = read_embeddings('./data/glove.6B.50d.txt', vocab_size)

In [16]:
sentence[0]["form"].lower() in vocab

True

In [17]:
def build_vocab(sentences):
    data =[]
    word_set = set()
    vocab_dict={"<PAD>":0,"<UNK>":1}
    for sent in sentences:
        for token in sent:
            word_set.add(token["form"])
    word_list = sorted(list(word_set))
    for i,word in enumerate(word_list):
        vocab_dict[word]=i+2
    return vocab_dict
#vocab_dict=build_vocab(sentences)

In [18]:
def create_tags(sentences):
    tag_set=set()
    for sent in sentences:
        for token in sent:
            tag_set.add(token["upos"])
    tags=sorted(list(tag_set))
    tag_dict={"PAD":0}
    #tag_dict ={}
    for i,tag in enumerate(tags):
        tag_dict[tag]=i+1
    return tag_dict
    
tags = create_tags(train_sentences) 

In [19]:
tags

{'PAD': 0,
 'ADJ': 1,
 'ADP': 2,
 'ADV': 3,
 'AUX': 4,
 'CCONJ': 5,
 'DET': 6,
 'INTJ': 7,
 'NOUN': 8,
 'NUM': 9,
 'PART': 10,
 'PRON': 11,
 'PROPN': 12,
 'VERB': 13}

In [20]:
IGNORE_TAG_INDEX = 0
def create_data(sentences,vocab,tags,max_seq_len=50):
    sents_idx=[]
    sent_tags=[]
    #present=0
    #not_present=0
    for sent in sentences:
        sent_idx=[]
        sent_tag=[]
        for token in sent:
            if (token["form"].lower() in vocab):
                sent_idx.append(vocab[token["form"].lower()])
            else:
                sent_idx.append(vocab["<UNK>"])
            sent_tag.append(tags[token["upos"]])
        sents_idx.append(sent_idx)
        sent_tags.append(sent_tag)
    for i in range(len(sents_idx)):
        if len(sents_idx[i]) < max_seq_len:
            sents_idx[i]=sents_idx[i]+[vocab["<PAD>"] for _ in range(max_seq_len - len(sents_idx[i]))]
            sent_tags[i]=sent_tags[i]+[tags["PAD"] for _ in range(max_seq_len - len(sent_tags[i]))]
#     print(present)
#     print(not_present)
    return sents_idx,sent_tags

In [21]:
def sent_to_vector(sentence,max_seq_len=50):
    tokens = sentence.split(" ")
    sent_idx=[]
    for token in tokens:
        if (token.lower() in vocab):
            sent_idx.append(vocab[token.lower()])
        else:
            sent_idx.append(vocab["<UNK>"])
    for i in range(len(sent_idx)):
        if len(sent_idx) < max_seq_len:
            sent_idx=sent_idx+[vocab["<PAD>"] for _ in range(max_seq_len - len(sent_idx))]
    return sent_idx

In [22]:
sent_to_vector("Hi how are you")

[11085,
 199,
 34,
 83,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [23]:
x_train,y_train=create_data(train_sentences,vocab,tags,50)
x_dev,y_dev=create_data(dev_sentences,vocab,tags,50)
x_test,y_test=create_data(test_sentences,vocab,tags,50)
print("Train :"+str(len(x_train)) + " Dev : "+str(len(x_dev))+ " Test : "+str(len(x_test)))

Train :4274 Dev : 586 Test : 586


In [24]:
class POSDataSet(Dataset):
    def __init__(self, x, y):
        self.sent = torch.LongTensor(x)
        self.sent_tags = torch.LongTensor(y)
    
    def __getitem__(self, idx):
        return self.sent[idx], self.sent_tags[idx]
    
    def __len__(self):
        return len(self.sent)

In [25]:
train_dataset = POSDataSet(x_train,y_train)
dev_dataset = POSDataSet(x_dev,y_dev)
test_dataset = POSDataSet(x_test,y_test)

In [26]:
BATCH_SIZE =32
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [27]:
test_dataset.sent

tensor([[104,  34,   2,  ...,   0,   0,   0],
        [ 43, 305,   9,  ...,   0,   0,   0],
        [ 43, 410,   9,  ...,   0,   0,   0],
        ...,
        [ 43, 410,   9,  ...,   0,   0,   0],
        [ 43, 305,   6,  ...,   0,   0,   0],
        [ 54, 457, 287,  ...,   0,   0,   0]])

In [28]:
# The accuracy function has been implemented for you
def accuracy(true, pred):
  """
  Arguments:
  - true:       a list of true label values (integers)
  - pred:       a list of predicted label values (integers)

  Output:
  - accuracy:   the prediction accuracy
  """
  true = np.array(true)
  pred = np.array(pred)

  num_correct = sum(true == pred)
  num_total = len(true)
  return num_correct / num_total

In [29]:
def confusion_matrix(true, pred, num_tags):
  """
  Arguments:
  - true:       a list of true label values (integers)
  - pred:       a list of predicted label values (integers)
  - num_tags:   the number of possible tags
                true and pred will both contain integers between
                0 and num_tags - 1 (inclusive)

  Output: 
  - confusion_matrix:   a (num_tags x num_tags) matrix of integers

  confusion_matrix[i][j] = # predictions where true label
  was i and predicted label was j

  """

  confusion_matrix = np.zeros((num_tags, num_tags))

  #############################
  """
  for tag in np.arange(len(num_tags)):
    tp.append(sum(true == pred == tag))
    fp.append(sum(pred == tag and true != tag))
    fn.append(sum(pred != tag and true == tag))
  
  for i in np.arange(num_tags):
    for j in np.arange(num_tags):
      if i == j:
        confusion_matrix[i][j] = sum(np.logical_and(true == i, pred == i))
      else:
        confusion_matrix[i][j] = sum(np.logical_and(pred == j, true == i))
  """
  for t in np.arange(len(true)):
    i = true[t]
    j = pred[t]
    confusion_matrix[i][j] += 1  
  #############################
  return confusion_matrix

In [30]:
def precision(true, pred, num_tags):
  """
  Arguments:
  - true:       a list of true label values (integers)
  - pred:       a list of predicted label values (integers)
  - num_tags:   the number of possible tags
                true and pred will both contain integers between
                0 and num_tags - 1 (inclusive)

  Output: 
  - precision:  an array of length num_tags, where precision[i]
                gives the precision of class i

  Hints:  the confusion matrix may be useful
          be careful about zero division
  """

  precision = np.zeros(num_tags)

  #############################
  cm = confusion_matrix(true, pred, num_tags)
  for i in np.arange(num_tags):
    if (sum([cm[x][i] for x in np.arange(num_tags)])==0):
      precision[i] = 0
    else:
      precision[i] = cm[i][i]/sum([cm[x][i] for x in np.arange(num_tags)])
  #############################
  return precision

In [31]:
def recall(true, pred, num_tags):
  """
  Arguments:
  - true:       a list of true label values (integers)
  - pred:       a list of predicted label values (integers)
  - num_tags:   the number of possible tags
                true and pred will both contain integers between
                0 and num_tags - 1 (inclusive)

  Output: 
  - recall:     an array of length num_tags, where recall[i]
                gives the recall of class i

  Hints:  the confusion matrix may be useful
          be careful about zero division
  """

  """
  YOUR CODE HERE
  """
  recall = np.zeros(num_tags)

  #############################
  cm = confusion_matrix(true, pred, num_tags)
  for i in np.arange(num_tags):
    if (sum([cm[i][x] for x in np.arange(num_tags)])==0):
      recall[i] = 0
    else:
      recall[i] = cm[i][i]/sum([cm[i][x] for x in np.arange(num_tags)])
  #############################
  return recall

In [32]:
def f1_score(true, pred, num_tags):
  """
  Arguments:
  - true:       a list of true label values (integers)
  - pred:       a list of predicted label values (integers)
  - num_tags:   the number of possible tags
                true and pred will both contain integers between
                0 and num_tags - 1 (inclusive)

  Output: 
  - f1:         an array of length num_tags, where f1[i]
                gives the recall of class i
  """
  f1 = np.zeros(num_tags)

  #############################
  p = precision(true, pred, num_tags)
  r = recall(true, pred, num_tags)

  for i in np.arange(num_tags):
    if p[i]+r[i] == 0:
      f1[i] = 0
    else:
      f1[i]= (2*(p[i]*r[i])/(p[i]+r[i]))
  #############################
  return f1

In [33]:
def set_seed(seed):
  """
  Sets random seeds and sets model in deterministic
  training mode. Ensures reproducible results
  """
  torch.manual_seed(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
  np.random.seed(seed)

In [34]:
class POSTagger(nn.Module):
    def __init__(self,max_seq_len,embeddings,hidden_dim,n_layers,tagset_size):
        super().__init__()
        self.max_seq_len = max_seq_len
        self.num_labels = tagset_size
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding.from_pretrained(embeddings,padding_idx=0)
        self.lstm = nn.LSTM(input_size=embeddings.size()[1], hidden_size= self.hidden_dim , num_layers=n_layers)
        self.hidden2tag = nn.Linear(self.hidden_dim,self.num_labels)
        
    def forward(self,input_seq):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        input_seq = input_seq.to(device)
        embed_out =self.embeddings(input_seq)
        #packed_embeddings = pack_padded_sequence(embed_out, self.max_seq_len, batch_first=True, enforce_sorted=False)
        lstm_out,_ = self.lstm(embed_out)
        #padded_output, _ = pad_packed_sequence(lstm_out, batch_first=True)
        logits = self.hidden2tag(lstm_out)
        return logits
    
    def evaluate(self,loader):
        self.eval()
        true_labels = []
        pred_labels = []
        for i, data in enumerate(loader):
            x,y = data
            logits = self.forward(x)
            pred_label=torch.argmax(logits, dim=-1).cpu().numpy()
            batch_size, _ = x.shape
            for j in range(batch_size):
                tags = y[j]
                pred = pred_label[j]
                for k in range(len(tags)):
                    if tags[k] != 0:
                        true_labels.append(tags[k])
                        pred_labels.append(pred[k])
        acc = accuracy(true_labels, pred_labels)  
        return acc ,true_labels ,pred_labels          

    def run_training(self,train_loader,dev_loader,epochs=100,learning_rate=5e-4,eval_every=5):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if str(device) == 'cpu':
            print("Training only supported in GPU environment")
            return
        torch.cuda.empty_cache()
        self.to(device)
        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        loss_function = nn.CrossEntropyLoss(ignore_index=0)
        for epoch in range(epochs):
            self.train()
            total_loss = 0
            for i, data in enumerate(train_loader):
                x,y=data
                self.zero_grad()
                logits = self.forward(x)
                labels = torch.LongTensor(y).to(device)
                loss = loss_function(logits.view(-1, self.num_labels), labels.view(-1))
                total_loss += loss
                loss.backward()
                optimizer.step()
            print("Epoch {} | Loss: {}".format(epoch, total_loss))
            if epoch % eval_every == 0:
                acc,_,_ = self.evaluate(dev_loader)
                print("Epoch {} | Accuracy: {}".format(epoch, acc))         
    
    def predict(self,data):
        x = torch.LongTensor(data)
        self.eval()
        predictions = []
        logits = self.forward(x)
        pred_label=torch.argmax(logits, dim=-1).cpu().numpy()
        batch_size, _ = x.shape
        for j in range(batch_size):
            labels=[]
            for k in range(len(x[j])):
                if x[j][k] != 0:
                    labels.append(pred_label[j][k])
            predictions.append(labels)
        return predictions

    
    def save(self,path):
        torch.save(self.state_dict(), path)

    def load(self,path):
        self.load_state_dict(torch.load(path))

In [35]:
set_seed(159)
tagger = POSTagger(50,embeddings,128,2,len(tags))
tagger.run_training(train_dataloader,dev_dataloader,50,0.0005,5)

Epoch 0 | Loss: 238.99752807617188
Epoch 0 | Accuracy: 0.6560790273556231
Epoch 1 | Loss: 115.08439636230469
Epoch 2 | Loss: 71.01158142089844
Epoch 3 | Loss: 51.02167510986328
Epoch 4 | Loss: 40.90391159057617
Epoch 5 | Loss: 35.04903793334961
Epoch 5 | Accuracy: 0.9293313069908815
Epoch 6 | Loss: 31.07115936279297
Epoch 7 | Loss: 28.149417877197266
Epoch 8 | Loss: 25.518945693969727
Epoch 9 | Loss: 23.328821182250977
Epoch 10 | Loss: 21.715444564819336
Epoch 10 | Accuracy: 0.9519756838905775
Epoch 11 | Loss: 20.4924373626709
Epoch 12 | Loss: 19.319236755371094
Epoch 13 | Loss: 18.61332893371582
Epoch 14 | Loss: 17.782766342163086
Epoch 15 | Loss: 17.13642692565918
Epoch 15 | Accuracy: 0.95790273556231
Epoch 16 | Loss: 16.565166473388672
Epoch 17 | Loss: 16.338045120239258
Epoch 18 | Loss: 15.776994705200195
Epoch 19 | Loss: 15.49870777130127
Epoch 20 | Loss: 15.151619911193848
Epoch 20 | Accuracy: 0.9604863221884499
Epoch 21 | Loss: 14.860315322875977
Epoch 22 | Loss: 14.746910095214

In [36]:
tagger.save("pos_tagger.pt")

In [37]:
acc,_,_=tagger.evaluate(test_dataloader)
print(acc)

0.9650455927051672


In [38]:
def predictor(model,sentence,tags):
    tokens = sentence.split(" ")
    inv_map = {v: k for k, v in tags.items()}
    vec = [sent_to_vector(sentence)]
    predictions = model.predict(vec)
    for i in range(len(tokens)):
        print (tokens[i]+"\t"+inv_map[predictions[0][i]])


In [39]:
predictor(tagger,"i want a flight from nashville to seattle that arrives no later than 3 pm",tags)

i	PRON
want	VERB
a	DET
flight	NOUN
from	ADP
nashville	PROPN
to	ADP
seattle	PROPN
that	ADP
arrives	VERB
no	DET
later	ADV
than	ADP
3	NUM
pm	NOUN


In [109]:
tags

{'PAD': 0,
 'ADJ': 1,
 'ADP': 2,
 'ADV': 3,
 'AUX': 4,
 'CCONJ': 5,
 'DET': 6,
 'INTJ': 7,
 'NOUN': 8,
 'NUM': 9,
 'PART': 10,
 'PRON': 11,
 'PROPN': 12,
 'VERB': 13}