## Create POS dataset from NLTK

In [17]:
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

import numpy as np

import torch.autograd as autograd
from tqdm.notebook import trange
from tqdm.notebook import tqdm
from collections import Counter

#### Download POS tagger

In [7]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tyler/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [8]:
text = nltk.word_tokenize("And now for something completely different")

In [9]:
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

#### Download and tag corpus

In [10]:
nltk.download('brown')

[nltk_data] Downloading package brown to /Users/tyler/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [11]:
nltk.corpus.brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [12]:
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())

In [13]:
all_text = ''.join(f'{word} ' for word in text)

In [14]:
all_text[:100]

"the fulton county grand jury said friday an investigation of atlanta's recent primary election produ"

In [15]:
sent_text = nltk.sent_tokenize(all_text)

In [18]:
tagged_sentences = [nltk.pos_tag(s.split()) for s in tqdm(sent_text)]

HBox(children=(FloatProgress(value=0.0, max=56534.0), HTML(value='')))




#### Decide which tags to keep

In [19]:
tag_counts = {}

tag_list = []
for sentence in tagged_sentences:
    for word,tag in sentence:
        tag_list.append(tag)
        
tag_counts = Counter(tag_list)

In [20]:
keep_tags = []
for key,item in tag_counts.most_common(18):
    if key != '.' and key != ',' and '$' not in key:
        keep_tags.append(key)

In [21]:
len(keep_tags)

15

#### Create training data in same format from tutorial

In [55]:
training_data = []
for sentence in tagged_sentences[:5000]:
    running_sentence,running_tag = [],[]
    for word,tag in sentence:
        if tag in keep_tags:
            running_sentence.append(word)
            running_tag.append(tag)
    if len(running_sentence) > 3:
        training_data.append((running_sentence,running_tag))

In [86]:
validation_data = []
for sentence in tagged_sentences[10000:12000]:
    running_sentence,running_tag = [],[]
    for word,tag in sentence:
        if tag in keep_tags:
            running_sentence.append(word)
            running_tag.append(tag)
    if len(running_sentence) > 3:
        validation_data.append((running_sentence,running_tag))

In [87]:
len(training_data),len(validation_data)

(4774, 1812)

In [88]:
#training_data[0]

## Show example

In [89]:
import tensorboard
from torch.utils.tensorboard import SummaryWriter
import numpy as np
from glob import glob

In [90]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [91]:
def get_run_version():
    files = glob('logs/*')
    return f'v{str(len(files))}'

In [93]:
log_dir = get_run_version()

writer = SummaryWriter(f'logs/{log_dir}')
print(log_dir)

v2


In [94]:
def prepare_sequence_oov(seq,to_ix):
    idxs = [to_ix[w] for w in seq if w in to_ix else to_ix['UNK']]
    return torch.tensor(idxs, dtype=torch.long)

def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

def prepare_sequence_char(seq, to_ix):
    sentence_tensor = []
    for word in seq:
        idxs = [to_ix[w] for w in word]
        word_tensor = torch.tensor(idxs, dtype=torch.long)
        sentence_tensor.append(word_tensor)
    return sentence_tensor

def sent_to_char(sent):
    char_list = []
    for word in sent:
        word_chars = []
        for character in word:
            word_chars.append(character)
        char_list.append(word_chars)
    return char_list

In [95]:
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

In [96]:
tag_to_ix = {tag:idx for idx,tag in enumerate(keep_tags)}

In [97]:
ix_to_tag = {ix:tag for tag,ix in tag_to_ix.items()}
ix_to_word = {ix:word for word,ix in word_to_ix.items()}

In [75]:
char_to_ix = {}
for sent, tags in training_data:
    for word in sent_to_char(sent):
        for char in word:
            if char not in char_to_ix:
                char_to_ix[char] = len(char_to_ix)

In [76]:
class char_lstm(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, vocab_size_char, tagset_size):
        super(char_lstm, self).__init__()
        self.hidden_dim = hidden_dim

        self.char_embeddings = nn.Embedding(vocab_size_char, embedding_dim)
        self.lstm_char = nn.LSTM(embedding_dim, hidden_dim)
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim + hidden_dim, hidden_dim)

        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
                
        self.hidden = self.init_hidden(self.hidden_dim)
        self.hidden_char = self.init_hidden(self.hidden_dim)

    def forward(self, sentence_in, char_in):
        num_words = sentence_in.shape[0]
        embeds = self.word_embeddings(sentence_in)
                
        char_output_list = []
        for word_chars in char_in:
            
            ## reset the hidden layer to remove make the representation for consecutive words independent
            self.hidden_char = self.init_hidden(self.hidden_dim)
            char_emb = self.char_embeddings(word_chars)
            lstm_out_1, self.hidden_char = self.lstm_char(char_emb.view(char_emb.shape[0], 1, -1),self.hidden_char)        
            
            ## this keeps only the last output from each charcter sequence
            char_output_list.append(lstm_out_1[-1])

        char_output_tensor = torch.cat(char_output_list)

        merged = torch.cat([embeds, char_output_tensor], dim=1) 
        
        self.hidden = self.init_hidden(self.hidden_dim)
        
        lstm_out_2, self.hidden = self.lstm(merged.view(num_words, 1, -1),self.hidden)
        lstm_out_2 = lstm_out_2.view(num_words,-1)
        
        tag_scores = self.hidden2tag(lstm_out_2)        
        tag_scores = F.log_softmax(tag_scores, dim=1)
        
        return tag_scores
    
    def init_hidden(self,size):
        return (torch.zeros(1, 1, size),torch.zeros(1, 1, size))

In [77]:
EMBEDDING_DIM,HIDDEN_DIM = 6,10
model = char_lstm(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(char_to_ix), len(tag_to_ix))

loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [78]:
## Try a forward pass
for sentence, tags in training_data:
    sentence_in = prepare_sequence(sentence, word_to_ix)
    char_in = prepare_sequence_char(sent_to_char(sentence), char_to_ix)
    targets = prepare_sequence(tags, tag_to_ix)
    tag_scores = model(sentence_in,char_in)
    break

In [102]:
EMBEDDING_DIM,HIDDEN_DIM = 6,10
model = char_lstm(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(char_to_ix), len(tag_to_ix))

loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in trange(num_epochs):
    model.zero_grad()
    train_loss,val_loss = [],[]
    for sentence, tags in training_data:
        
        sentence_in = prepare_sequence(sentence, word_to_ix)
        char_in = prepare_sequence_char(sent_to_char(sentence), char_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
        
        tag_scores = model(sentence_in,char_in)

        loss = loss_function(tag_scores, targets)
        train_loss.append(loss.item())
        
        loss.backward()
        optimizer.step()
        
#     with torch.no_grad():
#         for sentence, tags in validation_data:
#             sentence_in = prepare_sequence(sentence, word_to_ix)
#             char_in = prepare_sequence_char(sent_to_char(sentence), char_to_ix)
#             targets = prepare_sequence(tags, tag_to_ix)

#             tag_scores = model(sentence_in,char_in)

#             loss = loss_function(tag_scores, targets)
#             val_loss.append(loss.item())        
        
#     mean_val_loss = np.mean(val_loss)
#     writer.add_scalar('Loss - val',mean_val_loss, epoch)

    mean_train_loss = np.mean(train_loss)
    writer.add_scalar('Loss - train',mean_train_loss, epoch)
    
    #print(f'Epoch {epoch+1}/{num_epochs}: train loss of {mean_train_loss:.3f}')

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [103]:
# See what the scores are after training
with torch.no_grad():
    for sentence, tags in training_data[:5]:
        sentence_in = prepare_sequence(sentence, word_to_ix)
        char_in = prepare_sequence_char(sent_to_char(sentence), char_to_ix)
        tag_scores = model(sentence_in,char_in)

        preds_ix = np.argmax(tag_scores.detach().numpy(),axis=1)
        preds = [ix_to_tag[p] for p in preds_ix]
        print(sentence)
        
        print(f'Preds: {preds}')
        print(f'Tags:  {tags}')
        
        print(f'-------------------------------')

['the', 'fulton', 'county', 'grand', 'jury', 'said', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', 'no', 'evidence', 'that', 'any', 'irregularities', 'took', 'place']
Preds: ['DT', 'JJ', 'NN', 'JJ', 'NN', 'VBD', 'DT', 'NN', 'IN', 'JJ', 'JJ', 'RB', 'NN', 'VBN', 'DT', 'NN', 'IN', 'DT', 'JJ', 'VBD', 'VBN']
Tags:  ['DT', 'NN', 'NN', 'JJ', 'NN', 'VBD', 'DT', 'NN', 'IN', 'JJ', 'JJ', 'JJ', 'NN', 'VBD', 'DT', 'NN', 'IN', 'DT', 'NNS', 'VBD', 'NN']
-------------------------------
['the', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'city', 'executive', 'committee', 'had', 'over-all', 'charge', 'of', 'the', 'election', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'city', 'of', 'atlanta', 'for', 'the', 'manner', 'in', 'the', 'election', 'was', 'conducted']
Preds: ['DT', 'NN', 'RB', 'VBD', 'IN', 'VBN', 'NNS', 'IN', 'DT', 'NN', 'NN', 'NN', 'VBD', 'JJ', 'NN', 'IN', 'DT', 'NN', 'JJ', 'DT', 'NN', 'CC', 'NNS', 'IN', 'D

In [115]:
n_correct = 0
total = 0

with torch.no_grad():
    for sentence, tags in training_data:
        sentence_in = prepare_sequence(sentence, word_to_ix)
        char_in = prepare_sequence_char(sent_to_char(sentence), char_to_ix)
        tag_scores = model(sentence_in,char_in)

        preds_ix = np.argmax(tag_scores.detach().numpy(),axis=1)
        preds = [ix_to_tag[p] for p in preds_ix]
        
        for prediction,label in zip(preds,tags):
            if prediction == label:
                n_correct += 1
            total += 1
    
    acc = n_correct / total
    print(f'Train Accuracy of {round(acc*100,2)}')
    
    ## F1 would be a better metric here

Train Accuracy of 82.59


In [116]:
n_correct,total

(78553, 95108)