In [34]:
import numpy as np

In [29]:
# Load up text as list of lists of string tokens.
with open("15pctmasked.txt") as f:
    lines = f.readlines()
    sentences = []
    for line in lines:
        sentences.append(line.split())

In [39]:
# Load up bigram model as dict of dicts mapping to log probabilities, as well as vocabulary.
with open("lm.txt") as f:
    model = {}
    vocab = {}
    index = 0
    for line in f.readlines():
        components = line.split()
        model[components[0]] = {}
        model[components[0]][components[1]] = np.log(float(components[2]))
        if components[0] not in vocab:
            vocab[components[0]] = index
            index += 1
            
    # Fill out unseen words.
    for key1 in model:
        for key2 in vocab:
            if key2 not in model[key1]:
                model[key1][key2] = -np.inf

In [41]:
MASK = "<mask>"
START = "<start>"

In [49]:
def decode(sentence, model, vocab):
    """
    Decode '<mask>' tokens using Viterbi algorithm.
    """
    seq_len = len(sentence)
    decoded = []
    viterbi = np.zeros((len(vocab), seq_len - 1))
    
    # Set score functions s[index, word2, word2] based on masked and unmasked sentences.
    
    # Set first Viterbi variables.
    for k, key in enumerate(vocab):
        # If the word is masked, we fill in probabilities, otherwise 0 for the unmasked word and -infty otherwise.
        if sentence[1] == MASK:
            viterbi[k, 0] = model[START][key]
        else:
            viterbi[k, 0] = 0 if key == sentence[1] else -np.inf
        
    # Forward pass.
    for i in range(1, seq_len - 1):
        for k, key in enumerate(vocab):
            # If the word is masked, we fill in probabilities, otherwise 0 for the unmasked word and -infty otherwise.
            viterbi[k, i] = model[sentence[i]][key] + viterbi[k, i - 1]
            
        
    return decoded

In [50]:
sentence = sentences[0]

print(sentence)

decode(sentence, model, vocab)

['<start>', 'I', '<mask>', 'p', '<mask>', '<mask>', 'm', '<mask>', 'n', 't', 'a', 't', 'i', 'o', '<mask>', '<s>', 'o', 'f', '<s>', 'G', 'e', 'o', 'r', 'g', 'i', 'a', "'", '<mask>', '<s>', 'a', 'u', '<mask>', 'o', 'm', 'o', 'b', 'i', 'l', 'e', '<s>', '<mask>', 'i', 't', 'l', 'e', '<mask>', 'l', 'a', 'w', '<s>', 'w', 'a', 's', '<mask>', 'a', 'l', '<mask>', '<mask>', '<s>', '<mask>', 'e', 'c', 'o', 'm', 'm', 'e', 'n', 'd', 'e', 'd', '<s>', 'b', '<mask>', '<s>', '<mask>', 'h', 'e', '<s>', 'o', 'u', 't', 'g', 'o', 'i', '<mask>', 'g', '<s>', 'j', '<mask>', 'r', 'y', '<mask>', '.', '<eos>']
[-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf
 -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf
 -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf   0.
 -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf
 -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf
 -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -in

[]