## How to treat for out of vocabulary words
* Replace missing words with UNK token
* Too many unknowns will make perplexity lower, 

In [20]:
from collections import Counter

In [21]:
# If count of occurence of a word is less than 3 then it's an unknown for us
vocabulary_target_size = 3

In [22]:
counts = Counter({"happy": 5,
          "because": 3,
          "i":2,
          "am":2,
          "learning": 3,
          ".": 1})

In [23]:
counts

Counter({'happy': 5, 'because': 3, 'i': 2, 'am': 2, 'learning': 3, '.': 1})

In [24]:
vocabulary = counts.most_common(vocabulary_target_size)

In [25]:
# Let's assume this is the final vocabulary
vocabulary

[('happy', 5), ('because', 3), ('learning', 3)]

## Replacing with UNK token

In [26]:
original = "am i learning".split()

output = []

for word in original:
    word = word if word in vocabulary else "<UNK>"
    output.append(word)

print(f"Original: {original}")
print(f"Processed: {output}")

Original: ['am', 'i', 'learning']
Processed: ['<UNK>', '<UNK>', '<UNK>']


## Add 1-K smoothing
* It does not do anything good, as probability for trigram with know probabilities and with unknown probabilities are the same

In [27]:
def add_k_smoothing(k: int, vocabulary_size: int, n_gram_count: int,
                    n_gram_prefix_count: int) -> float:
    return  ((n_gram_count + k)/
             (n_gram_prefix_count + k * vocabulary_size))

In [28]:
trigram_probabilities = {('i', 'am', 'happy') : 2}
bigram_probabilities = {( 'am', 'happy') : 10}
vocabulary_size = 5
k = 1

probability_known_trigram = add_k_smoothing(k, vocabulary_size, trigram_probabilities[('i', 'am', 'happy')], 
                           bigram_probabilities[( 'am', 'happy')])

probability_unknown_trigram = add_k_smoothing(k, vocabulary_size, 0, 0)

print(f"probability_known_trigram: {probability_known_trigram: 0.03f}")
print(f"probability_unknown_trigram: {probability_unknown_trigram: 0.03f}")

probability_known_trigram:  0.200
probability_unknown_trigram:  0.200


## Back Off method
* If probability is not known then replace the n gram probability with n-1 gram probability

In [29]:
# Precaculated probabilities
trigram_probabilities = {('i', 'am', 'happy'): 0}
bigram_probabilities = {( 'am', 'happy'): 0.3}
unigram_probabilities = {'happy': 0.4}

In [30]:
# Probability that we want to compute
trigram = ('are', 'you', 'happy')
bigram, unigram = trigram[1: 3], trigram[2]

In [31]:
lambda_factor = 0.4
probability_hat_trigram = 0

# search for first non-zero probability starting with the trigram
# to generalize this for any order of n-gram hierarchy, 
# you could loop through the probability dictionaries instead of if/else cascade
if trigram not in trigram_probabilities or trigram_probabilities[trigram] == 0:
    print(f"probability for trigram {trigram} not found")

    if bigram not in bigram_probabilities or bigram_probabilities[bigram] == 0:
        print(f"probability for bigram {bigram} not found")

        if unigram in unigram_probabilities:
            print(f"probability for unigram {unigram} found\n")
            probability_hat_trigram = lambda_factor * lambda_factor * unigram_probabilities[unigram]
        else:
            probability_hat_trigram = 0
    else:
        probability_hat_trigram = lambda_factor * bigram_probabilities[bigram]
else:
    probability_hat_trigram = trigram_probabilities[trigram]

print(f"probability for trigram {trigram} estimated as {probability_hat_trigram:0.3f}")

probability for trigram ('are', 'you', 'happy') not found
probability for bigram ('you', 'happy') not found
probability for unigram happy found

probability for trigram ('are', 'you', 'happy') estimated as 0.064


## Interpolation

In [32]:
# Precalculated probabilities
trigram_probabilities = {('i', 'am', 'happy'): 0.15}
bigram_probabilities = {( 'am', 'happy'): 0.3}
unigram_probabilities = {'happy': 0.4}

In [33]:
# Weights found from optimization on validation set
lambda_1 = 0.8
lambda_2 = 0.15
lambda_3 = 0.05

In [34]:
# Trigram bigram unigram that we'd like to estimate
trigram = ('i', 'am', 'happy')
bigram, unigram = trigram[1: 3], trigram[2]

In [35]:
probability_hat_trigram = lambda_1 * trigram_probabilities[trigram] 
+ lambda_2 * bigram_probabilities[bigram]
+ lambda_3 * unigram_probabilities[unigram]

print(f"estimated probability of the input trigram {trigram} is {probability_hat_trigram: 0.4f}")

estimated probability of the input trigram ('i', 'am', 'happy') is  0.1200
