## Apply various concepts for language generalization on out of vocabulary words

### Filter vocabulary: Select words with M most common words

In [1]:
#Build a vocabulary with M most common words 
from collections import Counter

#Set M
M = 4

#Build a sample word counts
word_counts = {'happy': 5, 'because': 3, 'I': 2, 'am': 2, 'learning': 3, '.': 1}

#Create vocab with condition of most common
vocab = Counter(word_counts).most_common(M)
print(vocab)

#Get only words, removing fequency
vocab = [word[0] for word in vocab]
print(vocab)

### Replace the unknown words in a sentence with unknown token

In [2]:
#Create a sentence
sentence = "I am learning NLP"

#Convert in list
sentence = sentence.split()
print(sentence)

#Iterate and create output sentence with unknown word replaced with unknown tag
unknown_tag = '<UNK>'

output_sentence = [word if word in vocab else unknown_tag for word in sentence]
print(output_sentence)

In [3]:
#There will be problem with too many unknown words and result in high perplexity
# many <unk> low perplexity 
training_set = ['i', 'am', 'happy', 'because','i', 'am', 'learning', '.']
training_set_unk = ['i', 'am', '<UNK>', '<UNK>','i', 'am', '<UNK>', '<UNK>']

test_set = ['i', 'am', 'learning']
test_set_unk = ['i', 'am', '<UNK>']

M = len(test_set)

prob = 1
prob_unk = 1

# pre-calculated probabilities
bigram_probabilities = {('i', 'am'): 1.0, ('am', 'happy'): 0.5, ('happy', 'because'): 1.0, ('because', 'i'): 1.0, ('am', 'learning'): 0.5, ('learning', '.'): 1.0}
bigram_probabilities_unk = {('i', 'am'): 1.0, ('am', '<UNK>'): 1.0, ('<UNK>', '<UNK>'): 0.5, ('<UNK>', 'i'): 0.25}

#Go through the list and calculate bigram probabilities
for i in range(len(test_set) - 2 + 1):
    
    bigram = tuple(test_set[i: i + 2])
    prob = prob * bigram_probabilities[bigram]
    
    bigram_unk = tuple(test_set_unk[i: i + 2])
    prob_unk = prob_unk * bigram_probabilities_unk[bigram_unk]
    
    
# calculate perplexity for both original test set and test set with <UNK>
perplexity = prob ** (-1 / M)
perplexity_unk = prob_unk ** (-1 / M)

print(f"perplexity for the training set: {perplexity}")
print(f"perplexity for the training set with <UNK>: {perplexity_unk}")


### Add k-smoothing

In [4]:
#Define a function to have n-gram probability with k-smoothing

def add_k_smoothing(k, vocab_size, n_gram_count, n_gram_prefix_count):
    
    numerator = n_gram_count + k
    denominator = n_gram_prefix_count + k * vocab_size
    return numerator / denominator

trigram_probabilities = {('i', 'am', 'happy') : 2}
bigram_probabilities = {( 'am', 'happy') : 10}
vocabulary_size = 5
k = 1

probability_known_trigram = add_k_smoothing(k, vocabulary_size, trigram_probabilities[('i', 'am', 'happy')], 
                           bigram_probabilities[( 'am', 'happy')])

probability_unknown_trigram = add_k_smoothing(k, vocabulary_size, 0, 0)

print(f"probability_known_trigram: {probability_known_trigram}")
print(f"probability_unknown_trigram: {probability_unknown_trigram}")


### Thus for larger corpus, the probability for known and unknown will be very same to each other. So we use Back-off which levarages the missing info about high-order by multiplying lambda_factor with lower order

In [5]:
# pre-calculated probabilities of all types of n-grams
trigram_probabilities = {('i', 'am', 'happy'): 0}
bigram_probabilities = {( 'am', 'happy'): 0.3}
unigram_probabilities = {'happy': 0.4}

# this is the input trigram we need to estimate
trigram = ('are', 'you', 'happy')

# find the last bigram and unigram of the input
bigram = trigram[1: 3]
unigram = trigram[2]
print(f"besides the trigram {trigram} we also use bigram {bigram} and unigram ({unigram})\n")

# 0.4 is used as an example, experimentally found for web-scale corpuses when using the "stupid" back-off
lambda_factor = 0.4
probability_hat_trigram = 0

# search for first non-zero probability starting with trigram
# to generalize this for any order of n-gram hierarchy, 
# you could loop through the probability dictionaries instead of if/else cascade
if trigram not in trigram_probabilities or trigram_probabilities[trigram] == 0:
    print(f"probability for trigram {trigram} not found")
    
    if bigram not in bigram_probabilities or bigram_probabilities[bigram] == 0:
        print(f"probability for bigram {bigram} not found")
        
        if unigram in unigram_probabilities:
            print(f"probability for unigram {unigram} found\n")
            probability_hat_trigram = lambda_factor * lambda_factor * unigram_probabilities[unigram]
        else:
            probability_hat_trigram = 0
    else:
        probability_hat_trigram = lambda_factor * bigram_probabilities[bigram]
else:
    probability_hat_trigram = trigram_probabilities[trigram]

print(f"probability for trigram {trigram} estimated as {probability_hat_trigram}")

besides the trigram ('are', 'you', 'happy') we also use bigram ('you', 'happy') and unigram (happy)

probability for trigram ('are', 'you', 'happy') not found
probability for bigram ('you', 'happy') not found
probability for unigram happy found

probability for trigram ('are', 'you', 'happy') estimated as 0.06400000000000002


### The other method is interpolation, which considers all orders probability rather than highest order, in case of missing value in vocabulory

In [6]:
# pre-calculated probabilities of all types of n-grams
trigram_probabilities = {('i', 'am', 'happy'): 0.15}
bigram_probabilities = {( 'am', 'happy'): 0.3}
unigram_probabilities = {'happy': 0.4}

# the weights come from optimization on a validation set
lambda_1 = 0.8
lambda_2 = 0.15
lambda_3 = 0.05

# this is the input trigram we need to estimate
trigram = ('i', 'am', 'happy')

# find the last bigram and unigram of the input
bigram = trigram[1: 3]
unigram = trigram[2]

# in the production code, you would need to check if the probability n-gram dictionary contains the n-gram
probability_hat_trigram = lambda_1 * trigram_probabilities[trigram] 
+ lambda_2 * bigram_probabilities[bigram]
+ lambda_3 * unigram_probabilities[unigram]

print(f"Estimated probability of the input trigram {trigram} is {probability_hat_trigram}")

Estimated probability of the input trigram ('i', 'am', 'happy') is 0.12
