In [1]:
import nltk
from nltk.corpus import brown

test_sentence_tokens = ['a','fact','about','the','unicorn','is','the','same','as','an','alternative','fact','about','the','unicorn','.']

words = brown.words()
fdist1 = nltk.FreqDist(w.lower() for w in words)

total_words = len(words)

In [2]:
fdist1

FreqDist({'the': 69971, ',': 58334, '.': 49346, 'of': 36412, 'and': 28853, 'to': 26158, 'a': 23195, 'in': 21337, 'that': 10594, 'is': 10109, ...})

In [3]:
print('Frequency of tokens in sample sententence in Brown according to NLTK:')

for word in test_sentence_tokens:
    print(word,fdist1[word])


Frequency of tokens in sample sententence in Brown according to NLTK:
a 23195
fact 447
about 1815
the 69971
unicorn 0
is 10109
the 69971
same 686
as 7253
an 3740
alternative 34
fact 447
about 1815
the 69971
unicorn 0
. 49346


In [4]:
# input('Pausing: Hit Return when Ready.')

print('Given that there are',total_words,'in the Brown Corpus, the unigram probability of these words')
print('is as follows (rounded to 3 significant digits):')

for word in test_sentence_tokens:
    unigram_probability = fdist1[word]/total_words
    print(word,float('%.3g' % unigram_probability))
    ## print(word,round((fdist1[word]/total_words),3))
    

Given that there are 1161192 in the Brown Corpus, the unigram probability of these words
is as follows (rounded to 3 significant digits):
a 0.02
fact 0.000385
about 0.00156
the 0.0603
unicorn 0.0
is 0.00871
the 0.0603
same 0.000591
as 0.00625
an 0.00322
alternative 2.93e-05
fact 0.000385
about 0.00156
the 0.0603
unicorn 0.0
. 0.0425


In [5]:
words2 = []
previous = 'EMPTY'
sentences = 0
for word in words:
    if previous in ['EMPTY','.','?','!']:
        ## insert word_boundaries at beginning of Brown,
        ## and after end-of-sentence markers (overgenerate due to abbreviations, etc.)
        words2.append('*start_end*')
    if fdist1[word]==1:
        ## words occurring only once are treated as Out of Vocabulary Words
        words2.append('*oov*')
    else:
        words2.append(word)
    previous = word

In [6]:
words2.append('*start_end*')

In [7]:
fdist2 = nltk.FreqDist(w.lower() for w in words2)

In [8]:
print('There are',fdist2['*oov*'],'instances of OOVs')

print('Unigram probabilities including OOV probabilities.')

There are 15673 instances of OOVs
Unigram probabilities including OOV probabilities.


In [9]:
def get_unigram_probability(word):
    if word in fdist1:
        unigram_probability = fdist2[word]/total_words
    else:
        unigram_probability = fdist2['*oov*']/total_words
    return(unigram_probability)

In [10]:
for word in test_sentence_tokens:
    unigram_probability = get_unigram_probability(word)
    print(word,float('%.3g' % unigram_probability))

# input('Pausing: Hit Return when Ready.')
## make new version that models Out of Vocabulary (OOV) words

print('Calculating bigram counts for sentence, including bigrams with sentence boundaries, i.e., *BEGIN* and *END*')
print('Assuming some idealizations: all periods, questions and exclamation marks end sentences;')

a 0.02
fact 0.000385
about 0.00156
the 0.0603
unicorn 0.0135
is 0.00871
the 0.0603
same 0.000591
as 0.00625
an 0.00322
alternative 2.93e-05
fact 0.000385
about 0.00156
the 0.0603
unicorn 0.0135
. 0.0425
Calculating bigram counts for sentence, including bigrams with sentence boundaries, i.e., *BEGIN* and *END*
Assuming some idealizations: all periods, questions and exclamation marks end sentences;


In [11]:
print('Calculating bigram counts for sentence, including bigrams with sentence boundaries, i.e., *BEGIN* and *END*')
print('Assuming some idealizations: all periods, questions and exclamation marks end sentences;')

Calculating bigram counts for sentence, including bigrams with sentence boundaries, i.e., *BEGIN* and *END*
Assuming some idealizations: all periods, questions and exclamation marks end sentences;


<generator object bigrams at 0x000001E6EE108B10>

In [26]:
print('Calculating bigram counts for sentence, including bigrams with sentence boundaries, i.e., *BEGIN* and *END*')
print('Assuming some idealizations: all periods, questions and exclamation marks end sentences;')

bigrams = nltk.bigrams(w.lower() for w in words2)
## get bigrams for words2 (words plus OOV)
# print(*map(' '.join, bigrams), sep=', ')
cfd = nltk.ConditionalFreqDist(bigrams)

Calculating bigram counts for sentence, including bigrams with sentence boundaries, i.e., *BEGIN* and *END*
Assuming some idealizations: all periods, questions and exclamation marks end sentences;


<ConditionalFreqDist with 34144 conditions>

In [None]:
for token1 in cfd:
    if not '*oov*' in cfd[token1]:
        cfd[token1]['*oov*']=1
    for a in cfd[token1]:
        print(a)
    print("Iteration Done")

In [15]:
def multiply_list(inlist):
    out = 1
    for number in inlist:
        out *= number
    return(out)

In [16]:
def get_bigram_probability(first,second):
    if not second in cfd[first]:
        print('Backing Off to Unigram Probability for',second)
        unigram_probability = get_unigram_probability(second)
        return(unigram_probability)
    else:
        bigram_frequency = cfd[first][second]
    unigram_frequency = fdist2[first]
    bigram_probability = bigram_frequency/unigram_frequency
    return(bigram_probability)

In [17]:
def calculate_bigram_freq_of_sentence_token_list(tokens):
    prob_list = []
    ## assume that 'START' precedes the first token
    previous = '*start_end*'
    for token in tokens:
        if not token in fdist2:
            token = '*oov*'
        next_probability = get_bigram_probability(previous,token)
        print(previous,token,(float('%.3g' % next_probability)))
        prob_list.append(next_probability)
        previous = token
    ## assume that 'END' follows the last token
    next_probability = get_bigram_probability(previous,'*start_end*')
    print(previous,'*start_end*',next_probability)
    prob_list.append(next_probability)
    probability = multiply_list(prob_list)
    print('Total Probability',float('%.3g' % probability))
    return(probability)

In [18]:
result = calculate_bigram_freq_of_sentence_token_list(test_sentence_tokens)

*start_end* a 0.0182
a fact 0.000388
fact about 0.00447
about the 0.182
the *oov* 0.0293
*oov* is 0.00485
is the 0.0786
the same 0.00898
same as 0.035
as an 0.029
an alternative 0.00241
Backing Off to Unigram Probability for fact
alternative fact 0.000385
fact about 0.00447
about the 0.182
the *oov* 0.0293
*oov* . 0.0865
. *start_end* 1.0
Total Probability 1.12e-30
