# Find the bigram probabilities of the sentence tokens

In [1]:
import nltk

In [2]:
from nltk.corpus import brown
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     /Users/raisaurabh04/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [3]:
# Corpus
words = brown.words()
words

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [4]:
words=[w.lower() for w in words]

words[0:10]

['the',
 'fulton',
 'county',
 'grand',
 'jury',
 'said',
 'friday',
 'an',
 'investigation',
 'of']

In [80]:
len(words)

1161192

In [10]:
# Unigram frequency 
uni_freq = nltk.FreqDist(w.lower() for w in words)
uni_freq

FreqDist({'the': 69971, ',': 58334, '.': 49346, 'of': 36412, 'and': 28853, 'to': 26158, 'a': 23195, 'in': 21337, 'that': 10594, 'is': 10109, ...})

In [11]:
# Size of corpus
total_words = len(words)
total_words

1161192

In [12]:
print('Frequency of tokens of the sample sentence:',total_words)

Frequency of tokens of the sample sentence: 1161192


In [13]:
#Sentence 
test_sentence_tokens=['this','is','a','sunny','day','.','however','i','am','not','feeling','well','lots','of','cold']

In [14]:
for word in test_sentence_tokens:
    print(f'Frequency of \"{word}\" is {uni_freq[word]}')
print('\n\n')

Frequency of "this" is 5145
Frequency of "is" is 10109
Frequency of "a" is 23195
Frequency of "sunny" is 13
Frequency of "day" is 687
Frequency of "." is 49346
Frequency of "however" is 552
Frequency of "i" is 5164
Frequency of "am" is 237
Frequency of "not" is 4610
Frequency of "feeling" is 172
Frequency of "well" is 897
Frequency of "lots" is 42
Frequency of "of" is 36412
Frequency of "cold" is 171





In [59]:
# Creating bigrams

bigram_words = []
previous = 'EMPTY'
sentences = 0
for word in words:
    #print(f"Word is {word}")
    if previous in ['EMPTY','.','?','!']:
        ## insert word_boundaries at beginning of Brown,
        bigram_words.append('*start_end*')
        bigram_words.append(word)
        #print(f"Previous, word in if block is {previous}, {word}")
    else:
        bigram_words.append(word)
        #print(f"Previous, word in else block is {previous}, {word}")
    previous = word
    #print(bigram_words)

bigram_words.append('*start_end*') ## assume one additional *start_end* at the end of Brown

In [60]:
updated_uni_freq  = nltk.FreqDist(w.lower() for w in bigram_words)


print('Calculating bigram probalities for sentence, including bigrams with sentence boundaries, i.e., *start_end*')

Calculating bigram probalities for sentence, including bigrams with sentence boundaries, i.e., *start_end*


In [19]:
from collections import Counter

In [20]:
freq = Counter(bigram_words)
freq['the']

69971

In [62]:
updated_uni_freq

FreqDist({'the': 69971, ',': 58334, '*start_end*': 55636, '.': 49346, 'of': 36412, 'and': 28853, 'to': 26158, 'a': 23195, 'in': 21337, 'that': 10594, ...})

In [63]:
updated_uni_freq["."]

49346

In [64]:
# Bigram corpus
bigrams = nltk.bigrams(w.lower() for w in bigram_words)

In [65]:
# Bigram probabilities
conditional_freq = nltk.ConditionalFreqDist(bigrams)
conditional_freq

<ConditionalFreqDist with 49816 conditions>

In [86]:
conditional_freq['is']['a'] / updated_uni_freq['is']

0.08576515975863093

In [81]:
updated_uni_freq['this']

5145

In [66]:
# Function to calculate bigram probability
def get_bigram_probability(first,second):
    
    bigram_freq = conditional_freq[first][second]
    unigram_freq = updated_uni_freq[first]

    #print(f"First Token is \"{first}\", previous token is \"{second}\",unigram          freq. is {unigram_freq}")
    bigram_prob = (bigram_freq)/(unigram_freq)
    return bigram_prob


In [68]:
## Calculating the bigram probability

prob_list=[]
previous = '*start_end*'

for token in test_sentence_tokens:
    next_probability = get_bigram_probability(previous,token)
    print(previous,token,(float('%.3g' % next_probability)))
    previous = token
    prob_list.append(next_probability)
  


*start_end* this 0.0196
this is 0.0842
is a 0.0858
a sunny 4.31e-05
sunny day 0.154
day . 0.162
. however 0.0
however i 0.0
i am 0.0401
am not 0.105
not feeling 0.0
feeling well 0.0
well lots 0.0
lots of 0.714
of cold 0.000137


In [29]:
# Creating bigrams

bigram_words = []
previous = 'EMPTY'
sentences = 0
for word in words:
    if previous in ['EMPTY','.','?','!']:
        ## insert word_boundaries at beginning of Brown,
        bigram_words.append('*start_end*')
    else:
        bigram_words.append(previous)
    
    previous = word


    
    
bigram_words.append('*start_end*') ## assume one additional *start_end* at the end of Brown

updated_uni_freq  = nltk.FreqDist(w.lower() for w in bigram_words)


print('Calculating bigram probalities for sentence, including bigrams with sentence boundaries, i.e., *start_end*')


# Bigram corpus
bigrams = nltk.bigrams(w.lower() for w in bigram_words)


# Bigram probabilities
conditional_freq = nltk.ConditionalFreqDist(bigrams)



# Code begins here


# Function to calculate bigram probability
def get_bigram_probability(first,second):
    
    bigram_freq = conditional_freq[first][second]
    unigram_freq = updated_uni_freq[first]

    bigram_prob = (bigram_freq)/(unigram_freq) #without Laplacian smoothing

    
    return bigram_prob

## Calculating the bigram probability

prob_list=[]
previous = '*start_end*'

for token in test_sentence_tokens:
    next_probability = get_bigram_probability(previous,token)
    print(previous,token,(float('%.3g' % next_probability)))
    previous = token
    prob_list.append(next_probability)


    
# For the final term    
next_probability = get_bigram_probability(previous,'*start_end*')
print(previous,'*start_end*',next_probability)
prob_list.append(next_probability)    

print(prob_list)    


Calculating bigram probalities for sentence, including bigrams with sentence boundaries, i.e., *start_end*
*start_end* this 0.0196
this is 0.0842
is a 0.0858
a sunny 4.31e-05
sunny day 0.154
day . 0.0


ZeroDivisionError: division by zero

##Find the perplexity and total probabilities of the given sentences

In [None]:
prob_list=[0.1, 0.023 ,0.09]


perplexity=1

# Calculating N
N=len(prob_list)-1


# Calculating the perplexity
for val in prob_list:
    perplexity = perplexity * (1/val)

perplexity = pow(perplexity, 1/float(N)) 

print("Perplexity= :",perplexity)


In [87]:


"""For the sentence: 'this is a sunny day' """ 
prob_list_1=[0.008303975842979365, 0.05030826140567201, 0.08609535184632229, 4.5083630133898384e-05, 0.15384615384615385]



total_prob_1 = 1

# Multiplying all the values of the probability and storing it
for val in prob_list_1:
    total_prob_1 *= val


print("For the sentence- 'this is a sunny day'")
print("Total probability:",total_prob_1)


perplexity_1=1

# Calculating N
N=len(prob_list_1)-1


# Calculating the perplexity
for val in prob_list_1:
    perplexity_1 = perplexity_1 * (1/val)

perplexity_1 = pow(perplexity_1, 1/float(N)) 

print("Perplexity:",perplexity_1)



"""For the sentence: 'this place is beautiful' """
prob_list_2=[0.008303975842979365, 0.0022194821208384712, 0.02185792349726776, 9.953219866626854e-05]

total_prob_2 = 1

# Multiplying all the values of the probability and storing it
for val in prob_list_2:
    total_prob_2 *= val

print("\n\nFor the sentence- 'this place is beautiful'")    
print("Total probability: ",total_prob_2)


perplexity_2=1

# Calculating N
N=len(prob_list_2)-1

# Calculating perplexity
for val in prob_list_2:
    perplexity_2 = perplexity_2 * (1/val)

perplexity_2 = pow(perplexity_2, 1/float(N)) 

print("Perplexity: ",perplexity_2)



For the sentence- 'this is a sunny day'
Total probability: 2.494655687321879e-10
Perplexity: 251.6212681454414


For the sentence- 'this place is beautiful'
Total probability:  4.009684736463708e-11
Perplexity:  2921.6616783932823


In [None]:
# Naive Bayes Model:


##Calculate the probability using Laplace smoothing

In [None]:
import nltk
from nltk.corpus import brown

# Corpus
words = brown.words()
words=[w.lower() for w in words]

# Unigram frequency 
uni_freq = nltk.FreqDist(w.lower() for w in words)

# Size of corpus
total_words = len(words)

print('Frequency of tokens of the sample sentence:')

for word in test_sentence_tokens:
    print(word,uni_freq[word])

    
# Creating bigrams

bigram_words = []
previous = 'EMPTY'
sentences = 0
for word in words:
    if previous in ['EMPTY','.','?','!']:
        ## insert word_boundaries at beginning of Brown,
        bigram_words.append('*start_end*')
    else:
        bigram_words.append(word)
    
    previous = word


    
    
bigram_words.append('*start_end*') ## assume one additional *start_end* at the end of Brown

updated_uni_freq  = nltk.FreqDist(w.lower() for w in bigram_words)


print('\nCalculating bigram counts for sentence, including bigrams with sentence boundaries, i.e., *BEGIN* and *END*')


# Bigram corpus
bigrams = nltk.bigrams(w.lower() for w in bigram_words)


# Bigram probabilities
conditional_freq = nltk.ConditionalFreqDist(bigrams)

#Sentence 
test_sentence_tokens=['sunset','looks','magnificient','.']

# Code begins here



V=len(set(words))


# Function to calculate bigram probability
def get_bigram_probability(first,second):
    
    bigram_freq = conditional_freq[first][second]
    unigram_freq = updated_uni_freq[first]

    bigram_prob = (bigram_freq + 1)/(unigram_freq + V) # with Laplacian Smoothing
    
    return bigram_prob

# Calculating the bigram probability

prob_list=[]
previous = '*start_end*'
for token in test_sentence_tokens:
    next_probability = get_bigram_probability(previous,token)
    print(previous,token,(float('%.3g' % next_probability)))
    previous = token
    prob_list.append(next_probability)

    
# For the final term    
next_probability = get_bigram_probability(previous,'*start_end*')
print(previous,'*start_end*',next_probability)
prob_list.append(next_probability)    

print(prob_list)    



# Calculating the total probability

total_prob = 1
for val in prob_list:
    total_prob *= val

print("\nTotal probability:",total_prob)

##Calculate the probability using Backoff method

In [None]:

import nltk
from nltk.corpus import brown

#Sentence 
test_sentence_tokens=['this','is','a','very','sunny','day','.']


# Corpus
words = brown.words()
words=[w.lower() for w in words]

# Unigram frequency 
uni_freq = nltk.FreqDist(w.lower() for w in words)

# Size of corpus
total_words = len(words)

print('Frequency of tokens of the sample sentence:')

for word in test_sentence_tokens:
    print(word,uni_freq[word])

    
# Creating bigrams

bigram_words = []
previous = 'EMPTY'
sentences = 0
for word in words:
    if previous in ['EMPTY','.','?','!']:
        ## insert word_boundaries at beginning of Brown,
        bigram_words.append('*start_end*')
    else:
        bigram_words.append(word)
    
    previous = word


    
    
bigram_words.append('*start_end*') ## assume one additional *start_end* at the end of Brown

updated_uni_freq  = nltk.FreqDist(w.lower() for w in bigram_words)


print('\nCalculating bigram counts for sentence, including bigrams with sentence boundaries, i.e., *BEGIN* and *END*')


# Bigram corpus
bigrams = nltk.bigrams(w.lower() for w in bigram_words)


# Bigram probabilities
conditional_freq = nltk.ConditionalFreqDist(bigrams)


# Code begins here


V=len(set(words))


# Function to calculate bigram probability
def get_bigram_probability(first,second):

    if not second in conditional_freq[first]:
        print('Backing Off to Unigram Probability for',second)
        unigram_prob = updated_uni_freq[second]/len(words)
        return unigram_prob 
    

    bigram_freq = conditional_freq[first][second]
    unigram_freq = updated_uni_freq[first]
    bigram_prob = bigram_freq/unigram_freq
    
    return bigram_prob


# Calculating the bigram probability

prob_list=[]
previous = '*start_end*'
for token in test_sentence_tokens:
    next_probability = get_bigram_probability(previous,token)
    print(previous,token,(float('%.3g' % next_probability)))
    previous = token
    prob_list.append(next_probability)

    
# For the final term    
next_probability = get_bigram_probability(previous,'*start_end*')
print(previous,'*start_end*',next_probability)
prob_list.append(next_probability)    

print(prob_list)    



# Calculating the total probability

total_prob = 1
for val in prob_list:
    total_prob *= val

print("\nTotal probability:",total_prob)