# Finding Probability of Unigram, Bigram and Trigrams from Book

In [51]:
import numpy as np
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter
from nltk.tokenize import wordpunct_tokenize
import string

In [52]:
mytext = open('alicesadventuresinwonderland.txt').read()
mytext[:300]

"Project Gutenberg's Alice's Adventures in Wonderland, by Lewis Carroll\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever.  You may copy it, give it away or\nre-use it under the terms of the Project Gutenberg License included\nwith this eBook or online "

In [53]:
len(mytext)

163793

In [54]:
mytext = mytext.replace('\n', ' ')
mytext = mytext.replace('  ', ' ')
mytext = mytext.lower()
len(mytext)

162143

In [55]:
# Removing Punctuation
for i in list(string.punctuation):
    if i in mytext:
        mytext = mytext.replace(i, '')
len(mytext)

153227

In [56]:
mytoken = word_tokenize(mytext)
len(mytoken)

29389

In [57]:
one_gram = list(ngrams(mytoken, 1, 
    pad_left=True,
    pad_right=True,
    left_pad_symbol='_',
    right_pad_symbol='_'))

count_one = Counter(one_gram)
one_dict =  {}
for k, v in count_one.items():
    one_dict[' '.join(k)] = v
print(Counter(one_dict))



In [58]:
two_gram = list(ngrams(mytoken, 2, 
    pad_left=True,
    pad_right=True,
    left_pad_symbol='_',
    right_pad_symbol='_'))

count_two = Counter(two_gram)
two_dict =  {}
for k, v in count_two.items():
    two_dict[' '.join(k)] = v
print(Counter(two_dict))



In [62]:
vocabulary_size = len(one_dict)
vocabulary_size

3248

In [59]:
three_gram = list(ngrams(mytoken, 3, 
    pad_left=True,
    pad_right=True,
    left_pad_symbol='_',
    right_pad_symbol='_'))

count_three = Counter(three_gram)
three_dict =  {}
for k, v in count_three.items():
    three_dict[' '.join(k)] = v
print(Counter(three_dict))



### UniGram Probability

In [93]:
def get_unigram_probability(word):
    unigram_probability = one_dict[word]/vocabulary_size
    return unigram_probability

# print(word,float('%.3g' % unigram_probability))
    
## print(word,round((fdist1[word]/total_words),3))
get_unigram_probability('the')

0.5554187192118226

### Bigram Probability

In [94]:
# Get probability of given bigram belonging to the language which bigram_dict is in
def get_bigram_probability(bigram, first_word): 
    # first_word is the first word of the word bigram.
    bigram_count = two_dict.get(bigram)
    if bigram_count is None:
        bigram_count = 0
    
    onegram_count = one_dict.get(first_word)
    if onegram_count is None:
        onegram_count = 0
    
    return (bigram_count + 1) / (onegram_count + 1)# + vocabulary_size) 

In [95]:
get_bigram_probability('about her', 'about')

0.07766990291262135

In [96]:
get_bigram_probability('she was', 'she')

0.1038961038961039

In [97]:
get_bigram_probability('said alice', 'said')

0.2505399568034557

In [98]:
# To get the logic of this formula, note how the proability is used in the function below. 
# Without the + 1 in the Nr, if you find a bigram which is not in our known bigrams for a language,
# the probability of it being in that language would become 0. So we would like to assign a small probability of 
# 1 / vocabulary_size in that case. 

### Trigram Probability

In [103]:
# Get probability of given bigram belonging to the language which bigram_dict is in
def get_trigram_probability(trigram, bigram): 
    # first_word is the first word of the word bigram.
    trigram_count = three_dict.get(trigram)
    if trigram_count is None:
        trigram_count = 0
    
    bigram_count = two_dict.get(bigram)
    if bigram_count is None:
        bigram_count = 0
    
    return (trigram_count + 1) / (bigram_count + 1 )# + vocabulary_size) 

get_trigram_probability('she was there', 'she was')

0.017857142857142856

In [104]:
get_trigram_probability('she died there', 'she died')

1.0