In [None]:
import nltk
from nltk import bigrams, trigrams, WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.corpus import gutenberg

# Toy sentence used to try out the tokenizer before moving on to a real corpus.
text = "This is a sample sentence to tokenize."

# Tokenize by whitespace. This `tokenizer` instance is reused by later cells.
tokenizer = WhitespaceTokenizer()
tokenizer.tokenize(text)  # last expression of the cell, so the result is displayed

In [None]:
# Fetch the NLTK resources, then load Hamlet from the Gutenberg corpus as one raw string.
for resource in ('gutenberg', 'punkt'):
    nltk.download(resource)
corpus = gutenberg.raw('shakespeare-hamlet.txt')  # use some shakespeare text
corpus[:200]  # preview the first 200 characters

In [None]:
# Report the corpus size rather than echoing the entire play into the cell
# output, which bloats the notebook and buries the following cells.
len(corpus)

In [None]:
# Lowercase first so that case variants of a word share one token, then split
# on whitespace (punctuation stays attached to the neighbouring word).
corpus_lower = corpus.lower()
raw_tokens = tokenizer.tokenize(corpus_lower)

In [None]:
# Build the final token stream: a copy of raw_tokens with an '<EOS>' marker
# inserted right after every token whose last character is a full stop.
tokens = []
for token in raw_tokens:
    if token[-1] == '.':
        tokens.extend((token, '<EOS>'))
    else:
        tokens.append(token)

In [None]:
# Create count tables for unigrams, bigrams and trigrams over the EOS-annotated tokens.
# Keys are plain tokens for the unigram table and tuples of tokens for the other two;
# because '<EOS>' sits inline in `tokens`, n-grams containing the marker are counted too.
unigram_model = FreqDist(tokens)
bigram_model = FreqDist(bigrams(tokens))
trigram_model = FreqDist(trigrams(tokens))

In [None]:
# Inspect the unigram table (token -> number of occurrences in `tokens`).
unigram_model

In [None]:
# Inspect the bigram table ((token, next_token) -> number of occurrences).
bigram_model

In [None]:
# Inspect the trigram table ((token, token, token) -> number of occurrences).
trigram_model

In [None]:
# Count of the bigram "my lord"; bigram keys are (previous, current) tuples.
bigram_model[('my', 'lord')]

In [None]:
#useful functions
def unigram_probability(word):
    """Return the relative frequency of `word` in the unigram table.

    Delegates to `unigram_model.freq`, i.e. the count of `word` as a fraction
    of all counted tokens.
    """
    relative_frequency = unigram_model.freq(word)
    return relative_frequency

def bigram_probability(prev_word, word):
    """Estimate P(word | prev_word) as count(prev_word, word) / count(prev_word).

    Returns 0 when `prev_word` is the '<EOS>' marker or is not present in the
    unigram table, so there is never a division by a zero count.
    """
    usable_context = prev_word != '<EOS>' and prev_word in unigram_model
    if not usable_context:
        return 0
    pair_count = bigram_model[(prev_word, word)]
    context_count = unigram_model[prev_word]
    return pair_count / context_count

def trigram_probability(prev_word1, prev_word2, word):
    """Estimate P(word | prev_word1, prev_word2) by relative frequency.

    The estimate is count(prev_word1, prev_word2, word) divided by
    count(prev_word1, prev_word2). Returns 0 when `prev_word2` is the '<EOS>'
    marker or the context pair is not present in the bigram table.
    """
    context = (prev_word1, prev_word2)
    usable_context = prev_word2 != '<EOS>' and context in bigram_model
    if not usable_context:
        return 0
    triple_count = trigram_model[(prev_word1, prev_word2, word)]
    context_count = bigram_model[context]
    return triple_count / context_count

def string_probability(string, tokenizer): # with the bigrams
    """Score a string as the product of the bigram probabilities of its adjacent tokens.

    The string is lowercased before tokenizing because the n-gram tables were
    built from the lowercased corpus; otherwise any capitalised word is out of
    vocabulary and drives the whole product to 0.

    Args:
        string: Text to score.
        tokenizer: Object whose `tokenize(str)` method returns the list of
            tokens (the notebook passes its WhitespaceTokenizer).

    Returns:
        The product of `bigram_probability` over consecutive token pairs.
        A string with fewer than two tokens has no pairs and yields 1.0.

    Note:
        No '<EOS>' markers are inserted here. In the training tokens every
        token ending in '.' is followed by '<EOS>', so a string that continues
        after such a token scores 0.
    """
    tokens = tokenizer.tokenize(string.lower())
    prob = 1.0
    for prev_word, word in zip(tokens, tokens[1:]):
        prob *= bigram_probability(prev_word, word)
    return prob

def generate_text(starting_word, length=5): # with the bigrams
    """Greedily generate text by repeatedly appending the most probable next word.

    Args:
        starting_word: First word of the output; it is emitted as-is.
        length: Maximum number of words in the output, including
            `starting_word`.

    Returns:
        The generated words joined by single spaces. The result is shorter
        than `length` when generation reaches a dead end: a current word for
        which no vocabulary word has a positive bigram probability (for
        example '<EOS>' or a word absent from the corpus).
    """
    generated_text = [starting_word]
    current_word = starting_word

    for _ in range(length - 1):
        # Single pass over the vocabulary. The strict '>' keeps the first
        # best-scoring word in iteration order (the same tie-breaking as max())
        # and leaves next_word as None when every candidate scores 0.
        best_probability = 0
        next_word = None
        for candidate in unigram_model:
            probability = bigram_probability(current_word, candidate)
            if probability > best_probability:
                best_probability = probability
                next_word = candidate

        # Dead end: stop instead of appending an arbitrary zero-probability word.
        if next_word is None:
            break

        generated_text.append(next_word)
        current_word = next_word

    return ' '.join(generated_text)

In [None]:
# Examples: spot-check each estimator on 'lord', 'my lord' and 'good my lord'.
print("Unigram Probability of 'lord':", unigram_probability('lord'))
print("Bigram Probability of 'my lord':", bigram_probability('my', 'lord'))
print("Trigram Probability of 'good my lord':", trigram_probability('good', 'my', 'lord'))

In [None]:
# Score the same sentence with and without a trailing full stop; the final
# token differs ("king" vs "king."), so the two products can differ.
for input_string in ("you are the king", "you are the king."):
    print("Bigram Probability of '{}': {}".format(input_string, string_probability(input_string, tokenizer)))

In [None]:
# Greedy bigram generation starting from 'you', with length set to 6.
print("Generated Text:", generate_text('you', length=6))