### A simple n-gram language model

In [1]:
import utils
from collections import defaultdict

In [2]:
# Shared helper object from the project-local `utils` module. The cells below
# use it to read the corpus file and to split text into sentences and words.
fu = utils.FileUtil()

In [3]:
def get_ngrams(sentence, n, pad_left=False, pad_right=False):
    """
    Split ``sentence`` into words and return every run of ``n`` consecutive
    words, in order, as a list of lists.

    With ``pad_left`` / ``pad_right`` set, ``n - 1`` ``None`` markers are
    added to that end of the word list before the windows are taken, so the
    first / last real word also shows up in every position of a window.
    """
    # NOTE(review): assumes fu.get_words returns a mutable list - confirm.
    tokens = fu.get_words(sentence)

    padding = [None] * (n - 1)
    if pad_left:
        tokens[:0] = padding
    if pad_right:
        tokens.extend(padding)

    total = len(tokens)
    windows = []
    for start in range(total):
        stop = start + n
        if stop > total:
            # Every later start would overshoot the end as well.
            break
        windows.append(tokens[start:stop])

    return windows

In [4]:
def build_trigram_model(model, sentences):
    """
    Fill ``model`` with trigram statistics from ``sentences`` and return it.

    ``model`` maps a ``(w1, w2)`` context to a mapping ``{w3: value}`` and is
    updated in place. Missing entries must default to 0 (e.g. nested
    ``defaultdict``), because counts are incremented without initialisation.
    Sentences are padded with ``None`` on both sides before counting.

    After counting, every context's values are divided by that context's
    total, so each inner mapping sums to 1.
    """
    # Pass 1: count how often each word follows each two-word context.
    for sentence in sentences:
        for first, second, third in get_ngrams(sentence, 3, pad_left=True, pad_right=True):
            model[(first, second)][third] += 1

    # Pass 2: turn the per-context counts into relative frequencies.
    for followers in model.values():
        context_total = sum(followers.values())
        for word in followers:
            followers[word] /= context_total

    return model

In [5]:
def get_next_word(w1, w2, model):
    """
    Return the most likely word to follow the context ``(w1, w2)``.

    ``model`` maps ``(w1, w2)`` to a mapping ``{next_word: score}``. The word
    with the highest score wins; on a tie, the word that comes first in the
    inner mapping's iteration order is returned.

    Returns ``None`` when the context is unknown or has no recorded
    followers. ``None`` is the same marker the n-gram padding uses for a
    sentence boundary.
    """
    # .get() instead of model[...]: indexing a defaultdict-based model with an
    # unseen context would silently insert an empty entry, and max() on that
    # empty entry would raise ValueError.
    followers = model.get((w1, w2))
    if not followers:
        return None

    return max(followers, key=followers.get)

In [6]:
def generate_sentence(model, w1=None, w2=None):
    """
    Greedily generate a sentence that starts with ``w1`` and ``w2``.

    Repeatedly asks ``get_next_word`` for the follower of the current
    two-word context and shifts the context by one word. Generation stops
    when the last two items are both ``None`` (the end-of-sentence padding),
    or when a context repeats (see the comment below).

    Returns the generated words joined by single spaces, with every ``None``
    marker removed.
    """
    words = [w1, w2]
    # The next word depends only on the current (w1, w2) pair, so running
    # into a pair that was already visited means the output would repeat
    # forever. Remember visited pairs and stop instead of looping endlessly.
    seen_contexts = {(w1, w2)}

    while True:
        word = get_next_word(w1, w2, model)
        words.append(word)
        w1, w2 = w2, word

        # Check the normal end first: the default start context is itself
        # (None, None) and must not be mistaken for a cycle.
        if (w1, w2) == (None, None):
            break
        if (w1, w2) in seen_contexts:
            break
        seen_contexts.add((w1, w2))

    return ' '.join(token for token in words if token is not None)

In [7]:
# Load the story corpus and split it into sentences; both steps are delegated
# to the FileUtil helper.
lines = fu.read('data/big_story.txt')
sentences = fu.get_sentences(lines)

In [8]:
# Train on the corpus. The nested defaultdict makes unseen (w1, w2) contexts
# and unseen followers start at 0, so counts can be incremented directly.
trigram_model = build_trigram_model(defaultdict(lambda: defaultdict(lambda: 0)), sentences)
# (None, None) is the left-padding context, i.e. generate from a sentence start.
sentence = generate_sentence(trigram_model, None, None)
# Bare expression: presumably echoes the value as the notebook cell's output.
sentence

'I have no doubt that the doctor was furnished with long windows almost to the other side'

In [13]:
# Seed the generator with an explicit two-word context instead of the
# sentence-start padding.
sentence_2 = generate_sentence(trigram_model, 'You', 'will')
sentence_2

'You will excuse my saying so somewhat to embellish so many of my own'