# Imports

In [1]:
import time
import json
import math
import pickle
import pathlib
import itertools
import functools
import collections
import unicodedata

In [2]:
import nltk

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

# Constants

In [4]:
# Project root; "~" is expanded to the current user's home directory.
PROJECT_DIR = pathlib.Path('~/project').expanduser()
# Data directory inside the project.
DATA_DIR = PROJECT_DIR / 'data'
# Directory passed as ``root`` to the corpus readers in this notebook.
CORPUS_ROOT = DATA_DIR / 'sample'

# PickledCorpusReader

In [5]:
# Relative corpus paths look like "<category>/<hex id>.<ext>": a category
# directory made of lowercase letters, underscores or whitespace, followed by
# a lowercase-hex file name. The leading (?!\.) rejects paths starting with ".".
DOC_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.json'
PKL_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.pickle'
# Captures the category directory (everything before the first "/").
CAT_PATTERN = r'([a-z_\s]+)/.*'

In [6]:
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader

class PickledCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Corpus reader for documents stored as pickle files.

    Each pickled document is expected to be a list of paragraphs, each
    paragraph a list of sentences, and each sentence a list of
    (token, tag) tuples.
    """

    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor; ``root`` and
        ``fileids`` are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs):
            kwargs['cat_pattern'] = CAT_PATTERN

        # CategorizedCorpusReader takes the keyword dictionary itself
        # (not **kwargs) as its single argument.
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

    @staticmethod
    def _ratio(numerator, denominator):
        """
        Divide two counts, returning 0.0 when the denominator is zero so
        that describing an empty selection does not raise.
        """
        return numerator / denominator if denominator else 0.0

    def _resolve(self, fileids, categories):
        """
        Returns a list of fileids or categories depending on what is passed
        to each internal corpus reader function. This primarily bubbles up to
        the high level ``docs`` method, but is implemented here similar to
        the nltk ``CategorizedPlaintextCorpusReader``.

        Raises ``ValueError`` if both fileids and categories are given.
        """
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")

        if categories is not None:
            return self.fileids(categories)
        return fileids

    def feeds(self):
        """
        Returns the parsed contents of ``feeds.json`` from the corpus root.
        """
        data = self.open('feeds.json')
        try:
            return json.load(data)
        finally:
            # Always release the underlying stream, even if parsing fails.
            data.close()

    def docs(self, fileids=None, categories=None):
        """
        Returns the document loaded from a pickled object for every file in
        the corpus. Similar to the BaleenCorpusReader, this uses a generator
        to achieve memory safe iteration.
        """
        # Resolve the fileids and the categories
        fileids = self._resolve(fileids, categories)

        # Create a generator, loading one document into memory at a time.
        # NOTE: pickle.load can execute arbitrary code; only read corpora
        # that come from a trusted source.
        for path, enc, fileid in self.abspaths(
            fileids, include_encoding=True, include_fileid=True
        ):
            with open(path, 'rb') as f:
                yield pickle.load(f)

    def tagged_paras(self, fileids=None, categories=None):
        """
        Returns a generator of paragraphs where each paragraph is a list of
        sentences, which is in turn a list of (token, tag) tuples.
        """
        for doc in self.docs(fileids, categories):
            yield from doc

    def paras(self, fileids=None, categories=None):
        """
        Returns a generator of paragraphs where each paragraph is a list of
        sentences, which is in turn a list of tokens.
        """
        for tagged_para in self.tagged_paras(fileids, categories):
            yield [[word for word, tag in tagged_sent]
                   for tagged_sent in tagged_para]

    def tagged_sents(self, fileids=None, categories=None):
        """
        Returns a generator of sentences where each sentence is a list of
        (token, tag) tuples.
        """
        for tagged_para in self.tagged_paras(fileids, categories):
            yield from tagged_para

    def sents(self, fileids=None, categories=None):
        """
        Returns a generator of sentences where each sentence is a list of
        tokens.
        """
        for tagged_sent in self.tagged_sents(fileids, categories):
            yield [word for word, tag in tagged_sent]

    def tagged_words(self, fileids=None, categories=None):
        """
        Returns a generator of (token, tag) tuples.
        """
        for sent in self.tagged_sents(fileids, categories):
            for token, tag in sent:
                yield token, tag

    def words(self, fileids=None, categories=None):
        """
        Returns a generator of tokens.
        """
        for word, tag in self.tagged_words(fileids, categories):
            yield word

    def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and
        returns a dictionary with a variety of metrics
        concerning the state of the corpus.

        The ratio metrics (``lexdiv``, ``ppdoc``, ``sppar``) are 0.0 when
        their denominator is zero, e.g. for an empty selection.
        """
        started = time.time()

        # Structures to perform counting.
        counts = nltk.FreqDist()
        tokens = nltk.FreqDist()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in para:
                counts['sents'] += 1

                # ``paras`` yields untagged sentences, so every item is a
                # plain token rather than a (token, tag) pair.
                for word in sent:
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        resolved = self._resolve(fileids, categories)
        n_fileids = len(resolved or self.fileids())
        n_topics = len(self.categories(resolved))

        # Return data structure with information
        return {
            'files':  n_fileids,
            'topics': n_topics,
            'paras':  counts['paras'],
            'sents':  counts['sents'],
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': self._ratio(counts['words'], len(tokens)),
            'ppdoc':  self._ratio(counts['paras'], n_fileids),
            'sppar':  self._ratio(counts['sents'], counts['paras']),
            'secs':   time.time() - started,
        }

In [7]:
# Count every token in the corpus once; the Counter doubles as the vocabulary.
corpus = PickledCorpusReader(root=CORPUS_ROOT.as_posix())
words = collections.Counter(corpus.words())

# Distinct tokens vs. total token occurrences.
print(f"{len(words):,} vocabulary {sum(words.values()):,} word count")

58,748 vocabulary 1,624,862 word count


# Grammar-Based Feature Extraction

## Context-Free Grammars

In [8]:
GRAMMAR = """
    S -> NNP VP
    VP -> V PP
    PP -> P NP
    NP -> DT N
    NNP -> 'Gwen' | 'George'
    V -> 'looks' | 'burns'
    P -> 'in' | 'for'
    DT -> 'the'
    N -> 'castle' | 'ocean'
"""

In [9]:
cfg = nltk.CFG.fromstring(GRAMMAR)

In [10]:
print(cfg)

Grammar with 13 productions (start state = S)
    S -> NNP VP
    VP -> V PP
    PP -> P NP
    NP -> DT N
    NNP -> 'Gwen'
    NNP -> 'George'
    V -> 'looks'
    V -> 'burns'
    P -> 'in'
    P -> 'for'
    DT -> 'the'
    N -> 'castle'
    N -> 'ocean'


In [11]:
cfg.start()

S

In [12]:
cfg.productions()

[S -> NNP VP,
 VP -> V PP,
 PP -> P NP,
 NP -> DT N,
 NNP -> 'Gwen',
 NNP -> 'George',
 V -> 'looks',
 V -> 'burns',
 P -> 'in',
 P -> 'for',
 DT -> 'the',
 N -> 'castle',
 N -> 'ocean']

## Syntactic Parsers

In [13]:
from nltk.chunk.regexp import RegexpParser

In [14]:
GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
chunker = RegexpParser(GRAMMAR)

In [15]:
sent = """
    Dusty Baker proposed a simple solution to the Washington Nationals early-season bullpen 
    troubles Monday afternoon and it had nothing to do with his maligned group of relievers.
"""
tokens = nltk.word_tokenize(sent)
tagged = nltk.pos_tag(tokens)
chunked = chunker.parse(tagged)

In [16]:
print(chunked)

(S
  (KT Dusty/NNP Baker/NNP)
  proposed/VBD
  a/DT
  (KT simple/JJ solution/NN)
  to/TO
  the/DT
  (KT Washington/NNP Nationals/NNP)
  (KT
    early-season/JJ
    bullpen/NN
    troubles/NNS
    Monday/NNP
    afternoon/NN)
  and/CC
  it/PRP
  had/VBD
  (KT nothing/NN)
  to/TO
  do/VB
  with/IN
  his/PRP$
  maligned/VBN
  (KT group/NN of/IN relievers/NNS)
  ./.)


## Extracting Keyphrases

In [17]:
GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
GOODTAGS = frozenset(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])

In [18]:
class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Extracts keyphrases from pos-tagged documents, such as those produced
    by a PickledCorpusReader, by chunking each sentence with a regular
    expression grammar.
    """
    def __init__(self, grammar=GRAMMAR):
        """
        grammar is the chunk grammar handed to ``nltk.RegexpParser``.
        """
        # Honour the caller's grammar rather than the module-level default.
        self.grammar = grammar
        self.chunker = nltk.RegexpParser(self.grammar)

    def normalize(self, sentence):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.

        A token is dropped only when every one of its characters is in a
        Unicode punctuation category.
        """
        return [(token.lower(), tag)
                for (token, tag) in sentence
                if not all(unicodedata.category(char).startswith('P')
                           for char in token)]

    def extract_keyphrases(self, document):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Yields extracted phrases.

        Consecutive tokens whose chunk tag is not 'O' are joined with a
        single space into one phrase.
        """
        for paragraph in document:
            for sentence in paragraph:
                sentence = self.normalize(sentence)
                # Nothing left to chunk once punctuation is removed.
                if not sentence:
                    continue
                chunks = nltk.tree2conlltags(
                    self.chunker.parse(sentence)
                )
                # groupby splits the sentence into runs inside (key True)
                # and outside (key False) of a chunk; keep the inside runs.
                for inside_chunk, group in itertools.groupby(
                    chunks, lambda term: term[-1] != 'O'
                ):
                    if inside_chunk:
                        yield ' '.join(word for word, pos, chunk in group)

    def fit(self, documents, y=None):
        """
        No fitting is required; returns self.
        """
        return self

    def transform(self, documents):
        """
        Yields, for each document, the list of its extracted keyphrases.
        """
        for document in documents:
            yield list(self.extract_keyphrases(document))

In [19]:
%%time
corpus = PickledCorpusReader(root=CORPUS_ROOT.as_posix())
docs = corpus.docs()

keyphrase_extractor = KeyphraseExtractor()
keyphrases = list(keyphrase_extractor.fit_transform(docs))
print(keyphrases[0])

['lonely city', 'heart piercing wisdom', 'loneliness', 'laing', 'everyone', 'feast later', 'point', 'own hermetic existence in new york', 'danger', 'thankfully', 'lonely city', 'cry for connection', 'overcrowded overstimulated world', 'blueprint of urban loneliness', 'emotion', 'calls', 'city', 'npr jason heller', 'olivia laing', 'lonely city', 'exploration of loneliness', 'others experiences in new york city', 'rumpus', 'review', 'lonely city', 'related posts']
CPU times: user 4.43 s, sys: 333 ms, total: 4.77 s
Wall time: 7.33 s


## Extracting Entities

In [20]:
GOODLABELS = frozenset(['PERSON', 'ORGANIZATION', 'FACILITY', 'GPE', 'GSP'])

In [21]:
class EntityExtractor(BaseEstimator, TransformerMixin):
    """
    Extracts named entities from pos-tagged documents using
    ``nltk.ne_chunk``, keeping only entities whose label is in ``labels``.
    """
    def __init__(self, labels=GOODLABELS, **kwargs):
        self.labels = labels

    def get_entities(self, document):
        """
        Returns the lowercased, space-joined entities found in a document.
        """
        entities = []
        for paragraph in document:
            for sentence in paragraph:
                for tree in nltk.ne_chunk(sentence):
                    # Items without a label are not entity subtrees.
                    if not hasattr(tree, 'label'):
                        continue
                    if tree.label() not in self.labels:
                        continue
                    entities.append(
                        ' '.join(word.lower() for word, tag in tree)
                    )
        return entities

    def fit(self, documents, labels=None):
        """
        No fitting is required; returns self.
        """
        return self

    def transform(self, documents):
        """
        Yields the list of entities for each document.
        """
        yield from map(self.get_entities, documents)

In [22]:
%%time
corpus = PickledCorpusReader(root=CORPUS_ROOT.as_posix())
docs = corpus.docs()

entity_extractor = EntityExtractor()
entities = list(entity_extractor.fit_transform(docs))
print(entities[0])

['lonely city', 'loneliness', 'laing', 'new york', 'lonely city', 'npr', 'jason heller', 'olivia laing', 'lonely city', 'new york city', 'rumpus', 'lonely city', 'related']
CPU times: user 2min 22s, sys: 451 ms, total: 2min 23s
Wall time: 2min 27s


# n-Gram Feature Extraction

In [23]:
def ngrams(words, n=2):
    """
    Yields every run of n consecutive items of a sequence as a tuple.

    words must support ``len`` and slicing; nothing is yielded when the
    sequence is shorter than n.
    """
    window_count = len(words) - n + 1
    yield from (
        tuple(words[start:start + n]) for start in range(window_count)
    )

In [24]:
words = [
    "The", "reporters", "listened", "closely", "as", "the", "President",
    "of", "the", "United", "States", "addressed", "the", "room", ".",
]

In [25]:
for ngram in ngrams(words, n=3):
    print(ngram)

('The', 'reporters', 'listened')
('reporters', 'listened', 'closely')
('listened', 'closely', 'as')
('closely', 'as', 'the')
('as', 'the', 'President')
('the', 'President', 'of')
('President', 'of', 'the')
('of', 'the', 'United')
('the', 'United', 'States')
('United', 'States', 'addressed')
('States', 'addressed', 'the')
('addressed', 'the', 'room')
('the', 'room', '.')


## An n-Gram-Aware CorpusReader

In [26]:
LPAD_SYMBOL = '<s>'
RPAD_SYMBOL = '</s>'

In [27]:
# Variant of nltk.ngrams with padding on both sides pre-configured, using
# LPAD_SYMBOL on the left and RPAD_SYMBOL on the right.
nltk_ngrams = functools.partial(
    nltk.ngrams,
    pad_left=True, left_pad_symbol=LPAD_SYMBOL,
    pad_right=True, right_pad_symbol=RPAD_SYMBOL
)

In [28]:
class NgramPickledCorpusReader(PickledCorpusReader):
    """
    PickledCorpusReader that can also iterate over padded n-grams,
    computed one sentence at a time.
    """

    def tagged_ngrams(self, n=2, fileids=None, categories=None):
        """
        Yields padded n-grams of (token, tag) tuples for every sentence.
        """
        for tagged_sent in self.tagged_sents(fileids, categories):
            yield from nltk_ngrams(tagged_sent, n)

    def ngrams(self, n=2, fileids=None, categories=None):
        """
        Yields padded n-grams of tokens for every sentence.
        """
        for token_sent in self.sents(fileids, categories):
            yield from nltk_ngrams(token_sent, n)

In [29]:
%%time
corpus = NgramPickledCorpusReader(root=CORPUS_ROOT.as_posix())
# A distinct name keeps the module-level ``ngrams`` function usable.
quadgrams = corpus.ngrams(n=4)

# islice stops after 20 items, so the lazy reader never has to walk the
# whole corpus just to preview its first n-grams.
for ngram in itertools.islice(quadgrams, 20):
    print(ngram)

('<s>', '<s>', '<s>', 'The')
('<s>', '<s>', 'The', 'Lonely')
('<s>', 'The', 'Lonely', 'City')
('The', 'Lonely', 'City', 'bristles')
('Lonely', 'City', 'bristles', 'with')
('City', 'bristles', 'with', 'heart')
('bristles', 'with', 'heart', '-')
('with', 'heart', '-', 'piercing')
('heart', '-', 'piercing', 'wisdom')
('-', 'piercing', 'wisdom', '.')
('piercing', 'wisdom', '.', '</s>')
('wisdom', '.', '</s>', '</s>')
('.', '</s>', '</s>', '</s>')
('<s>', '<s>', '<s>', 'Loneliness')
('<s>', '<s>', 'Loneliness', ',')
('<s>', 'Loneliness', ',', 'according')
('Loneliness', ',', 'according', 'to')
(',', 'according', 'to', 'Laing')
('according', 'to', 'Laing', ',')
('to', 'Laing', ',', 'feels')
CPU times: user 993 ms, sys: 535 ms, total: 1.53 s
Wall time: 2min 21s


## Choosing the Right n-Gram Window

## Significant Collocations

In [30]:
import pandas as pd

In [31]:
from nltk.collocations import QuadgramCollocationFinder
from nltk.metrics.association import QuadgramAssocMeasures

In [32]:
def rank_quadgrams(words, metric):
    """
    Score every quadgram found in words with the given association metric.

    Returns a DataFrame with a 'collocation' column and a score column
    named after the metric function.
    """
    # Collocation finder built from the stream of corpus words.
    finder = QuadgramCollocationFinder.from_words(words)

    # (quadgram, score) pairs produced by the association metric.
    scored = finder.score_ngrams(metric)
    score_column = f'score ({metric.__name__})'

    return pd.DataFrame(data=scored, columns=['collocation', score_column])

In [33]:
%%time
corpus = PickledCorpusReader(root=CORPUS_ROOT.as_posix())

scored_df = rank_quadgrams(
    words=corpus.words(),
    metric=QuadgramAssocMeasures.likelihood_ratio
)

CPU times: user 3min 1s, sys: 2.09 s, total: 3min 3s
Wall time: 3min 6s


In [34]:
%%time
# Group quadgrams by first word
scored_df['first_word'] = scored_df['collocation'].str[0]

# Sort keyed quadgrams by strongest association
scored_df.sort_values(
    by=['first_word', 'score (likelihood_ratio)'],
    ascending=False,
    inplace=True
)

CPU times: user 1.6 s, sys: 29.1 ms, total: 1.62 s
Wall time: 1.63 s


In [35]:
class SignificantCollocations(BaseEstimator, TransformerMixin):
    """
    Transformer that scores the n-grams of the training documents with an
    association metric and represents each document by the scores of its
    most frequent n-grams.
    """

    def __init__(self,
                 ngram_class=QuadgramCollocationFinder,
                 metric=QuadgramAssocMeasures.pmi):
        """
        ngram_class is the collocation finder class used to collect
        n-grams; metric is the association measure used to score them.
        """
        self.ngram_class = ngram_class
        self.metric = metric

    def fit(self, docs, target=None):
        """
        Scores all n-grams found in docs and stores them in ``scored_``.

        target is accepted for pipeline compatibility and is not used.
        Returns self so that ``fit_transform`` and pipelines can chain.
        """
        finder = self.ngram_class.from_documents(docs)
        self.scored_ = dict(finder.score_ngrams(self.metric))
        return self

    def transform(self, docs):
        """
        Yields one dictionary per document mapping each of its (up to) 50
        most frequent n-grams to the score learned in ``fit``; n-grams not
        seen during fitting score 0.0.
        """
        for doc in docs:
            finder = self.ngram_class.from_words(doc)
            yield {
                ngram: self.scored_.get(ngram, 0.0)
                for ngram in finder.nbest(QuadgramAssocMeasures.raw_freq, 50)
            }

In [36]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
# Classification pipeline: collocation scores and TF-IDF features are
# computed side by side, concatenated by the FeatureUnion, and fed to a
# linear classifier trained with SGD.
model = Pipeline([
    ('union', FeatureUnion([
        # Collocation score dictionaries turned into a feature matrix.
        ('ngrams', Pipeline([
            ('sigcol', SignificantCollocations()),
            ('dsigcol', DictVectorizer()),
        ])),
        ('tfidf', TfidfVectorizer()),
    ])),
    ('clf', SGDClassifier()),
])

# n-Gram Language Models

## Frequency and Conditional Frequency

In [8]:
# Token substituted for words that are not in the vocabulary.
UNKNOWN = '<UNK>'
# Symbols used to pad the left and right side of a sequence for n-grams.
LPAD = '<s>'
RPAD = '</s>'

In [9]:
class NgramCounter(object):
    """
    The NgramCounter class counts ngrams given a vocabulary and ngram size.

    After training it exposes:
      * ``ngrams``   - frequency of every full-order n-gram,
      * ``unigrams`` - frequency of every single token (padding included),
      * ``allgrams`` - for each order from n down to 2, a conditional
        frequency distribution of words given their preceding context.
    """

    def __init__(self, n, vocabulary, unknown=UNKNOWN):
        """
        n is the size of the ngram
        vocabulary is a container of known words (tested with ``in``)
        unknown is the token substituted for out-of-vocabulary words

        Raises ValueError if n is smaller than 1.
        """
        if n < 1:
            raise ValueError('ngram size must be greater than or equal to 1')

        self.n = n
        self.unknown = unknown
        self.padding = {
            'pad_left': True,
            'pad_right': True,
            'left_pad_symbol': LPAD,
            'right_pad_symbol': RPAD
        }

        self.vocabulary = vocabulary
        self.allgrams = collections.defaultdict(nltk.ConditionalFreqDist)
        self.ngrams = nltk.FreqDist()
        self.unigrams = nltk.FreqDist()

    def train_counts(self, training_text):
        """
        Updates the counts from training_text, an iterable of sentences
        where each sentence is an iterable of words.
        """
        for sent in training_text:
            checked_sent = map(self.check_against_vocab, sent)
            sent_start = True
            for ngram in self.to_ngrams(checked_sent):
                # ngrams
                self.ngrams[ngram] += 1
                # unigrams
                context, word = tuple(ngram[:-1]), ngram[-1]
                # The context words of the first n-gram of a sentence are
                # never the final word of any n-gram, so count them here.
                if sent_start:
                    for context_word in context:
                        self.unigrams[context_word] += 1
                    sent_start = False
                self.unigrams[word] += 1
                # allgrams: order n uses the full context and each lower
                # order drops one more leading word. Slice the full context
                # every time; re-slicing an already shortened context would
                # drop too many words for n >= 4.
                for window, ngram_order in enumerate(range(self.n, 1, -1)):
                    self.allgrams[ngram_order][context[window:]][word] += 1

    def check_against_vocab(self, word):
        """
        Returns word if it is in the vocabulary, else the unknown token.
        """
        return word if word in self.vocabulary else self.unknown

    def to_ngrams(self, sequence):
        """
        Wrapper for NLTK ngrams method
        """
        return nltk.ngrams(sequence, self.n, **self.padding)

In [10]:
def count_ngrams(n, vocabulary, sentences):
    """
    Build an NgramCounter of order n for the vocabulary, train it on the
    given sentences and return it.
    """
    trained = NgramCounter(n, vocabulary)
    trained.train_counts(sentences)
    return trained

In [11]:
corpus = PickledCorpusReader(root=CORPUS_ROOT.as_posix())
vocabulary = collections.Counter(corpus.words())
sentences = list(corpus.sents())

In [12]:
%%time
counter = count_ngrams(3, vocabulary, sentences)

CPU times: user 7.06 s, sys: 123 ms, total: 7.18 s
Wall time: 7.18 s


In [13]:
counter.unigrams

FreqDist({'<s>': 149798, '</s>': 149798, ',': 68835, 'the': 65829, '.': 65400, 'to': 36590, 'of': 32149, 'and': 32017, 'a': 31716, 'in': 25062, ...})

In [14]:
counter.ngrams

FreqDist({('.', '</s>', '</s>'): 56288, ('<s>', '<s>', 'The'): 7441, ('<s>', '<s>', '"'): 3643, ('<s>', '<s>', '“'): 2699, ('."', '</s>', '</s>'): 2304, ('<s>', '<s>', 'It'): 2093, ('<s>', '<s>', 'But'): 1980, ('?', '</s>', '</s>'): 1951, ('<s>', '<s>', 'In'): 1936, ('said', '.', '</s>'): 1816, ...})

In [15]:
counter.allgrams[3]

<ConditionalFreqDist with 574320 conditions>

In [16]:
sorted(counter.allgrams[3].conditions())[:10]

[('!', '</s>'),
 ('!', 'For'),
 ('!', 'Kaley'),
 ('!', 'Lovebirds'),
 ('!', 'S'),
 ('!', 'Will'),
 ('!', 'duan'),
 ('!', 'wuick'),
 ('!', 'Â'),
 ('!!', '</s>')]

In [17]:
counter.allgrams[3][('the', 'President')]

FreqDist({"'": 3, 'Source': 1, 'and': 1, 'nominates': 1, 'in': 1, 'as': 1, 'said': 1, 'is': 1, 'who': 1, 'that': 1, ...})

## Estimating Maximum Likelihood

In [18]:
class BaseNgramModel(object):
    """
    N-gram language model backed by an NgramCounter.
    Scores are plain relative frequencies, i.e. a Maximum Likelihood
    Estimation.
    """

    def __init__(self, ngram_counter):
        """
        ngram_counter is the trained NgramCounter to read counts from.
        """
        self.ngram_counter = ngram_counter

    def check_context(self, context):
        """
        Validates that the context holds fewer words than the model's
        n-gram order and returns it as a tuple.

        Raises ValueError when the context is too long.
        """
        order = self.ngram_counter.n
        if len(context) >= order:
            raise ValueError("Context too long for this n-gram")

        return tuple(context)

    def score(self, word, context):
        """
        Returns the relative frequency with which word follows context
        in the counts of the model's full n-gram order.
        """
        checked = self.check_context(context)
        counter = self.ngram_counter
        distribution = counter.allgrams[counter.n][checked]
        return distribution.freq(word)

    def logscore(self, word, context):
        """
        Returns the base-2 logarithm of ``score(word, context)``, or
        negative infinity when the score is zero.
        """
        probability = self.score(word, context)
        if probability != 0.0:
            return math.log(probability, 2)

        return float("-inf")

    def entropy(self, text):
        """
        Calculate the approximate cross-entropy of the n-gram model for a
        given text, represented as a list of word strings.
        This is the negated average log score over the padded n-grams of
        the text.
        """
        counter = self.ngram_counter
        normed_text = map(counter.check_against_vocab, text)
        total_logscore = 0
        ngram_count = 0
        for *context, word in counter.to_ngrams(normed_text):
            total_logscore += self.logscore(word, tuple(context))
            ngram_count += 1
        return -total_logscore / ngram_count

    def perplexity(self, text):
        """
        Returns 2 raised to the entropy of the text, given as a list of
        word strings.
        """
        text_entropy = self.entropy(text)
        return 2 ** text_entropy

In [19]:
%%time
trigram_model = BaseNgramModel(count_ngrams(3, vocabulary, sentences))

CPU times: user 7.18 s, sys: 145 ms, total: 7.33 s
Wall time: 7.33 s


In [20]:
trigram_model.score('I', ('<s>', '<s>'))

0.022697232272794028

In [21]:
trigram_model.logscore('I', ('<s>', '<s>'))

-5.461339805463796

In [22]:
trigram_model.entropy(sentences[0])

1.8289621948876942

In [23]:
trigram_model.perplexity(sentences[0])

3.552814082408281

In [24]:
%%time
fivegram_model = BaseNgramModel(count_ngrams(5, vocabulary, sentences))

CPU times: user 13.2 s, sys: 471 ms, total: 13.7 s
Wall time: 13.7 s


In [25]:
trigram_model.perplexity(sentences[0])

3.552814082408281

In [26]:
fivegram_model.perplexity(sentences[0])

2.229342823734018

## Unknown Words: Back-off and Smoothing

In [27]:
class AddKNgramModel(BaseNgramModel):
    """
    N-gram model whose scores are smoothed by adding a constant k to
    every count.
    """

    def __init__(self, ngram_counter, k):
        """
        k is the amount added to each word count during scoring. The
        matching normalizer, vocabulary size times k, is precomputed.
        """
        super().__init__(ngram_counter)
        vocabulary_size = len(self.ngram_counter.vocabulary)
        self.k = k
        self.k_norm = vocabulary_size * k

    def score(self, word, context):
        """
        Returns (count of word after context + k) divided by
        (total count after context + k_norm).
        """
        checked = self.check_context(context)
        counter = self.ngram_counter
        distribution = counter.allgrams[counter.n][checked]
        numerator = distribution[word] + self.k
        denominator = distribution.N() + self.k_norm
        return numerator / denominator

In [28]:
class LaplaceNgramModel(AddKNgramModel):
    """
    Laplace (add-one) smoothed n-gram model: Add-k smoothing with k
    fixed at 1.
    """
    def __init__(self, ngram_counter):
        # Delegate to Add-k smoothing with the constant fixed at one.
        super().__init__(ngram_counter, 1)

In [29]:
%%time
add_one_trigram_model = AddKNgramModel(count_ngrams(3, vocabulary, sentences), k=1)

CPU times: user 7.84 s, sys: 138 ms, total: 7.98 s
Wall time: 7.98 s


In [30]:
%%time
laplace_trigram_model = LaplaceNgramModel(count_ngrams(3, vocabulary, sentences))

CPU times: user 8.14 s, sys: 160 ms, total: 8.3 s
Wall time: 8.3 s


In [31]:
1 / len(vocabulary)

1.702185606318513e-05

In [32]:
add_one_trigram_model.score('aaa', ('bbb', 'ccc'))

1.702185606318513e-05

In [33]:
laplace_trigram_model.score('aaa', ('bbb', 'ccc'))

1.702185606318513e-05

## Language Generation