# N-Grams

In [6]:
# import kenlm
import re
import os
import string
import util
import itertools
from helpers import product

First, let's consider a *unigram* model, where the probability of seeing each word is independent of the words before and after it.
$$P(W_{1:n}) = \prod_{k=1}^{n} P(W_k)$$

In [7]:
# Load the raw unigram count lines (one "word<TAB>count" entry per line).
with open('count_unigrams.txt', 'r') as counts_file:
    unigram_counts = list(counts_file)

In [8]:
def parse_word_count(string):
    r'''
    Split one line of the unigram count file into its word and its integer count.

    The word is everything before the first tab; the count is the text between
    the first tab and the following tab or newline.
    ex) parse_word_count('the\t23135851162\n') = ('the', 23135851162)
    '''
    fields = string.split("\t")
    word = fields[0]
    count_text = fields[1].split("\n")[0]
    return (word, int(count_text))

In [9]:
# A dictionary with (word: count) pairs for 1/3 million most frequent English words.
# The values are raw counts, not probabilities; they are normalized later.
# Each line is parsed exactly once; if a word appears twice, the later line wins.
unigram_dict = dict(parse_word_count(line) for line in unigram_counts)

Now we need to create a probability distribution dictionary `unigram_probs` from `unigram_dict`. But we will examine many, many "words" that are not truly words, and our `unigram_probs` dictionary must return a very small value in that case. (We do not want to simply return 0, because Jane Austen may use her own words — made-up words, proper nouns, etc. — that are not in our data set.) Instead, following Peter Norvig ("Natural Language Corpus Data", in *Beautiful Data*, 2009), we'll create a class that returns a probability given a string.

In [73]:
class Unigram_Prob(dict):
    '''
    A dictionary of word counts that can also be called like a function to
    get the probability of a word.

    Parameters
        data: A mapping of (word: count) pairs.
        N: The total number of tokens; every count is divided by it.
        fn: A function fn(word, N) that supplies the probability of any word
            that is not a key of data.
    '''
    def __init__(self, data, N, fn):
        # items() exists on both Python 2 and Python 3 dictionaries, whereas
        # iteritems() is Python 2 only and raises AttributeError on Python 3.
        for key, count in data.items():
            self[key] = count
        # Stored as a float so count/N is never truncated by integer division.
        self.N = float(N)
        self.fn = fn

    def __call__(self, key):
        '''
        Return count/N for a known key, otherwise whatever fn(key, N) returns.
        '''
        if key in self:
            return self[key] / self.N
        return self.fn(key, self.N)


def avoid_long_words(word, N):
    '''
    This function returns a probability for unrecognized words.
    It makes it more unlikely for long unrecognized words to be used than short unrecognized words.
    The probability is 10/(N * 10^len(word)), so it shrinks exponentially with
    the length of the word: every extra letter makes it ten times less likely.

    Parameters
        word: The unrecognized word.
        N: The number of tokens in the corpus.
    Output
        The probability as a float. If 10^len(word) is too large to be
        converted to a float, 0.0 is returned instead of raising OverflowError.
    '''
    try:
        return 10./(N * 10**len(word))
    except OverflowError:
        # The integer power no longer fits in a float; the true value is far
        # below the smallest representable float, so report it as zero.
        return 0.0

# Number of tokens in corpus; the denominator for every unigram probability.
N = 1024908267229
# Pw(word) gives the unigram probability of word; words missing from
# unigram_dict are scored by avoid_long_words.
Pw = Unigram_Prob(data=unigram_dict, N=N, fn=avoid_long_words)

In [74]:
def memoize(function):
    '''
    This memoizing function caches the results of previous calls to the wrapped
    function so that each result doesn't have to be recomputed.

    ex) segment("helloworld") doesn't have to compute segment("lloworld") except for once.

    The memoizing function helps segment only call itself n times, rather than 2^n times.

    Parameters
        function: A function of one hashable argument.
    Output
        A wrapper with the same name and docstring as function that calls
        function at most once per distinct argument. The cached object itself
        is returned on later calls, so callers should not mutate it.
    '''
    # Imported here so the decorator does not rely on a file-level import.
    from functools import wraps

    memo = {}

    @wraps(function)
    def helper(x):
        # Test membership on the dict itself: memo.keys() builds a list on
        # Python 2, which turns every cache lookup into a linear scan.
        if x not in memo:
            memo[x] = function(x)
        return memo[x]
    return helper

In [75]:
@memoize
def segment(corpus):
    '''
    Parameters
        corpus: The flattened text which we need to segment.
    Output
        A list of words, separated according to our probability model.
        An empty corpus gives an empty list.

    Every split of the corpus into a first word and a remainder is tried; the
    remainder is segmented recursively and the candidate that Pwords scores
    highest is kept.

    Ex) segment('helloworld') = ['hello','world']
    '''
    if corpus:
        options = []
        for head, tail in splits(corpus):
            options.append([head] + segment(tail))
        return max(options, key=Pwords)
    return []

def splits(text, L=20):
    '''
    Return a list of every (first, remaining) split of the text in which first
    is non-empty and at most L characters long, shortest first word first.
    '''
    longest = min(len(text), L)
    return [(text[:cut], text[cut:]) for cut in range(1, longest + 1)]

def Pwords(words):
    '''
    The Naive Bayes probability of a sequence of words: the unigram probability
    of each word is looked up independently and the list of them is handed to
    product.
    '''
    word_probs = []
    for w in words:
        word_probs.append(uProb(w))
    return product(word_probs)

def uProb(word):
    '''
    Returns the unigram probability of a word by consulting unigram data.

    Words that are not in the unigram data get whatever fallback probability
    Pw was configured with.
    '''
    # Pw is callable, so invoke it directly instead of naming the __call__ dunder.
    return Pw(word)
    
    

In [76]:
# Sample input: a phrase written without spaces, used to try out segment().
decl = 'wheninthecourseofhumanevents'

In [77]:
# Split the sample phrase back into words; the notebook output is shown below.
segment(decl)

['when', 'in', 'the', 'course', 'of', 'human', 'events']