In [296]:
module ngram

import Iterators: partition

"""
    cleanup_corpus(data::AbstractString)

Normalise raw text into a vector of characters drawn only from `a`-`z` and the space
character: the text is lowercased, and every run of other characters (punctuation,
digits, any whitespace, non-ASCII letters) becomes a single space.

Accepts any `AbstractString` (for example a `SubString`), not just `String`.
Returns a `Vector{Char}`.
"""
function cleanup_corpus(data::AbstractString)
    # One pass is enough: a maximal run of characters outside a-z is exactly what the
    # former "replace non-letters by spaces, then squeeze whitespace" pair reduced to
    # a single space.
    normalised = replace(lowercase(data), r"[^a-z]+", " ")
    return collect(normalised)
end

"""
    count_ngrams(corpus, n::Integer=2)

Convenience entry point: build the n-gram table for `corpus` with window size `n`
(bigrams by default) by forwarding to the `Val`-dispatched method.

Throws an `ArgumentError` when `n < 1`.
"""
function count_ngrams(corpus, n::Integer=2)
    n >= 1 || throw(ArgumentError("n must be at least 1, got $n"))
    # Val{Int32(2)} and Val{2} are different types, and array ranks must be `Int`,
    # so normalise the window size before lifting it into the type domain.
    count_ngrams(corpus, Val{Int(n)})
end

"""
    tokens(corpus)

Return the distinct elements of `corpus` as a sorted vector.
"""
function tokens(corpus)
    distinct = collect(Set(corpus))
    # `distinct` is a fresh vector owned by this function, so sorting in place is safe.
    return sort!(distinct)
end
    
"""
    SlidingWindow(iter, window_size)

Iterator over every contiguous window of `window_size` elements of `iter`, advancing
one element at a time. Each item is a `view` into `iter` (no copy), so `iter` must
support `length` and `view` with a 1-based index range.

If `window_size` exceeds `length(iter)` the iterator is empty.
"""
immutable SlidingWindow{T}
    iter::T
    window_size::Int
end

# Iteration protocol (start/done/next); the state is the index of the first element
# of the window about to be produced.
Base.start(::SlidingWindow) = 1
Base.done(sw::SlidingWindow, state) = state + sw.window_size - 1 > length(sw.iter)
Base.next(sw::SlidingWindow, state) = view(sw.iter, state:(state + sw.window_size - 1)), state + 1
# Clamp at zero: without it an input shorter than the window reports a negative
# length, which breaks `collect` and anything else that pre-sizes from `length`.
Base.length(sw::SlidingWindow) = max(0, length(sw.iter) - sw.window_size + 1)

"""
    NGramFrequencies{N, T}

An `N`-dimensional table of `Float64` values addressed by tokens of type `T`.

- `frequencies`: the table itself, one axis per n-gram position.
- `tokens`: the token vocabulary.
- `token_to_index`: maps a token to its integer position along every axis.

Indexing (`freqs[a, b]`) and `view(freqs, a, b)` take tokens, or `:` to select a whole
axis, and translate them to positions in `frequencies`.
"""
immutable NGramFrequencies{N, T}
    frequencies::Array{Float64, N}
    tokens::Vector{T}
    token_to_index::Dict{T, Int}
end

# Translate one index argument: a colon is passed through unchanged, a token is
# looked up in the table's dictionary (a `KeyError` is raised for unknown tokens).
token_to_index(freq::NGramFrequencies, t::Colon) = t
function token_to_index{N, T}(freq::NGramFrequencies{N, T}, t::T)
    return freq.token_to_index[t]
end

# Token-based element access: exactly N index arguments are required.
function Base.getindex{N, T}(freq::NGramFrequencies{N, T}, I::Vararg{Any, N})
    positions = map(i -> token_to_index(freq, i), I)
    return getindex(freq.frequencies, positions...)
end

# Token-based non-copying slice of the underlying table.
function Base.view{N, T}(freq::NGramFrequencies{N, T}, I::Vararg{Any, N})
    positions = map(i -> token_to_index(freq, i), I)
    return view(freq.frequencies, positions...)
end

"""
    count_ngrams(corpus, ::Type{Val{N}})

Count every length-`N` window of `corpus` and return an `NGramFrequencies` table.

The vocabulary is `tokens(corpus)`. Counts receive add-one smoothing, and the stored
values are the natural log of each smoothed count divided by the total of all
smoothed counts.
"""
function count_ngrams{N}(corpus, n::Type{Val{N}})
    # `N` is a static parameter, so a plain method is enough: no code generation is
    # needed to build an N-dimensional table or to index it with N subscripts.
    tok = tokens(corpus)
    # Spell out the key/value types so the dictionary matches `Vector{T}` in
    # NGramFrequencies even when the corpus is empty or has an abstract eltype.
    token_indices = Dict{eltype(tok), Int}(zip(tok, 1:length(tok)))
    counts = zeros(Int64, ntuple(i -> length(tok), Val{N}))
    for toks in SlidingWindow(corpus, N)
        # Tuple of N subscripts, one per position in the window.
        inds = ntuple(i -> token_indices[toks[i]], Val{N})
        counts[inds...] += 1
    end
    # Add-one smoothing keeps unseen n-grams away from log(0).
    counts .+= 1
    NGramFrequencies(log.(counts ./ sum(counts)), tok, token_indices)
end

"""
    frequency(freqs, toks)

Look up a partially specified n-gram. Each entry of `toks` is either a concrete token
or a null `Nullable`, which acts as a wildcard covering that whole axis. The selected
log values are exponentiated, averaged, and the log of that average is returned.

`toks` must supply exactly `N` entries.
"""
function frequency{N, T}(freqs::NGramFrequencies{N, T}, toks::AbstractVector{Nullable{T}})
    # Null entries select the whole axis; everything else is unwrapped to its token.
    selectors = [isnull(t) ? (:) : get(t) for t in toks]
    selected = view(freqs, selectors...)
    return log(mean(exp.(selected)))
end

"""
    likelihood(freqs, str::String)

Score `str` by summing the stored log values of every length-`N` window of its
characters. Returns `0.0` when `str` has fewer than `N` characters (there is nothing
to sum). Characters missing from the table's vocabulary raise a `KeyError`.
"""
function likelihood{N, T}(freqs::NGramFrequencies{N, T}, str::String)
    # Accumulate from an explicit 0.0: reducing an empty generator with `sum` throws,
    # which made every input shorter than the window size an error.
    total = 0.0
    for toks in SlidingWindow(collect(str), N)
        total += freqs[toks...]
    end
    return total
end

"""
    likelihood(freqs, str::AbstractVector{Nullable{T}})

Score a token sequence that may contain wildcards (null entries) by summing
`frequency(freqs, window)` over every length-`N` window. Returns `0.0` when the
sequence has fewer than `N` entries.
"""
function likelihood{N, T}(freqs::NGramFrequencies{N, T}, str::AbstractVector{Nullable{T}})
    # Explicit accumulator: `sum` over an empty generator throws, so sequences shorter
    # than the window size used to error instead of scoring as "no evidence".
    total = 0.0
    for toks in SlidingWindow(str, N)
        total += frequency(freqs, toks)
    end
    return total
end
        

"""
    likelihood(freqs, str::AbstractVector)

Fallback for plain token vectors: every element is wrapped as `Nullable{T}` and the
call is forwarded to the wildcard-aware method.
"""
function likelihood{N, T}(freqs::NGramFrequencies{N, T}, str::AbstractVector)
    wrapped = convert(Vector{Nullable{T}}, str)
    return likelihood(freqs, wrapped)
end
    
end



ngram

In [297]:
# Read the file by path so no stream is left open, then normalise the text.
corpus = ngram.cleanup_corpus(readstring("data/mobydick.txt"));

In [298]:
# Build the bigram (n = 2) table from the cleaned corpus.
freqs = ngram.count_ngrams(corpus, 2)

ngram.NGramFrequencies{2,Char}([-13.9895 -3.89265 … -6.12237 -10.406; -5.3453 -12.1977 … -6.31162 -8.74245; … ; -4.55672 -8.3025 … -13.9895 -13.9895; -11.5046 -10.0192 … -10.7706 -10.2053],[' ','a','b','c','d','e','f','g','h','i'  …  'q','r','s','t','u','v','w','x','y','z'],Dict('g'=>8,'a'=>2,'d'=>5,'l'=>13,'m'=>14,'p'=>17,' '=>1,'q'=>18,'b'=>3,'t'=>21…))

In [299]:
# 't' followed by any token: the null entry acts as a wildcard over the second axis.
ngram.frequency(freqs, [Nullable('t'), Nullable()])

-5.882450823928156

In [300]:
# Score a string: the stored log values of its successive bigrams are summed.
ngram.likelihood(freqs, "salkdjf lkjsdf wjejrs")

-189.8376362654289

In [301]:
# Score a sequence whose last position is a wildcard (null entry).
ngram.likelihood(freqs, [Nullable('t'), Nullable('h'), Nullable()])

-9.8509186931655

In [302]:
# Plain Char vector: converted to Vector{Nullable{Char}} before scoring.
ngram.likelihood(freqs, ['t', 'h', 'y'])

-11.71628210227854