In [12]:
module CharLM

using Knet,AutoGrad,JLD
export Knet,JLD

# Build or load the model weights and the character vocabulary.
#
# text: collection of strings that will be used as data
# o: option dictionary; reads :loadfile, :atype, :hidden, :embed and :winit
#
# When o[:loadfile] is `nothing` a new vocabulary is created (characters are numbered
# in order of first appearance, starting from 1) and the weights are randomly
# initialized. Otherwise "vocab" and "model" are read from the JLD file, an error is
# raised for any character of `text` that the loaded vocabulary does not contain, and
# the loaded weights are converted to o[:atype].
#
# Returns the tuple (model, vocab).
function initialize(text, o)
    # `nothing` is a singleton: test identity (===) instead of generic equality (==)
    if o[:loadfile] === nothing
        vocab = Dict{Char,Int}()
        for t in text, c in t
            get!(vocab, c, 1+length(vocab))
        end
        model = CharLM.initweights(o[:atype], o[:hidden], length(vocab), o[:embed], o[:winit])
    else
        info("Loading model from $(o[:loadfile])")
        vocab = JLD.load(o[:loadfile], "vocab")
        for t in text, c in t
            haskey(vocab, c) || error("Unknown char $c")
        end
        model = map(p->convert(o[:atype],p), JLD.load(o[:loadfile], "model"))
    end
    return model, vocab
end

# Train `model` in place.
#
# model: weight array as returned by `initialize`
# text: collection of strings; text[1] is the training set and, when more than one
#       text is given, text[2] is used as the development set (otherwise text[1] is)
# vocab: character => index dictionary used to build the minibatches
# o: option dictionary; reads :atype, :hidden, :batchsize, :lr, :fast, :epochs,
#    :seqlength, :gclip, :gcheck, :bestfile and :decay
#
# With o[:fast] set, only the training epochs are run (timed as a whole) and nothing is
# reported or saved. Otherwise the loss on every text is printed before training and
# after each epoch, the model with the lowest development loss is saved to
# o[:bestfile] (unless that is `nothing`), and the learning rate is multiplied by
# o[:decay] whenever the development loss is higher than after the previous epoch.
function train!(model, text, vocab, o)
    s0 = initstate(o[:atype], o[:hidden], o[:batchsize])
    data = map(t->minibatch(t, vocab, o[:batchsize]), text)
    lr = o[:lr]

    if o[:fast]
        @time begin
            for epoch in 1:o[:epochs]
                train1(model, copy(s0), data[1]; slen=o[:seqlength], lr=lr, gclip=o[:gclip])
            end
            Knet.cudaDeviceSynchronize()
        end
        return
    end

    # Every evaluation and every epoch starts from a fresh copy of the zero state.
    losses = [loss(model, copy(s0), d) for d in data]
    println((:epoch,0,:loss,losses...))
    devset = length(data) > 1 ? 2 : 1
    devbest = losses[devset]
    devlast = devbest
    for epoch in 1:o[:epochs]
        @time train1(model, copy(s0), data[1]; slen=o[:seqlength], lr=lr, gclip=o[:gclip])
        @time losses = [loss(model, copy(s0), d) for d in data]
        println((:epoch,epoch,:loss,losses...))
        if o[:gcheck] > 0
            gradcheck(loss, model, copy(s0), data[1], 1:o[:seqlength]; gcheck=o[:gcheck])
        end
        devloss = losses[devset]
        if devloss < devbest
            devbest = devloss
            if o[:bestfile] != nothing
                info("Saving best model to $(o[:bestfile])")
                save(o[:bestfile], "model", model, "vocab", vocab)
            end
        end
        if devloss > devlast
            lr *= o[:decay]
            info("New learning rate: $lr")
        end
        devlast = devloss
    end
end


# Run one pass over `sequence`, taking one gradient step per window of `slen` time steps.
#
# param: model weights, updated in place
# state: hidden/cell state; modified in place and carried over from window to window
# sequence[t]: input token at time t
# slen: number of time steps per window (one gradient step each)
# lr: step size
# gclip: when > 0, the step is scaled down so that the global gradient norm used for
#        the update does not exceed gclip
function train1(param, state, sequence; slen=100, lr=1.0, gclip=0.0)
    for t = 1:slen:length(sequence)-slen
        range = t:t+slen-1
        gloss = lossgradient(param, state, sequence, range)
        gscale = lr
        # The gradient norm is only needed for clipping, so compute it only then.
        if gclip > 0
            gnorm = sqrt(mapreduce(sumabs2, +, 0, gloss))
            if gnorm > gclip
                gscale *= gclip / gnorm
            end
        end
        for k in 1:length(param)
            # in-place equivalent of param[k] -= gscale * gloss[k]
            Knet.axpy!(-gscale, gloss[k], param[k])
        end
        isa(state,Vector{Any}) || error("State should not be Boxed.")
        # The following is needed in case AutoGrad boxes state values during gradient calculation
        for i = 1:length(state)
            state[i] = AutoGrad.getval(state[i])
        end
    end
end

# Create the randomly initialized weight array of the model.
#
# Layout of the result:
#   param[2k-1], param[2k]: weight and bias of the k'th lstm layer
#   param[end-2]:           embedding matrix (vocab x embed)
#   param[end-1], param[end]: weight and bias of the final prediction
#
# atype: array type every weight is converted to
# hidden: hidden sizes, one entry per lstm layer
# vocab: vocabulary size
# embed: embedding size
# winit: factor applied to the randn-initialized matrices
function initweights(atype, hidden, vocab, embed, winit)
    param = Any[]
    insize = embed
    for hsize in hidden
        push!(param, winit * randn(insize + hsize, 4 * hsize))
        # The first hsize bias entries belong to the forget gate and start at 1,
        # the rest of the bias starts at 0.
        push!(param, hcat(ones(1, hsize), zeros(1, 3 * hsize)))
        insize = hsize
    end
    push!(param, winit * randn(vocab, embed))
    push!(param, winit * randn(hidden[end], vocab))
    push!(param, zeros(1, vocab))
    return map(p->convert(atype,p), param)
end

# Create an all-zero initial state.
#
# state[2k-1], state[2k]: hidden and cell for the k'th lstm layer, each of size
# (batchsize, hidden[k]) and converted to `atype`.
function initstate(atype, hidden, batchsize)
    state = Any[]
    for hsize in hidden
        push!(state, zeros(batchsize, hsize)) # hidden
        push!(state, zeros(batchsize, hsize)) # cell
    end
    return map(s->convert(atype,s), state)
end

# Compute one lstm step.
#
# weight, bias: parameters of the layer; `[input hidden] * weight .+ bias` gives the
#               gate pre-activations
# hidden, cell: previous hidden and cell values; size(hidden,2) is the hidden size
# input: input of the layer at this time step
#
# With h the hidden size, the pre-activation columns are used as follows:
# 1:h forget gate, h+1:2h input gate, 2h+1:3h output gate, 3h+1:end candidate change.
# Returns the new (hidden, cell) pair; the arguments are not modified.
function lstm(weight,bias,hidden,cell,input)
    hsize = size(hidden,2)
    preact = hcat(input,hidden) * weight .+ bias
    fcols = 1:hsize
    icols = 1+hsize:2hsize
    ocols = 1+2hsize:3hsize
    forget  = sigm(preact[:,fcols])
    ingate  = sigm(preact[:,icols])
    outgate = sigm(preact[:,ocols])
    change  = tanh(preact[:,1+3hsize:end])
    newcell = cell .* forget + ingate .* change
    newhidden = outgate .* tanh(newcell)
    return (newhidden,newcell)
end

# Run the model for one time step and return the unnormalized output scores.
#
# w[2k-1,2k]: weight and bias for k'th lstm layer
# w[end-2]: embedding matrix
# w[end-1,end]: weight and bias for final prediction
# s[2k-1,2k]: hidden and cell for the k'th lstm layer; overwritten with the new values
# x: input at this time step; it is multiplied by the embedding matrix first
function predict(w, s, x)
    layerin = x * w[end-2]
    for i in 1:2:length(s)
        s[i], s[i+1] = lstm(w[i], w[i+1], s[i], s[i+1], layerin)
        # the hidden output of this layer is the input of the next one
        layerin = s[i]
    end
    return layerin * w[end-1] .+ w[end]
end

# Average negative log probability of the next token over the time steps in `range`.
#
# param: model weights
# state: hidden/cell state; modified in place by `predict`
# sequence[t]: input token at time t; sequence[t+1] is the token to be predicted
# range: time steps to evaluate (default: all steps that have a successor)
#
# The summed log probability is divided by the number of rows accumulated over all
# evaluated steps (size(sequence[t+1],1) per step).
function loss(param,state,sequence,range=1:length(sequence)-1)
    atype = typeof(AutoGrad.getval(param[1]))
    logprob = 0.0
    nrows = 0
    x = convert(atype,sequence[first(range)])
    for t in range
        scores = predict(param,state,x)
        lognorm = logp(scores,2) # scores .- log(sum(exp(scores),2))
        target = convert(atype,sequence[t+1])
        logprob += sum(target .* lognorm)
        nrows += size(target,1)
        # the token just predicted is the input of the next step
        x = target
    end
    return -logprob / nrows
end

# Gradient of `loss` with respect to its first argument (the weights), built by AutoGrad.
# Declared `const` because it is called inside the training loop and a non-constant
# global has no known type at the call site.
const lossgradient = grad(loss)

# Print `nchar` characters sampled from the model, followed by a newline.
#
# param: model weights
# state: hidden/cell state; modified in place by `predict`
# vocab: character => index dictionary
# nchar: number of characters to print
#
# The first prediction is made from an all-zero input; afterwards the input is the
# one-hot row of the character sampled last.
function generate(param, state, vocab, nchar)
    chars = Array(Char, length(vocab))
    for (ch, idx) in vocab
        chars[idx] = ch
    end
    onehot = oftype(param[1], zeros(1,length(vocab)))
    current = 1
    for step in 1:nchar
        scores = predict(param,state,onehot)
        # clear the previous character (a no-op on the first step) ...
        onehot[current] = 0
        current = sample(exp(logp(scores)))
        print(chars[current])
        # ... and mark the one just sampled as the next input
        onehot[current] = 1
    end
    println()
end


# Draw an index from the discrete distribution `p`.
#
# p: array of probabilities (converted to a regular Array first)
#
# Returns the first index c at which the cumulative sum of p[1:c] exceeds a number
# drawn uniformly from [0,1). If rounding leaves the total probability at or below the
# drawn number, the last index is returned so that the result is always a valid index.
# Throws an ArgumentError when `p` is empty.
function sample(p)
    p = convert(Array,p)
    isempty(p) && throw(ArgumentError("cannot sample from an empty distribution"))
    r = rand()
    for c = 1:length(p)
        r -= p[c]
        r < 0 && return c
    end
    # Only reached when sum(p) <= r, e.g. low-precision probabilities summing to
    # slightly less than 1.
    return length(p)
end

# Return the path of data/100.txt inside the Knet package directory, downloading the
# file from Project Gutenberg first if it is not there yet.
function shakespeare()
    path = Pkg.dir("Knet","data","100.txt")
    isfile(path) && return path
    info("Downloading 'The Complete Works of William Shakespeare'")
    download("http://www.gutenberg.org/files/100/100.txt", path)
    return path
end

# Convert a text into a sequence of one-hot minibatches.
#
# chars: iterable of characters (e.g. a String)
# char_to_index: maps every character of `chars` to its column index
# batch_size: number of rows of every minibatch
#
# Returns a vector of nbatch = div(length(chars), batch_size) BitArrays of size
# (batch_size, length(char_to_index)). The character at 0-based position cidx is stored
# in minibatch 1 + cidx % nbatch, row 1 + div(cidx, nbatch), so each row holds a
# contiguous stretch of the text spread over consecutive minibatches. Characters that
# do not fit into batch_size complete rows are dropped. A text shorter than batch_size
# yields an empty vector.
function minibatch(chars, char_to_index, batch_size)
    nbatch = div(length(chars), batch_size)
    vocab_size = length(char_to_index)
    data = [ falses(batch_size, vocab_size) for i=1:nbatch ] # using BitArrays
    # No complete minibatch: return before `cidx % nbatch` raises a DivideError.
    nbatch == 0 && return data
    cidx = 0
    for c in chars            # safest way to iterate over utf-8 text
        idata = 1 + cidx % nbatch
        row = 1 + div(cidx, nbatch)
        row > batch_size && break
        col = char_to_index[c]
        data[idata][row,col] = 1
        cidx += 1
    end
    return data
end

# To be able to load/save KnetArrays:
# The methods below extend JLD's `writeas` and `readas`: `writeas` turns a KnetArray
# into a KnetJLD wrapper around a regular Array copy, and `readas` turns such a wrapper
# back into a KnetArray.
# NOTE(review): presumably JLD calls these hooks when saving/loading -- confirm
# against the JLD documentation.
if Pkg.installed("JLD") != nothing
    import JLD: writeas, readas
    # Wrapper type holding the regular Array copy of a KnetArray.
    type KnetJLD; a::Array; end
    writeas(c::KnetArray) = KnetJLD(Array(c))
    readas(d::KnetJLD) = KnetArray(d.a)
end

end



CharLM

In [20]:
# Option dictionary passed to CharLM.initialize and CharLM.train!.
o = Dict()
# Text files; the first is the training set and a second one, if given, is the development set.
o[:datafiles] = ["/home/rluser/.julia/v0.5/Knet/data/101.txt"]
# `nothing`: build a new vocabulary and random weights instead of loading a JLD file.
o[:loadfile] = nothing
# Not read by the CharLM code shown in this notebook.
o[:savefile] = "/tmp/CharLM-last.jld"
# Where train! saves the model with the lowest development loss (not used when :fast is set).
o[:bestfile] = "/tmp/CharLM-best.jld"
# Not read by the CharLM code shown in this notebook.
o[:generate] = 0
# Hidden size of each lstm layer (one entry per layer).
o[:hidden] = [256]
# Embedding size.
o[:embed] = 256
# Number of passes over the training set.
o[:epochs] = 3
# Number of rows per minibatch.
o[:batchsize] = 128
# Time steps per gradient update (train1 defaults to 100).
o[:seqlength] = 50 # 100
# Factor applied to the learning rate when the development loss goes up.
o[:decay] = 0.9
# Initial learning rate.
o[:lr] = 1.0
# Gradient norm above which the update is scaled down (0 disables clipping).
o[:gclip] = 3.0
# Factor applied to the randn-initialized weight matrices.
o[:winit] = 0.3
# Values > 0 make train! run gradcheck after every epoch.
o[:gcheck] = 0
# Not read by the CharLM code shown in this notebook.
o[:seed] = -1
# Array type the weights and the state are converted to.
o[:atype] = Knet.KnetArray{Float32}
# true: train! only runs and times the epochs, without loss reports or saving.
o[:fast] = true

true

In [21]:
# Read every data file into a string.
text = map(readstring,o[:datafiles])
# Build (or load, depending on o[:loadfile]) the weights and the vocabulary.
model, vocab = CharLM.initialize(text, o)
# Train the weights in place; with o[:fast] set only the total time is printed.
CharLM.train!(model, text, vocab, o)

  3.541751 seconds (1.84 M allocations: 86.737 MB)


In [22]:
# Start from an all-zero state with batch size 1 and print 100 sampled characters.
state = CharLM.initstate(o[:atype], o[:hidden], 1)
CharLM.generate(model, state, vocab, 100)

g
1T1TTTJeql~~9~~~oJ#
