In [312]:
import numpy as np
from chainer import cuda, Function, FunctionSet, gradient_check, Variable, optimizers
import chainer.functions as F
from textblob import TextBlob
import time
from itertools import izip_longest
# Number of character ids the model handles: it sizes the embedding table and
# the output layer. The vocabulary is built with keep=VOCAB_SIZE-1, which
# leaves the last id for every character that was trimmed as too rare.
VOCAB_SIZE = 150
# Dropout ratio passed to F.dropout() in forward_one_step.
DROPOUT = 0.1
# Threshold passed to the optimizer's clip_grads() on every training step.
CLIP = 5

In [None]:
import os
def file_split_generator(fn,n,yieldsize):
    """Read ``n`` evenly spaced streams of a file in lock-step.

    The file is cut into ``n`` segments of (roughly) equal byte length. Each
    iteration yields a list of ``n`` strings: the next ``yieldsize`` characters
    of every segment, all taken at the same offset from the segment start.
    Iteration stops once another read would reach the end of the first segment.

    Args:
        fn: path of the file to read.
        n: number of parallel streams (segments); must be at least 1.
        yieldsize: number of characters read per stream per iteration; must be
            at least 1.

    Yields:
        A list of ``n`` strings, one per stream.

    Raises:
        ValueError: if ``n`` or ``yieldsize`` is smaller than 1. Because this
            is a generator, the error surfaces on the first iteration.
    """
    if n < 1:
        raise ValueError("n must be at least 1, got {!r}".format(n))
    if yieldsize < 1:
        # A non-positive step would never advance the offset (endless loop).
        raise ValueError("yieldsize must be at least 1, got {!r}".format(yieldsize))

    size = os.path.getsize(fn)
    # Length of the first segment. Using size // n instead of cutoffs[1] keeps
    # the bound well defined when there is only a single stream.
    segment_len = size // n
    # Explicit floor division: byte offsets must be integers.
    cutoffs = [i * size // n for i in range(n)]
    offset = 0
    #open as ascii for now; utf-8 makes this not work
    #(seeking to arbitrary offsets can land inside a multi-byte character)
    with open(fn,"r") as f:
        # Finishing the loop ends the generator; raising StopIteration by hand
        # is turned into a RuntimeError by PEP 479.
        while offset + yieldsize < segment_len:
            result = []
            for cutoff in cutoffs:
                f.seek(cutoff + offset)
                result.append(f.read(yieldsize))
            offset += yieldsize
            yield result

In [273]:
import codecs
from collections import Counter
def get_vocabulary(fn, keep=200):
    """Build character <-> index tables from the characters of a text file.

    Characters are ranked by frequency (most frequent first). The ``keep``
    most frequent characters get their rank as index; every other character
    seen in the file is mapped to the shared index ``keep``.

    The number of distinct characters found is printed as a side effect.

    Args:
        fn: path of the text file to scan.
        keep: how many of the most frequent characters get their own index.

    Returns:
        A tuple ``(char_to_index, index_to_char)``. ``char_to_index`` covers
        every character seen in the file; ``index_to_char`` only contains the
        kept characters, so the shared rare-character index has no entry.
    """
    frequencies = Counter()
    with open(fn,"r") as f:
        for line in f:
            # Count characters as we go; the file contents are not retained.
            frequencies.update(line)
    print(len(frequencies))

    # Rank once and derive both tables from the same ordering.
    ranked = [ch for ch, _ in frequencies.most_common()]
    kept = ranked[:max(keep, 0)]

    #trim rare chars
    index_to_char = {i: ch for i, ch in enumerate(kept)}
    char_to_index = {ch: (i if i < len(kept) else keep)
                     for i, ch in enumerate(ranked)}
    return char_to_index,index_to_char

# Character tables come from the training text. keep=VOCAB_SIZE-1 leaves the
# last index for every character that was trimmed as too rare.
char_to_index,index_to_char = get_vocabulary("data/response_train.txt", keep=VOCAB_SIZE-1)

# One validation text per line. Only the line terminator is stripped, so a
# final line without a trailing newline keeps its last character.
with open("data/response_valid.txt","r") as f:
    validation_texts = [line.rstrip("\n") for line in f]

In [286]:
def char_to_onehot(c,char_to_index,vocab_size):
    """Encode one character as a length-1 int32 array holding its index.

    Despite the name, no one-hot vector is built: the result is the integer id
    looked up in ``char_to_index``. ``vocab_size`` is accepted but not used.

    Raises:
        KeyError: if ``c`` is not a key of ``char_to_index``.
    """
    index = char_to_index[c]
    return np.array([index], dtype=np.int32)

def vector_to_char(v, temp=1.0):
    """Sample a character from a vector of unnormalised scores.

    The scores are flattened, divided by ``temp`` and turned into a probability
    distribution with a softmax; one index is then drawn from it and translated
    through the module-level ``index_to_char`` table.

    Args:
        v: array of scores; any shape, it is flattened first.
        temp: softmax temperature. Values below 1 sharpen the distribution,
            values above 1 flatten it.

    Returns:
        The sampled character, or ``"<OOV>"`` when the sampled index has no
        entry in ``index_to_char``.

    Raises:
        ValueError: if ``v`` is empty.
    """
    logits = np.asarray(v, dtype=np.float64).ravel() / temp
    if logits.size == 0:
        raise ValueError("cannot sample a character from an empty score vector")
    # Subtracting the maximum leaves the softmax unchanged but keeps exp() from
    # overflowing to inf (which turned the probabilities into nan).
    exponentiated = np.exp(logits - logits.max())
    softmax = exponentiated / exponentiated.sum()
    cutoff = np.random.random()
    # First index whose cumulative probability reaches the draw. Rounding can
    # leave the final cumulative sum just below 1, hence the clamp.
    i = min(int(np.searchsorted(np.cumsum(softmax), cutoff)), softmax.size - 1)
    try:
        return index_to_char[i]
    except KeyError:
        return "<OOV>"

In [427]:
# Width of the character embedding and of each LSTM layer's hidden/cell state.
n_units = 500
# Parameters of the network used by forward_one_step:
#   embed        character id -> n_units vector
#   l1_x / l1_h  input and recurrent projections of the first LSTM layer
#   l2_x / l2_h  input and recurrent projections of the second LSTM layer
#   l3           last hidden state -> VOCAB_SIZE output scores
# The LSTM projections are 4 * n_units wide because their sum is what is
# handed to F.lstm in forward_one_step.
model = FunctionSet(
    embed = F.EmbedID(VOCAB_SIZE, n_units),
    l1_x = F.Linear(n_units, 4 * n_units), 
    l1_h = F.Linear(n_units, 4 * n_units),
    l2_x = F.Linear(n_units, 4 * n_units), 
    l2_h = F.Linear(n_units, 4 * n_units),
    l3 = F.Linear(n_units, VOCAB_SIZE)
)

In [438]:
import chainer.computational_graph as c

def forward_one_step(x_data, y_data, state, train=True):
    """Run the two-layer LSTM for one time step.

    Args:
        x_data: int32 array of input character ids for this step.
        y_data: int32 array of target character ids for this step.
        state: dict with the keys 'c1', 'h1', 'c2', 'h2' holding the cell and
            hidden Variables of both LSTM layers from the previous step.
        train: when True dropout is active; when False the input Variables are
            created as volatile and dropout is disabled.

    Returns:
        A tuple ``(new_state, loss, y)``: the updated state dict, the softmax
        cross-entropy of the output scores against the targets, and the
        output scores themselves.
    """
    volatile = not train
    x = Variable(x_data, volatile=volatile)
    t = Variable(y_data, volatile=volatile)

    def drop(h):
        # Same dropout configuration in front of every layer.
        return F.dropout(h, ratio=DROPOUT, train=train)

    embedded = model.embed(x)

    # First LSTM layer: input projection plus recurrent projection.
    gates1 = model.l1_x(drop(embedded)) + model.l1_h(state['h1'])
    c1, h1 = F.lstm(state['c1'], gates1)

    # Second LSTM layer, fed by the first layer's hidden state.
    gates2 = model.l2_x(drop(h1)) + model.l2_h(state['h2'])
    c2, h2 = F.lstm(state['c2'], gates2)

    # Output scores over the vocabulary.
    y = model.l3(drop(h2))

    new_state = {'c1': c1, 'h1': h1, 'c2': c2, 'h2': h2}
    loss = F.softmax_cross_entropy(y, t)
    return new_state, loss, y

def make_initial_state(batchsize=1, train=True):
    """Create an all-zero recurrent state for both LSTM layers.

    Returns:
        A dict with the keys 'c1', 'h1', 'c2', 'h2', each a Variable wrapping a
        float32 zero array of shape (batchsize, n_units). The Variables are
        volatile when ``train`` is False.
    """
    state = {}
    for name in ('c1', 'h1', 'c2', 'h2'):
        zeros = np.zeros((batchsize, n_units), dtype=np.float32)
        state[name] = Variable(zeros, volatile=not train)
    return state

def forward_batch(texts, targets, train=True, init_state=None):
    """Run the model over a batch of equally long character sequences.

    Args:
        texts: sequence of input strings, one per batch row.
        targets: sequence of target strings, aligned with ``texts``.
        train: forwarded to forward_one_step and make_initial_state.
        init_state: recurrent state to start from; a fresh zero state sized to
            the batch is created when it is None.

    Returns:
        A tuple ``(mean_error, state)``: the per-step losses summed and divided
        by the number of steps, and the recurrent state after the last step.
    """
    state = init_state
    if state is None:
        state = make_initial_state(batchsize=len(texts), train=train)

    # Transpose from one string per row to one tuple of characters per time
    # step. zip stops at the shortest string, so longer rows are truncated.
    input_steps = zip(*texts)
    target_steps = zip(*targets)
    n_steps = len(input_steps)

    error = np.zeros((), dtype=np.float32)
    for step in xrange(0, n_steps):
        x_ids = np.array([char_to_index[ch] for ch in input_steps[step]], dtype=np.int32)
        t_ids = np.array([char_to_index[ch] for ch in target_steps[step]], dtype=np.int32)
        state, step_error, _ = forward_one_step(x_ids, t_ids, state, train=train)
        error += step_error
    return error/(n_steps), state

def grouper(iterable, n, fillvalue=None):
    """Collect items into fixed-length tuples of ``n``, padding the last one.

    Example: grouper('ABCDEFG', 3, 'x') yields ABC, DEF, Gxx.
    """
    # The same iterator object repeated n times: each output tuple therefore
    # pulls n consecutive items from the underlying iterable.
    shared = iter(iterable)
    return izip_longest(*([shared] * n), fillvalue=fillvalue)

def evaluate(dataset, batchsize=10):
    """Compute the mean per-character loss of the model on a set of texts.

    The texts are processed in batches of ``batchsize``; texts of two
    characters or fewer are skipped. Within a batch each text is used to
    predict its own next character (input ``g[:-1]``, target ``g[1:]``).

    Args:
        dataset: sequence of strings.
        batchsize: number of texts evaluated together.

    Returns:
        The average of the per-batch mean losses, or ``nan`` when no text was
        long enough to be evaluated. Batches can differ in size (the last one,
        or any that had short texts filtered out), so this is an unweighted
        mean over batches.
    """
    total = None
    n_batches = 0
    for group in grouper(dataset, batchsize):
        # Drop the None padding of the last group and texts too short to
        # provide an input/target pair.
        group = [g for g in group if g is not None and len(g) > 2]
        if not group:
            continue
        source = [g[:-1] for g in group]
        target = [g[1:] for g in group]
        batch_error = forward_batch(source,target,train=False)[0]
        total = batch_error if total is None else total + batch_error
        n_batches += 1
    if n_batches == 0:
        return float("nan")
    # forward_batch already returns a mean, so normalise by the number of
    # batches; dividing by len(dataset) shrank the result by about batchsize.
    return total.data / n_batches

def train(train_fn, valid, epochs, opt, batchsize=10, eval_count=2000, seq_length=100):
    """Train the model with truncated back-propagation through time.

    The training file is read as ``batchsize`` parallel character streams in
    chunks of ``seq_length`` characters (see file_split_generator). Each chunk
    is one optimisation step; the recurrent state is carried over to the next
    chunk while the computation history is cut after every step.

    Args:
        train_fn: path of the training text file.
        valid: validation texts handed to evaluate().
        epochs: number of passes over the training file.
        opt: optimizer already set up with the model parameters.
        batchsize: number of parallel streams read from the file.
        eval_count: run validation and print a sample every this many steps.
        seq_length: number of characters per stream per step.

    Side effects:
        Updates the model parameters through ``opt`` and prints the loss of
        every step plus a validation line every ``eval_count`` steps.
    """
    # Imported here because no cell of the notebook imports `random`; the
    # reset below otherwise fails with NameError on a fresh kernel.
    import random

    current_time = time.time()
    iterations = 0
    for e in range(epochs):
        # Every epoch rewinds all streams to the start of their file segment,
        # so state left over from the previous epoch no longer matches the text.
        state = None
        for i,batch in enumerate(file_split_generator(train_fn, batchsize, seq_length)):
            iterations += 1
            if random.random() < 0.05:
                #make the model learn how to handle the stateless situation
                #i assume there's a much better way to do this
                state = None
            sources = [text[:-1] for text in batch]
            targets = [text[1:] for text in batch]
            loss,state = forward_batch(sources, targets, train=True, init_state=state)
            print("%s %s" % (i, loss.data))
            opt.zero_grads()
            loss.backward()
            # Cut the history so the next step does not backpropagate into
            # this one; the state values themselves are kept.
            loss.unchain_backward()
            opt.clip_grads(CLIP)
            opt.update()
            if iterations%eval_count == 0:
                valid_error = evaluate(valid)
                print("({}.{}) - {:.6f} (took: {:.2f}s): {}".format(e, i, valid_error,time.time() - current_time,sample_sentence()))
                current_time = time.time()
            
def get_graph(text):
    """Build the computational graph of the loss for a single text.

    The text is run through forward_batch as a batch of one, used as both
    input and target, and the graph is built from the resulting loss.

    Returns:
        Whatever ``c.build_computational_graph`` returns for that loss.
    """
    # forward_batch returns (loss, state); only the loss is a graph output.
    loss, _ = forward_batch([text],[text])
    g = c.build_computational_graph([loss])
    return g
               
def sample_sentence(seed='A',length=50,temp=1):
    state = make_initial_state(batchsize=1, train=False)
    string = []
    for s in seed:
        next_char = s
        in_c = char_to_onehot(next_char,char_to_index,VOCAB_SIZE)
        state,_,next_vals = forward_one_step(in_c, in_c, state, train=False)
        string.append(s)
    for i in xrange(length):
        next_char = string[-1]
        if next_char == "<OOV>":
            next_char = " "
        in_c = char_to_onehot(next_char,char_to_index,VOCAB_SIZE)
        state,_,next_vals = forward_one_step(in_c, in_c, state, train=False)
        string.append(vector_to_char(next_vals.data,temp=temp))
    return "".join(string)

In [1]:
# Set up the optimizer and start training. This cell needs the import cell
# (which provides `optimizers`) and the model/function cells to have been run
# first; the NameError recorded as this cell's output is what happens otherwise.
# Plain SGD is kept below as a commented-out alternative to Adam.
#optimizer = optimizers.SGD(lr=0.02)
optimizer = optimizers.Adam()
optimizer.setup(model.collect_parameters())
optimizer.zero_grads()
# 10 epochs over the training file, 50 parallel streams, validation every 100 steps.
train("data/response_train.txt", validation_texts, 10, optimizer, batchsize=50, eval_count=100)

NameError: name 'optimizers' is not defined

In [452]:
# Sample the default 50 characters after the seed "l" at temperature 0.5.
sample_sentence("l", temp=0.5)

u'le a a wenel mor to can and as in this acte and be '