In [312]:
import numpy as np
from chainer import cuda, Function, FunctionSet, gradient_check, Variable, optimizers
import chainer.functions as F
from textblob import TextBlob
import time
# Number of distinct character indices the model works with. load_data is
# called with keep=VOCAB_SIZE-1, so one index is shared by all rare characters.
VOCAB_SIZE = 100

In [273]:
import codecs
from collections import Counter
def load_data(keep=200, path="responseText-0.txt"):
    """Read a UTF-8 text file and build character <-> index lookup tables.

    Characters are ranked by frequency, the most frequent one getting index 0.
    Only the `keep` most frequent characters keep their own index; every other
    character is mapped to the single shared index `keep`.

    Args:
        keep: number of distinct characters that receive their own index.
        path: file to read. Defaults to the file this notebook was written
            against, so existing calls behave as before.

    Returns:
        A tuple (text, char_to_index, index_to_char):
          text          - list of the lines read from the file, as yielded by
                          iterating over the opened file.
          char_to_index - maps every character seen in the file to an index in
                          the range [0, keep].
          index_to_char - maps the index of each kept character back to the
                          character; the shared rare-character index `keep`
                          has no entry.
    """
    text = []
    frequencies = Counter()
    with codecs.open(path, "r", "utf-8") as f:
        for line in f:
            text.append(line)
            frequencies.update(line)

    # Rank once so that "is this character kept" and "what is its index" are
    # decided from the same ordering, even when several counts are tied.
    char_to_index = {}
    index_to_char = {}
    for rank, (char, _) in enumerate(frequencies.most_common()):
        if rank < keep:
            char_to_index[char] = rank
            index_to_char[rank] = char
        else:
            # rare character: collapse onto the shared index
            char_to_index[char] = keep
    return text, char_to_index, index_to_char
# Load the corpus and the lookup tables. keep=VOCAB_SIZE-1 leaves index
# VOCAB_SIZE-1 as the shared index for characters outside the kept set.
texts,char_to_index,index_to_char = load_data(keep=VOCAB_SIZE-1)

In [286]:
def char_to_onehot(c,char_to_index,vocab_size):
    """Return the index of character `c` as a length-1 int32 array.

    Despite the name, no one-hot vector is built: the result holds only the
    integer index looked up in `char_to_index`. `vocab_size` is accepted for
    call compatibility but is not used.

    Raises:
        KeyError: if `c` is not a key of `char_to_index`.
    """
    index = char_to_index[c]
    return np.asarray([index], dtype=np.int32)

def vector_to_char(v, temp=1.0, index_map=None):
    """Sample one character from a vector of unnormalized scores.

    The scores are divided by `temp`, turned into probabilities with a
    softmax, and an index is drawn from that distribution using
    np.random.random(). The index is then translated to a character.

    Args:
        v: numpy array of scores; it is flattened before use.
        temp: divisor applied to the scores before the softmax.
        index_map: mapping from index to character. When omitted, the
            module-level `index_to_char` table is used.

    Returns:
        The character for the sampled index, or the string "<OOV>" when the
        sampled index has no entry in the mapping.

    Raises:
        ValueError: if `v` is empty (raised by the max reduction).
    """
    if index_map is None:
        index_map = index_to_char

    scaled = v.ravel() / temp
    # Shift so the largest score is 0 before exponentiating. This leaves the
    # softmax unchanged mathematically but keeps exp() from overflowing to inf
    # (which would turn every probability into NaN and break the sampling).
    exponentiated = np.exp(scaled - scaled.max())
    softmax = exponentiated / exponentiated.sum()

    # Inverse-CDF sampling: walk the probabilities until the running total
    # passes the random cutoff. If rounding keeps the total just short of the
    # cutoff, the last index is used.
    cutoff = np.random.random()
    for i, val in enumerate(softmax):
        cutoff -= val
        if cutoff <= 0:
            break
    try:
        return index_map[i]
    except KeyError:
        return "<OOV>"

In [275]:
# Quick check: show the index array produced for the newline character.
char_to_onehot(u'\n',char_to_index,VOCAB_SIZE)

array([30], dtype=int32)

In [427]:
# Width of the character embedding and of every LSTM hidden/cell state.
n_units = 300
# Parameterized layers used by forward_one_step:
#   embed       - maps a character index to an n_units vector
#   l1_x / l1_h - input and recurrent projections for the first LSTM layer;
#                 their sum (4 * n_units wide) is what gets passed to F.lstm
#   l2_x / l2_h - the same pair for the second LSTM layer
#   l3          - maps the second layer's hidden state to VOCAB_SIZE scores
model = FunctionSet(
    embed = F.EmbedID(VOCAB_SIZE, n_units),
    l1_x = F.Linear(n_units, 4 * n_units), 
    l1_h = F.Linear(n_units, 4 * n_units),
    l2_x = F.Linear(n_units, 4 * n_units), 
    l2_h = F.Linear(n_units, 4 * n_units),
    l3 = F.Linear(n_units, VOCAB_SIZE)
)
# Plain SGD over all of the model's parameters.
optimizer = optimizers.SGD(lr=0.001)
optimizer.setup(model.collect_parameters())

In [438]:
def forward_one_step(x_data, y_data, state, train=True):
    """Advance the two-layer LSTM by one character and score its prediction.

    Args:
        x_data: array of input character indices (wrapped in a Variable and
            fed to model.embed).
        y_data: array of target character indices used for the loss.
        state: dict with keys 'c1', 'h1', 'c2', 'h2' holding the previous
            cell and hidden states of both layers.
        train: when True dropout is active and the Variables are non-volatile;
            when False dropout is disabled and the Variables are volatile.

    Returns:
        A tuple (new_state, loss, y): the updated state dict, the softmax
        cross-entropy between `y` and the targets, and the raw output scores
        of the final linear layer.
    """
    is_volatile = not train
    x = Variable(x_data, volatile=is_volatile)
    t = Variable(y_data, volatile=is_volatile)

    def drop(h):
        # dropout is applied on the input side of every layer
        return F.dropout(h, train=train)

    embedded = model.embed(x)

    # layer 1: combine the current input with the previous hidden state
    c1, h1 = F.lstm(state['c1'], model.l1_x(drop(embedded)) + model.l1_h(state['h1']))
    # layer 2: same pattern, fed by layer 1's fresh hidden state
    c2, h2 = F.lstm(state['c2'], model.l2_x(drop(h1)) + model.l2_h(state['h2']))

    y = model.l3(drop(h2))
    loss = F.softmax_cross_entropy(y, t)
    return {'c1': c1, 'h1': h1, 'c2': c2, 'h2': h2}, loss, y

def make_initial_state(batchsize=1, train=True):
    """Build all-zero cell and hidden states for both LSTM layers.

    Returns a dict with keys 'c1', 'h1', 'c2', 'h2'; each value is a Variable
    wrapping a float32 zero array of shape (batchsize, n_units). The Variables
    are volatile when `train` is False.
    """
    is_volatile = not train
    state = {}
    for name in ('c1', 'h1', 'c2', 'h2'):
        zeros = np.zeros((batchsize, n_units), dtype=np.float32)
        state[name] = Variable(zeros, volatile=is_volatile)
    return state

def forward_batch(texts, targets, train=True):
    """Run the network over a batch of strings and return the mean per-step loss.

    Args:
        texts: sequence of input strings, one per batch element.
        targets: sequence of target strings aligned with `texts`.
        train: forwarded to make_initial_state and forward_one_step.

    Returns:
        The sum of the per-step losses returned by forward_one_step, divided
        by the number of time steps.
    """
    state = make_initial_state(batchsize=len(texts), train=train)

    # Transpose from "one string per example" to "one tuple of characters per
    # time step". zip stops at the shortest string, so longer ones are truncated.
    input_steps = zip(*texts)
    target_steps = zip(*targets)
    n_steps = len(input_steps)

    error = np.zeros((), dtype=np.float32)
    for step in xrange(0, n_steps):
        x_ids = np.array([char_to_index[ch] for ch in input_steps[step]], dtype=np.int32)
        t_ids = np.array([char_to_index[ch] for ch in target_steps[step]], dtype=np.int32)
        state, step_loss, _ = forward_one_step(x_ids, t_ids, state, train=train)
        # NOTE(review): train() later calls .backward() on the result, so this
        # accumulation presumably yields a chainer Variable — confirm.
        error += step_loss
    return error / n_steps

def train(dataset, opt, batchsize=10):
    total_error = Variable(np.zeros((), dtype=np.float32), volatile=False)
    state = make_initial_state(batchsize=1, train=True)
    current_time = time.time()
    sentences = 0
    batch_error = 0
    accumulated = []
    for i,post in enumerate(dataset):
        if len(post) < 100:
            #we'll be truncating the batch to the smallest post in it, so make sure none are too small
            continue
        accumulated.append(post)
        if len(accumulated) == batchsize:
            #TODO: ok this part could be better, right now i make the input and outputs the same length
            #by chopping off the first and last letter respectively
            loss = forward_batch([acc[:-1] for acc in accumulated], [acc[1:] for acc in accumulated], train=True)
            accumulated = []
            batch_error += loss.data
            loss.backward()
            loss.unchain_backward() #reset between batches - maybe we need this more often? not sure if it's doing anything here
            #opt.clip_grads(clip)
            opt.update()
        if i%200 == 0:
            print i/float(len(dataset)),sample_sentence(),time.time() - current_time,batch_error
            current_time = time.time()
            batch_error = 0
               
def sample_sentence(seed='A',length=50,temp=1):
    """Generate text from the model, starting from `seed`.

    The network is run in non-training mode from a zero state. At each of the
    `length` steps the most recently produced item is fed in and the next
    character is sampled from the output scores via vector_to_char.

    Args:
        seed: first item of the result; it is looked up in char_to_index as a
            single key.
        length: number of characters to sample after the seed.
        temp: passed through to vector_to_char.

    Returns:
        The seed followed by the sampled items, joined into one string.
    """
    state = make_initial_state(batchsize=1, train=False)
    generated = [seed]
    for _step in xrange(length):
        previous = generated[-1]
        # "<OOV>" is only an output marker and has no index; feed a space instead
        model_input = " " if previous == "<OOV>" else previous
        in_c = char_to_onehot(model_input,char_to_index,VOCAB_SIZE)
        # the target argument is irrelevant here; the input is reused as a placeholder
        state, _, scores = forward_one_step(in_c, in_c, state, train=False)
        generated.append(vector_to_char(scores.data,temp=temp))
    return "".join(generated)

In [440]:
# Rebind `optimizer` to a fresh SGD instance with lr=0.01 (the model cell set
# one up with lr=0.001), clear the gradient buffers, then train on the first
# 10000 lines of the corpus in minibatches of 20 posts.
optimizer = optimizers.SGD(lr=0.01)
optimizer.setup(model.collect_parameters())
optimizer.zero_grads()
train(texts[:10000], optimizer, batchsize=20)

0.02 And is is of can a chobreashas fe aterd eveciantese 12.4042401314 22.59795928
0.04 Aty fin to thom posresile noc, wroverifidy aten diu 11.0946769714 19.6842820644
0.06 Agttonsse I aronely are op that soces ins is ase in 14.6745350361 22.6206822395
0.08 Abes con buto the srolitover, triton carnasoun and  13.30123806 22.1325075626
0.1 At a rompedso 21D20A't to pread shave mot datare al 12.8401789665 22.0858845711
0.12 Ar myf a rerglemy ariety a you on in to to shastsig 14.0450849533 22.2942943573
0.14 Any a in thic at momen are prot that ecic the peat  12.1910228729 19.5227386951
0.16 At stfresentse ea that pron of lea and sery to pow  16.2302150726 22.1350159645
0.18 A it the the exassic renut wiise you crestre a coni 12.6032700539 21.7852401733
0.2 A don ercthis ealt is ithat the nier suther a Aof o 13.1085960865 21.8892943859
0.22 Adr your lidurs conal morty this loentt hast the in 15.1597008705 21.5839211941
0.24 And are enmarsm yould tescropenfe care mish the lim 11.7064061165 



In [452]:
# Sample 50 characters starting from "l"; temp=0.5 halves the divisor applied
# to the scores before the softmax in vector_to_char.
sample_sentence("l", temp=0.5)

u'le a a wenel mor to can and as in this acte and be '