## Pre-requisites

In [None]:
!pip install -U nltk

Run the code below and download the gutenburg package

In [None]:
import nltk
nltk.download()

## Get Data

In [1]:
import numpy as np
import pandas as pd
import cntk
import math
import sys

from nltk.corpus import gutenberg

kjv = gutenberg.raw('bible-kjv.txt')
chars = list(set(kjv))
feature_size = len(chars)
char_to_idx = { ch:i for i,ch in enumerate(chars) }
idx_to_char = { i:ch for i,ch in enumerate(chars) }

# split data into traning and testing
test = kjv[3324518:3334015] # Malachi
train = kjv[:3324518] + kjv[3334015:] # everything else
print("training size is {} and test size is {}".format(len(train),len(test)))

def featureFromChar(ch):
    result = np.zeros(feature_size, dtype=np.float32)
    result[char_to_idx[ch]] = 1
    return result

def charFromFeature(v):
    return idx_to_char[v.tolist().index(1)]
        

training size is 4323057 and test size is 9497


### Test featureFromChar and charFromFeature

In [2]:
print("first character is {}".format(kjv[0]))
v = featureFromChar(kjv[0])
print("v = {} len = {}".format(' '.join([str(x) for x in v]), len(v)))
print("charFromFeature = {}".format(charFromFeature(v)))

first character is [
v = 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 len = 75
charFromFeature = [


## Quick review of Recurrent Neural Networks
First we calculate what to forget
$$f_t = \sigma (W_f \cdot [h_{t-1}, x_t] + b_f )$$
Then we calculate what to learn
$$i_t = \sigma (W_i \cdot [h_{t-1}, x_t] + b_i )$$
$$\tilde C_t = tanh(W_C \cdot [h_{t-1},x_t] + b_C )$$
Then we update memory
$$C_t = f_t \cdot C_{t-1} + i_t \cdot \tilde C_t$$
Finally we calculate output
$$o_t = \sigma (W_o [h_{t-1}, x_t] + b_o)$$
$$h_t = o_t \cdot tanh(C_t)$$

## Helpful Functions

In [3]:
class KJVDeserializer(cntk.io.UserDeserializer):
    def __init__(self, data, streams):
        super(KJVDeserializer, self).__init__()
        self._chunksize = 5000
        self._data = data
        
        # Create the information about streams
        # based on the user provided data
        self._streams = [cntk.io.StreamInformation(s['name'], i, 'dense', np.float32, s['shape'])
                         for i, s in enumerate(streams)]
        
        # Define the number of chunks based on the file size
        self._num_chunks = int(math.ceil((len(data)-1)/self._chunksize))
        
    def stream_infos(self):
        return self._streams

    def num_chunks(self):
        return self._num_chunks

    # Ok, let's actually get the work done
    def get_chunk(self, chunk_id):
        start = chunk_id * self._chunksize
        end = ((chunk_id + 1) * self._chunksize)+1
        if (chunk_id == self._num_chunks):
            end = len(self._data)
        data = self._data[start:end]
        datalen = len(data) - 1
        result = {}
        for i, stream in enumerate(self._streams):
            result[stream.m_name] = np.array([featureFromChar(self._data[j+i]) for j in range(datalen)], dtype=np.float32)
            
        return result

def create_reader(data, is_training=False):
    d = KJVDeserializer(data=data, streams=[dict(name='features', shape=(feature_size,)), dict(name='labels', shape=(feature_size,))])
    return cntk.io.MinibatchSource([d], randomize=False, max_sweeps= cntk.io.INFINITELY_REPEAT if is_training else 1)
    

### Test KJVDeserializer

In [4]:
import time

def test_reader(mbs):
    total_num_samples = 0
    dots = 0
    start = time.time()
    while True:
        mb = mbs.next_minibatch(128)
        if not mb:
            break
    
        total_num_samples += mb[mbs.streams.features].number_of_samples
        if total_num_samples % 12800 == 0:
            sys.stdout.write('.')
            dots += 1
            if dots > 80:
                sys.stdout.write('\n')
                dots = 0
    end = time.time()
    sys.stdout.write('\n')
    print('Total number of samples %d, speed %f samples per second' % (total_num_samples, total_num_samples/(end-start)))    
    
print("Testing train set")
test_reader(create_reader(train))
print("Testing test set")
test_reader(create_reader(test))

Testing train set
.................................................................................
.................................................................................
.................................................................................
.................................................................................
.............
Total number of samples 4323056, speed 136081.478813 samples per second
Testing test set

Total number of samples 9496, speed 81909.925418 samples per second


## Define the Model

In [54]:
#           cntk.layers.Recurrence(
#               step_function, 
#               go_backwards=default_override_or(False), 
#               initial_state=default_override_or(0), 
#               return_full_state=False, 
#               name='')
#           cntk.layers.RecurrenceFrom(
#               step_function, 
#               go_backwards=default_override_or(False), 
#               return_full_state=False, 
#               name='')
#           cntk.layers.Fold(
#               folder_function, 
#               go_backwards=default_override_or(False), 
#               initial_state=default_override_or(0), 
#               return_full_state=False, 
#               name='')
#           cntk.layers.UnfoldFrom(
#               generator_function, 
#               until_predicate=None, 
#               length_increase=1, 
#               name='')
        
        
#           cntk.layers.LSTM(
#               shape, 
#               cell_shape=None, 
#               activation=default_override_or(tanh), 
#               use_peepholes=default_override_or(False),
#               init=default_override_or(glorot_uniform()), 
#               init_bias=default_override_or(0),
#               enable_self_stabilization=default_override_or(False),
#               name='')
#           cntk.layers.GRU(
#               shape, 
#               cell_shape=None, 
#               activation=default_override_or(tanh),
#               init=default_override_or(glorot_uniform()), 
#               init_bias=default_override_or(0),
#               enable_self_stabilization=default_override_or(False),
#               name='')
#           cntk.layers.RNNStep(
#               shape, 
#               cell_shape=None, 
#               activation=default_override_or(sigmoid),
#               init=default_override_or(glorot_uniform()),
#               init_bias=default_override_or(0),
#               enable_self_stabilization=default_override_or(False),
#               name='')

def create_model(x):
#    with cntk.layers.default_options(initial_state = 0.1):
    m = cntk.layers.Sequential([
        cntk.layers.Stabilizer(),
        cntk.layers.Recurrence(cntk.layers.LSTM(feature_size), name='RecurrenceLayer1'),
        cntk.sequence.last,
#        cntk.layers.Dropout(0.2, name='DropoutLayer'),
        cntk.layers.Dense(feature_size, activation=None, name='DenseLayer')
    ])
    return m(x)

# Single RNN LSTM
# (Min Loss: 3.4094 Min Error: 0.64)

# Single RNN LSTM isolating last sequence                       
# (Min Loss: 3.4094 Min Error: 0.65)

# Single RNN LSTM isolating last sequence into a Dense Layer    
# (Min Loss: 1.9809 Min Error: 0.58)

# Single RNN GRU                                                
# (Min Loss: 3.0813 Min Error: 0.60)

# Single RNN GRU isolating last sequence                        
# (Min Loss: 3.0812 Min Error: 0.60)

# Single RNN GRU isolating last sequence into a Dense Layer     
# (Min Loss: 1.9719 Min Error: 0.58)

# Single RNN RNNStep                                            
# (Min Loss: 3.7362 Min Error: 0.61)
# Single RNN RNNStep isolating last sequence                    
# (Min Loss: 3.7349 Min Error: 0.61)

# Single RNN RNNStep isolating last sequence into a Dense Layer 
# (Min Loss: 2.0307 Min Error: 0.58)


In [55]:
dynamic_feature_axes = [cntk.Axis.default_batch_axis(), cntk.Axis.default_dynamic_axis()]
features = cntk.input_variable(shape=(feature_size, ), dynamic_axes=dynamic_feature_axes)

#seq_axis = cntk.Axis('inputAxis')
#features = cntk.sequence.input_variable(shape=(feature_size, ), sequence_axis=seq_axis)

model = create_model(features)

cntk.logging.log_number_of_parameters(model)

labels = cntk.input_variable(shape=feature_size, dynamic_axes=model.dynamic_axes)
#labels = cntk.sequence.input_variable(shape=feature_size, sequence_axis=seq_axis)

loss = cntk.cross_entropy_with_softmax(model, labels)
label_error = cntk.classification_error(model, labels)

learning_rate = 0.05

minibatch_size = 100
samples_per_sweep = len(train)
sweeps = 50000
minibatches_to_train = (samples_per_sweep * sweeps) / minibatch_size

lr_schedule = cntk.learning_rate_schedule(learning_rate, cntk.UnitType.minibatch)

#learner = cntk.sgd(model.parameters, lr_schedule)

momentum_time_constant = cntk.momentum_as_time_constant_schedule(minibatch_size / -math.log(0.9)) 
learner = cntk.fsadagrad(model.parameters, 
                      lr = lr_schedule, 
                      momentum = momentum_time_constant, 
                      unit_gain = True)

trainer = cntk.Trainer(model, (loss, label_error), [learner])

def print_training_progress(trainer, mb, frequency, verbose=1):
    training_loss = "NA"
    eval_error = "NA"

    if mb % frequency == 0:
        training_loss = trainer.previous_minibatch_loss_average
        eval_error = trainer.previous_minibatch_evaluation_average
        if verbose: 
            print ("Minibatch: {0}, Loss: {1:.4f}, Error: {2:.2f}".format(mb, training_loss, eval_error))
        
    return mb, training_loss, eval_error

reader_train = create_reader(train, True)

input_map = {
    labels: reader_train.streams.labels,
    features: reader_train.streams.features
}

training_progress_output_freq = 5000

out = cntk.softmax(model)


Training 51001 parameters in 6 parameter tensors.


In [56]:
for i in range(int(minibatches_to_train)):
    data = reader_train.next_minibatch(minibatch_size, input_map=input_map)
    trainer.train_minibatch(data)
    batchsize, loss, error = print_training_progress(trainer, i, training_progress_output_freq)
    if i % 50000 == 0:
        print("-----------")
        ch = '1'
        for j in range(1000):
            feature = featureFromChar(ch)
            output = out.eval(feature)
            sys.stdout.write(ch)
            ch = idx_to_char[np.random.choice(range(feature_size), p=output[0].ravel())]
        print("\n-----------")


Minibatch: 0, Loss: 4.3236, Error: 1.00
-----------
10N,UT;2]vVGtr
ruZ
auC62fDcgQp99jSvZ8]icFnZlOS]yU2a,zv43Z(nGyoiI:iDueHr!A:qtqWe)AP,z8H'o3LxIEWl!?g13zJEEyYj9WC;2aZbtBDi-3m8iZ7pZDVsb]-',CRZRSIRh,?18W]sWp!PiG6)P'chrFnI26.]nTPEv7Bh'H)3n-QsyQi)gxt.v4vYD0xJrR3LFPe7Mgi5!AnLzWsK).f WxsbeF7Y0c:59FnY31p.hjotI?RcK mn,Wkww7qsA7[Ho6c
tpac5ZqLJVFPedsoC:8I;l4f AIyGzz2W-jclzln;Kf1ZWzNdaRC1Q7 NkBNE'O?vJ
JrGkCcxz[VYZsPS-Ly5;0Q1dVRyMJkBZFdm
VK05fzx0ZE:86reA7l(,0t.t07g19iL(FTeJ,ZM6b[Pfwkc-[L:WQmFR5uj,h2npu:i(1[i'tf4WmogrPc'OdAipPKOVQdGj
TltVOa'Jf4BH)5o4 nhif,s3Phpdau1kLGGnmIxsC9.H]
u(w65Ag
FWHKE]Y30xNsRnC4)5VISIti]?CupDtm15:RvuT3KvsIwU?YR?32E.a3DlF!!pfab
'AivmyKcCBsxu[(JuIy?WU17FImzM kfbAOk,hEm9bCP7HnBtkIC6o!]3P[7G[q))UEHu9tEk],eU:BkDiD7-HFunMe.!y89ZSDQOnlbCH?P-Ntkjxo2qJvBtDU]t3mKdIsnm8nNxMvT.:uU0l?l TBr9U(qLwRdW1ib0F4K.sO?x(vnZb5TeRI2)A s2]SmPc3Uzkh]n-(o4)2warMPij5,50-8qxU]H.MsPyFzyCeDbCu49[]POHUicnx,MOusAT0izM(jnEKlfwr]VKVOCP  .':K(kQ20dq
d6IHen709sRSGZiDDGs4tIBZ60b24x!IYR;3b0Cvsgt0,f?2L9(q[o]Zth-Bf

KeyboardInterrupt: 