In [1]:
%matplotlib inline
from __future__ import print_function
import numpy as np
import tensorflow as tf
import random
from six.moves import range
import string
import os
import zipfile
from six.moves.urllib.request import urlretrieve


In [8]:
filename = 'text8.zip'

def read_data(filename):
    with zipfile.ZipFile(filename)as f:
        name = f.namelist()[0]
        data = tf.compat.as_str(f.read(name))
    return data

text = read_data(filename)

In [11]:
# Now create a validation set
valid_size=1000
valid_text= text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])


99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


In [51]:
#Utility func for char_to_id and vice versa

vocabulary_size = len(string.ascii_lowercase) + 1
first_letter_int  = ord(string.ascii_lowercase[0])

def char2id(char):
    if char in string.ascii_lowercase:
        return ord(char) - first_letter_int + 1
    elif char==' ':
        return 0
    else:
        print('Unexpected Character %s' %char)
        return 0
    
def id2char(id):
    if id>0 :
        return chr(id + first_letter_int -1)
    else:
        return ' '

In [22]:
print(char2id(','))
print(id2char(1))

Unexpected Character ,
0
a


In [46]:
# Func to generate a training batch for LSTM Model

batch_size=64
num_unrollings=10
class BatchGenerator(object):
    def __init__(self,text,batch_size,num_unrollings):
        self._text=text
        self._num_unrollings = num_unrollings
        self._text_size = len(text)
        self._batch_size = batch_size
        num_batches = self._text_size // batch_size
        self._cursor = [offset *  num_batches for offset in range(batch_size)]
        self._last_batch = self._next_batch()
    
    def _next_batch(self):
        batch = np.zeros(shape=(self._batch_size,vocabulary_size),dtype=np.float)
        for b in range(self._batch_size):
            batch[b,char2id(self._text[self._cursor[b]])]=1.0
            self._cursor[b] = (self._cursor[b]+1) % self._text_size
        return batch
    
    def next(self):
        batches = [self._last_batch]
        for step in range(self._num_unrollings):
            batches.append(self._next_batch())
        self._last_batch = batches[-1]
        return batches
    
def characters(probabilities):
    return [id2char(i) for i in np.argmax(probabilities,1)]
    
def batches2string(batches):
    s=['']*batches[0].shape[0]
    for b in batches:
        s = [''.join(x) for x in zip(s,characters(b))]
    return s

train_batches = BatchGenerator(train_text,batch_size,num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)



In [49]:
def logprob(predictions,labels):
    predictions[predictions< 1e-10] = 1e-10
    return np.sum(np.multiply(labels,-np.log(predictions)))/labels.shape[0]


def sample_distribution(distribution):
    r = random.uniform(0,1)
    s=0
    for i in range(len(distribution)):
        s += distribution[i]
        if s > r:
            return i
    return len(distribution)-1

def sample(prediction):
    p = np.zeros(shape=[1,vocabulary_size],dtype=np.float)
    p[0,sample_distribution(prediction[0])] = 1.0
    return p

def random_distribution():
    b = np.random.uniform(0.0,1.0,size=[1,vocabulary_size])
    return b/ np.sum(b,1)[:,None]


In [66]:
# Now we'll design a simple LSTM Model


num_nodes=64
graph = tf.Graph()
with graph.as_default():
    # Define hyper params
    # Input gate: inp,prev,bias
    ix = tf.Variable(tf.truncated_normal([vocabulary_size,num_nodes],-0.1,0.1))
    im = tf.Variable(tf.truncated_normal([num_nodes,num_nodes],-0.1,0.1))
    ib = tf.Variable(tf.zeros([1,num_nodes]))
    
    # Forget gate: inp,prev,bias
    fx = tf.Variable(tf.truncated_normal([vocabulary_size,num_nodes],-0.1,0.1))
    fm = tf.Variable(tf.truncated_normal([num_nodes,num_nodes],-0.1,0.1))
    fb = tf.Variable(tf.zeros([1,num_nodes]))
    
    # Memory cell: inp,state,bias
    cx = tf.Variable(tf.truncated_normal([vocabulary_size,num_nodes],-0.1,0.1))
    cm = tf.Variable(tf.truncated_normal([num_nodes,num_nodes],-0.1,0.1))
    cb = tf.Variable(tf.zeros([1,num_nodes]))
    
    #Output Gate : inp,prev,bias
    ox = tf.Variable(tf.truncated_normal([vocabulary_size,num_nodes],-0.1,0.1))
    om = tf.Variable(tf.truncated_normal([num_nodes,num_nodes],-0.1,0.1))
    ob = tf.Variable(tf.zeros([1,num_nodes]))
    
    #Variables saving states across unrollings
    saved_output = tf.Variable(tf.zeros([batch_size,num_nodes]),trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    
    # Classifier weights and bias
    w = tf.Variable(tf.truncated_normal([num_nodes,vocabulary_size],-0.1,0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))
    
    
    #Def of cell computation
    def lstm_cell(i,o,state):
        input_gate = tf.sigmoid(tf.matmul(i,ix) + tf.matmul(o,im) + ib)
        forget_gate = tf.sigmoid(tf.matmul(i,fx)+tf.matmul(o,fm) + fb)
        update = tf.matmul(i,cx)+tf.matmul(o,cm) + cb
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(tf.matmul(i,ox)+tf.matmul(o,om)+ob)
        return output_gate * tf.tanh(state), state
    
    train_data = list()
    for _ in range(num_unrollings+1):
        train_data.append(tf.placeholder(dtype=tf.float32,shape=[batch_size,vocabulary_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:] # labels are inputs shifted by one time step.
    
    #Unrolled lstm loop
    outputs = list()
    output = saved_output
    state = saved_state
    for i in train_inputs:
        output,state = lstm_cell(i,output,state)
        outputs.append(state)
    
    # State savings across unrollings.
    with tf.control_dependencies([saved_output.assign(output),saved_state.assign(state)]):
        logits = tf.nn.xw_plus_b(tf.concat(outputs,axis=0),w,b)
        loss=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf.concat(train_labels,axis=0),logits=logits))
        
    
    #Optimizer
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(10.0,global_step,5000,0.1,staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients,v = zip(*optimizer.compute_gradients(loss)) # * here to unpack coz comp_grads(..) returns list(tuple(gradient,variable))
    gradients,_ = tf.clip_by_global_norm(gradients,1.25) # deal with exploding gradient problem
    
    optimizer = optimizer.apply_gradients(zip(gradients,v),global_step=global_step)
    
    
    # predictions
    train_prediction = tf.nn.softmax(logits)
    
    #sampling and validating eval..no unrolling since not back prop
    
    sample_input  = tf.placeholder(shape=[1,vocabulary_size],dtype=tf.float32)
    saved_sample_output = tf.Variable(tf.zeros([1,num_nodes]))
    saved_sample_state =  tf.Variable(tf.zeros([1,num_nodes]))
    
    reset_sample_state=tf.group(saved_sample_output.assign(tf.zeros([1,num_nodes])),saved_sample_state.assign(tf.zeros([1,num_nodes])))
    sample_output,sample_state = lstm_cell(sample_input,saved_sample_output,saved_sample_state)
    
    with tf.control_dependencies([saved_sample_output.assign(sample_output),saved_sample_state.assign(sample_state)]):
        sample_prediction  = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b)) # <- This guy is the fk'n classifier ..!! ding dong
        
        
    

 <b>Refer to this diagram for above implementation details of lstm_cell [Forget,Input,Update(Memory),Output &lt;- Gates in this order]</b><br><img src='lstm.png' width=600px  height=400px/></br>
<br>
 <p style="border-style:1px double red;border-radius: 10px;border-width: medium;" > 
 &nbsp;&nbsp;<span style="color:green"><b> def </b></span> <span style="color:blue">lstm_cell</span>(i,o,state):<br>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; input_gate = tf.sigmoid(tf.matmul(i,ix) + tf.matmul(o,im) + ib) <br>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; forget_gate = tf.sigmoid(tf.matmul(i,fx)+tf.matmul(o,fm) + fb) <br>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; update = tf.matmul(i,cx)+tf.matmul(o,cm) + cb <br>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; state = forget_gate * state + input_gate * tf.tanh(update) <br>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; output_gate = tf.sigmoid(tf.matmul(i,ox)+tf.matmul(o,om)+ob) <br>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; return output_gate * tf.tanh(state), state <br>
 </p>

In [67]:
num_steps = 7001
summary_frequency= 100

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Global vars initialized')
    mean_loss = 0
    
    for step in range(num_steps):
        batches = train_batches.next()
        feed_dict = {}
        for i in range(num_unrollings+1):
            feed_dict[train_data[i]] = batches[i]
        _,l,predictions,lr = session.run([optimizer,loss,train_prediction,learning_rate],feed_dict=feed_dict)
        mean_loss += 1
        if step%summary_frequency==0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss=0
            labels = np.concatenate(list(batches)[1:])
            print('Minibatch perplexity : %.2f' %float( np.exp(logprob(predictions,labels))))
        if step % (summary_frequency * 10) ==0:
            # generate samples
            print('=' * 80)
            for _ in range(5):
                sentence = characters(feed)[0]
                feed = sample(random_distribution())
                sentence += characters(feed)[0]
                reset_sample_state.run()
                for _ in range(80):
                    prediction = sample_prediction.eval({sample_input:feed})
                    feed = sample(prediction)
                    sentence += characters(feed)[0]
                print(sentence)    
            print('=' * 80)
            #measure validation set perplexity
            
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                b = valid_batches.next()
                predictions = sample_prediction.eval({sample_input: b[0]})
                valid_logprob = valid_logprob + logprob(predictions, b[1])
                
            print('Validation set perplexity: %.2f' % float(np.exp(valid_logprob / valid_size)))
    

Global vars initialized
Average loss at step 0: 1.000000 learning rate: 10.000000
Minibatch perplexity : 27.02
  sb s  sbcq  prg i w h vldumiehr ewuhchncepehqsnu fvmye r kei phityylza k bhnsmav
vsq ezogbtdanku p o pegmihk  nbaa zddev  cieuy  hpfeuy  key  esr ir ayo xl tvxsmlt
tueml qmn cnitcx aihoez dfamqsw jbeig   yajzop fiqzzexwi l q tttyhv  sesozvbge vcr
rih vcty bkz s  u rfvowdn kedvwuoa oezs uxblc     dntiant  vet y zvlrse atuhpsxozy
yystxtucyqcmrfxheo  ot l ersl lxexs rhxnweeqrmeterl m osjai   c  rtwy wra dvsu iug
Validation set perplexity: 20.03
Average loss at step 100: 1.000000 learning rate: 10.000000
Minibatch perplexity : 11.25
Average loss at step 200: 1.000000 learning rate: 10.000000
Minibatch perplexity : 9.27
Average loss at step 300: 1.000000 learning rate: 10.000000
Minibatch perplexity : 7.53
Average loss at step 400: 1.000000 learning rate: 10.000000
Minibatch perplexity : 8.54
Average loss at step 500: 1.000000 learning rate: 10.000000
Minibatch perplexity : 7.26


Average loss at step 5100: 1.000000 learning rate: 1.000000
Minibatch perplexity : 4.90
Average loss at step 5200: 1.000000 learning rate: 1.000000
Minibatch perplexity : 5.40
Average loss at step 5300: 1.000000 learning rate: 1.000000
Minibatch perplexity : 5.11
Average loss at step 5400: 1.000000 learning rate: 1.000000
Minibatch perplexity : 4.64
Average loss at step 5500: 1.000000 learning rate: 1.000000
Minibatch perplexity : 4.79
Average loss at step 5600: 1.000000 learning rate: 1.000000
Minibatch perplexity : 4.73
Average loss at step 5700: 1.000000 learning rate: 1.000000
Minibatch perplexity : 6.01
Average loss at step 5800: 1.000000 learning rate: 1.000000
Minibatch perplexity : 5.06
Average loss at step 5900: 1.000000 learning rate: 1.000000
Minibatch perplexity : 5.46
Average loss at step 6000: 1.000000 learning rate: 1.000000
Minibatch perplexity : 5.28
ty lyy amecon in unrilabsedry in doets hypehia cak ordicio the ocplan acrop demmie
eentileel enicungible riebetar the we