In [2]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)

def read_data(filename):
  f = zipfile.ZipFile(filename)
  for name in f.namelist():
    return tf.compat.as_str(f.read(name))
  f.close()
  
text = read_data(filename)
print('Data size %d' % len(text))

valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:1024])
print(valid_size, valid_text[:100])

vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
  if char in string.ascii_lowercase:
    return ord(char) - first_letter + 1
  elif char == ' ':
    return 0
  else:
    print('Unexpected character: %s' % char)
    return 0
  
def id2char(dictid):
  if dictid > 0:
    return chr(dictid + first_letter - 1)
  else:
    return ' '

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))

class BatchGenerator(object):
  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size
    self._cursor = [ offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
    #print 'batch idx %i' % 
    for b in range(self._batch_size):
      batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
      batches.append(self._next_batch())
    self._last_batch = batches[-1]
    return batches

def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (mostl likely) character representation."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
  """Convert a sequence of batches back into their (most likely) string
  representation."""
  s = [''] * batches[0].shape[0]
  for b in batches:
    s = [''.join(x) for x in zip(s, characters(b))]
  return s

train_batches = BatchGenerator(train_text, 128, 54)
valid_batches = BatchGenerator(valid_text, 1, 1)
print train_batches._cursor

batch = train_batches.next()
print batches2string([batch[0]])
print(batches2string(batch))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))

def logprob(predictions, labels):
  """Log-probability of the true labels in a predicted batch."""
  predictions[predictions < 1e-10] = 1e-10
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
  """Sample one element from a distribution assumed to be an array of normalized
  probabilities.
  """
  r = random.uniform(0, 1)
  s = 0
  for i in range(len(distribution)):
    s += distribution[i]
    if s >= r:
      return i
  return len(distribution) - 1

def sample(prediction, size=vocabulary_size):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, size], dtype=np.float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution(size=vocabulary_size):
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, size])
  return b/np.sum(b, 1)[:,None]

Found and verified text8.zip
Data size 100000000
(99999000, 'ons anarchists advocate social relations based upon voluntary association of autonomous individuals mutual aid and self governance while anarchism is most easily defined by what it is against anarchists also offer positive visions of what they believe to be a truly free society however ideas about how an anarchist society might work vary considerably especially with respect to economics there is also disagreement about how a free society might be brought about origins and predecessors kropotkin and others argue that before recorded history human society was organized on anarchist principles most anthropologists follow kropotkin and engels in believing that hunter gatherer bands were egalitarian and lacked division of labour accumulated wealth or decreed law and had equal access to resources william godwin anarchists including the the anarchy organisation and rothbard find anarchist attitudes in taoism from ancient china kropo

Two layers character bases LSTM

In [20]:
def create_lstm_graph_bm(num_nodes1, num_nodes2, num_unrollings, batch_size):
    with tf.Graph().as_default() as g:
        # cell variables - input, memory, biases        
        lstm1_x = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes1*4], -0.1, 0.1), name='lstm1_x')
        lstm1_m = tf.Variable(tf.truncated_normal([num_nodes1, num_nodes1*4], -0.1, 0.1), name='lstm1_m')
        lstm1_b = tf.Variable(tf.zeros([1, num_nodes1*4]), name='lstm1_b')
        lstm2_x = tf.Variable(tf.truncated_normal([num_nodes1, num_nodes2*4], -0.1, 0.1), name='lstm2_x')
        lstm2_m = tf.Variable(tf.truncated_normal([num_nodes2, num_nodes2*4], -0.1, 0.1), name='lstm2_m')
        lstm2_b = tf.Variable(tf.zeros([1, num_nodes2*4]), name='lstm2_b')
        # Variables saving state across unrollings.
        lstm1_saved_output = tf.Variable(tf.zeros([batch_size, num_nodes1]), trainable=False)
        lstm2_saved_output = tf.Variable(tf.zeros([batch_size, num_nodes2]), trainable=False)
        lstm1_saved_state = tf.Variable(tf.zeros([batch_size, num_nodes1]), trainable=False)
        lstm2_saved_state = tf.Variable(tf.zeros([batch_size, num_nodes2]), trainable=False)
        # Classifier weights and biases.
        w = tf.Variable(tf.truncated_normal([num_nodes2, vocabulary_size], -0.1, 0.1))
        b = tf.Variable(tf.zeros([vocabulary_size]))

        # Definition of the cell computation.
        def lstm_cell(i, o, state, x, m, b, num_nodes):                                    
            #multiply in one operation then split matrix between gates            
            mult = tf.matmul(i, x) + tf.matmul(o, m) + b
            input_gate = tf.sigmoid(mult[:,:num_nodes])
            forget_gate = tf.sigmoid(mult[:,num_nodes:num_nodes*2])
            update = mult[:,num_nodes*3:num_nodes*4]
            state = forget_gate * state + input_gate * tf.tanh(update)
            output_gate = tf.sigmoid(mult[:,num_nodes*3:])
            return output_gate * tf.tanh(state), state

        # Input data. [num_unrollings, batch_size, vocabulary_size]
        tf_train_data = tf.placeholder(tf.float32, shape=[None, None, vocabulary_size], name='tf_train_data')
        train_data = list()
        for i in tf.split(0, num_unrollings + 1, tf_train_data):
            train_data.append(tf.squeeze(i))
        train_inputs = train_data[:num_unrollings]
        train_labels = train_data[1:]  # labels are inputs shifted by one time step.

        # Unrolled LSTM loop.
        outputs = list()
        lstm1_output = lstm1_saved_output
        lstm2_output = lstm2_saved_output
        lstm1_state = lstm1_saved_state
        lstm2_state = lstm2_saved_state
        #python loop used: tensorflow does not support sequential operations yet
        for i in train_inputs: # having a loop simulates having time
            lstm1_output, lstm1_state = lstm_cell(i, lstm1_output, lstm1_state, lstm1_x, lstm1_m, lstm1_b, 
                                                 num_nodes1)
            lstm2_output, lstm2_state = lstm_cell(lstm1_output, lstm2_output, lstm2_state, lstm2_x, lstm2_m,
                                                  lstm2_b, num_nodes2)
            outputs.append(lstm2_output)

        # State saving across unrollings, control_dependencies makes sure that output and state are computed
        with tf.control_dependencies([lstm1_saved_output.assign(lstm1_output), lstm1_saved_state.assign(lstm1_state), 
                                     lstm2_saved_output.assign(lstm2_output), lstm2_saved_state.assign(lstm2_state)]):
            # Classifier.
            logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
            loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf.concat(0, train_labels)),
                                 name='loss')

        # Optimizer.
        global_step = tf.Variable(0, name='global_step')
        learning_rate = tf.train.exponential_decay(10.0, global_step, 5000, 0.1, staircase=True, name='learning_rate')
        optimizer = tf.train.GradientDescentOptimizer(learning_rate, name='optimizer')
        print optimizer
        gradients, v = zip(*optimizer.compute_gradients(loss))
        gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
        optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

        # Predictions.
        train_prediction = tf.nn.softmax(logits, name='train_prediction')

        # Sampling and validation eval: batch 1, no unrolling.
        sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size], name='sample_input')
        lstm1_saved_sample_output = tf.Variable(tf.zeros([1, num_nodes1]))
        lstm2_saved_sample_output = tf.Variable(tf.zeros([1, num_nodes2]))
        lstm1_saved_sample_state = tf.Variable(tf.zeros([1, num_nodes1]))
        lstm2_saved_sample_state = tf.Variable(tf.zeros([1, num_nodes2]))
        reset_sample_state = tf.group( lstm1_saved_sample_output.assign(tf.zeros([1, num_nodes1])), 
                                      lstm2_saved_sample_output.assign(tf.zeros([1, num_nodes2])),
                                      lstm1_saved_sample_state.assign(tf.zeros([1, num_nodes1])), 
                                      lstm2_saved_sample_state.assign(tf.zeros([1, num_nodes2])),
                                      name='reset_sample_state')
        lstm1_sample_output, lstm1_sample_state = lstm_cell(sample_input, lstm1_saved_sample_output,
                                                             lstm1_saved_sample_state, lstm1_x, lstm1_m, lstm1_b,
                                                            num_nodes1)
        lstm2_sample_output, lstm2_sample_state = lstm_cell(lstm1_sample_output, lstm2_saved_sample_output,
                                                             lstm2_saved_sample_state, lstm2_x, lstm2_m, lstm2_b,
                                                             num_nodes2)
        
        with tf.control_dependencies([lstm1_saved_sample_output.assign(lstm1_sample_output),
                                      lstm2_saved_sample_output.assign(lstm2_sample_output), 
                                     lstm1_saved_sample_state.assign(lstm1_sample_state),
                                      lstm2_saved_sample_state.assign(lstm2_sample_state)]):
            sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(lstm2_sample_output, w, b), name='sample_prediction')

        return g

#test graph
create_lstm_graph_bm(128, 64, 10, 32)

<tensorflow.python.training.gradient_descent.GradientDescentOptimizer object at 0x7efe6ed77190>


<tensorflow.python.framework.ops.Graph at 0x7efe6f603350>

In [21]:
def train(g, num_steps, summary_frequency, num_unrollings, batch_size):
    #initalize batch generators
    train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
    valid_batches = BatchGenerator(valid_text, 1, 1)
    optimizer = g.get_tensor_by_name('optimizer:0')
    #print optimizer
    loss = g.get_tensor_by_name('loss:0')
    train_prediction = g.get_tensor_by_name('train_prediction:0')
    learning_rate = g.get_tensor_by_name('learning_rate:0')
    tf_train_data = g.get_tensor_by_name('tf_train_data:0')
    sample_prediction = g.get_tensor_by_name('sample_prediction:0')
    reset_sample_state = g.get_operation_by_name('reset_sample_state')
    sample_input = g.get_tensor_by_name('sample_input:0')
    with tf.Session(graph=g) as session:
        tf.initialize_all_variables().run()
        print('Initialized')    
        mean_loss = 0
        for step in range(num_steps):
            batches = train_batches.next()            
            _, l, predictions, lr = session.run([optimizer, loss, train_prediction, learning_rate], 
                                                feed_dict={ tf_train_data: batches})
            mean_loss += 1
            if step % summary_frequency == 0:
                if step > 0:
                    mean_loss = mean_loss / summary_frequency
                # The mean loss is an estimate of the loss over the last few batches.
                #print mean_loss
                #print type(mean_loss)
                print 'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr)
                mean_loss = 0
                labels = np.concatenate(list(batches)[1:])
                print('Minibatch perplexity: %.2f' % float(np.exp(logprob(predictions, labels))))
                if step % (summary_frequency * 10) == 0:
                    # Generate some samples.
                    print('=' * 80)
                    for _ in range(5):
                        feed = sample(random_distribution())
                        sentence = characters(feed)[0]
                        reset_sample_state.run()
                        for _ in range(79):
                            prediction = sample_prediction.eval({sample_input: feed})
                            feed = sample(prediction)
                            sentence += characters(feed)[0]
                        print(sentence)
                    print('=' * 80)
                # Measure validation set perplexity.
                reset_sample_state.run()
                valid_logprob = 0
                for _ in range(valid_size):
                    b = valid_batches.next()
                    predictions = sample_prediction.eval({sample_input: b[0]})
                    valid_logprob = valid_logprob + logprob(predictions, b[1])
                print('Validation set perplexity: %.2f' % float(np.exp(
                    valid_logprob / valid_size)))

In [22]:
g = create_lstm_graph_bm(128, 64, 10, 32)
train(g, 70001, 100, 10, 32)

<tensorflow.python.training.gradient_descent.GradientDescentOptimizer object at 0x7efe6d62f410>
Initialized
Average loss at step 0: 1.000000 learning rate: 10.000000
Minibatch perplexity: 27.04
t lncadi oiml zrag b   tg fxg fau ar koeehcnwnni  sajiarunvgpu   isqm  elwgno bc
zrp nrltgbaa ldzexslageeydpc  vrok x il osiwwtidpegp er tx moi n  exn t pihttp  
eadliavmtrza m kor   ik t ec yznaneoffs ibuoky zfvinm whoo img  o ntm ere ssomeg
qil ge e upd tv hini ltvs gcllg pilf hix selodysdieslarl blxai  idyt ess ktnv i 
ekzn isb ynn  xlaqfn gonjoaubdhmp o it  lcls wngey  shbld uib xonfngoc vtcie etc
Validation set perplexity: 20.94
Average loss at step 100: 1.000000 learning rate: 10.000000
Minibatch perplexity: 14.18
Validation set perplexity: 14.49
Average loss at step 200: 1.000000 learning rate: 10.000000
Minibatch perplexity: 10.27
Validation set perplexity: 11.41
Average loss at step 300: 1.000000 learning rate: 10.000000
Minibatch perplexity: 10.35
Validation set perplexity: 10.14
Avera

KeyboardInterrupt: 