Restart the kernel before running this notebook a second time.

In [1]:
import codecs
import collections
import os

import numpy as np
import tensorflow as tf
from six.moves import cPickle
from tensorflow.contrib import legacy_seq2seq
from tensorflow.contrib import rnn

# TRAINING

#### ARGUMENTS

In [2]:
# Choose an input data set; its files are read from data/text/<dataSet>/
# Available data sets are: politics, art, daily, charity
dataSet = "politics"
dataDirectory = "data/text/" + dataSet + "/"
# Length of each training sequence and number of sequences per batch
sequenceLength = 50
batchSize = 50
# Set file paths. dataDirectory already ends with "/", so the file names are
# appended without a leading separator (avoids paths such as ".../politics//input.txt")
textFile = dataDirectory + "input.txt"
vocabularyFile = dataDirectory + "vocab.pkl"
tensorFile = dataDirectory + "data.npy"

In [3]:
# Load the whole training text into memory as a single unicode string
with codecs.open(textFile, mode="r", encoding="utf-8") as corpusFile:
    data = corpusFile.read()

## Encoding

In [4]:
# Encode the most common characters with the smallest integers, e.g.
# collections.Counter(['a', 'b', 'c', 'a', 'b', 'b'])
# -> Counter({'b': 3, 'a': 2, 'c': 1})
if not data:
    raise ValueError("The input text is empty; nothing to encode.")
counter = collections.Counter(data)
# (character, count) pairs in descending order of frequency; characters with
# equal counts keep the Counter's own iteration order
countPairs = counter.most_common()
# Keep only the characters, still sorted by frequency
chars, _ = zip(*countPairs)
# vocabMapper stores characters as keys and their integer encodings as values.
# The most frequent character gets the smallest integer.
vocabMapper = {char: index for index, char in enumerate(chars)}
# Write the characters into vocabularyFile
with open(vocabularyFile, 'wb') as f:
    cPickle.dump(chars, f)
# Encode the data according to vocabMapper and save it as a NumPy array
tensor = np.array([vocabMapper[c] for c in data])
np.save(tensorFile, tensor)
# Create batches: only whole batches are kept, the trailing remainder is dropped
batchCount = tensor.size // (batchSize * sequenceLength)
if batchCount == 0:
    # np.split cannot split into zero sections, so fail early with a clear message
    raise ValueError("Not enough data for a single batch. "
                     "Make sequenceLength and/or batchSize smaller.")
tensor = tensor[:batchCount * batchSize * sequenceLength]
xData = tensor
# Targets are the inputs shifted left by one character; the last target wraps
# around to the first input character
yData = np.copy(tensor)
yData[:-1] = xData[1:]
yData[-1] = xData[0]
# Reshape to batchSize rows, then cut along axis 1 into batchCount pieces,
# each of shape (batchSize, sequenceLength)
xBatches = np.split(xData.reshape(batchSize, -1), batchCount, 1)
yBatches = np.split(yData.reshape(batchSize, -1), batchCount, 1)

#### ARGUMENTS 

In [5]:
# Network size: number of stacked RNN layers and hidden units per layer
layersCount, RNNSize = 2, 128
# One vocabulary entry per distinct character found during encoding
vocabSize = len(chars)
# Directory the model is saved to, so the text generation part can load it
saveDirectory = "data/model"

## Model Creation

In [6]:
# Start from an empty default graph so this cell can be re-run in the same kernel;
# otherwise tf.get_variable raises because the variables below already exist
tf.reset_default_graph()
# Make sure the save directory exists before writing into it
if not os.path.isdir(saveDirectory):
    os.makedirs(saveDirectory)
# Save chars and vocabMapper that were created for encoding and decoding
charsVocabFile = saveDirectory + "/chars_vocab.pkl"
with open(charsVocabFile, 'wb') as f:
    cPickle.dump((chars, vocabMapper), f)
# Set the cell type (an LSTM cell here), using the public tf.contrib.rnn alias
cellFunction = rnn.BasicLSTMCell
# Create one RNN cell per layer and stack them into a single multi-layer cell
cells = [cellFunction(RNNSize) for _ in range(layersCount)]
cell = rnn.MultiRNNCell(cells, state_is_tuple=True)
# Set placeholders for input and target character ids
inputData = tf.placeholder(tf.int32, [batchSize, sequenceLength])
targets = tf.placeholder(tf.int32, [batchSize, sequenceLength])
# Set initial state to all zeros
stateInitial = cell.zero_state(batchSize, tf.float32)
# Output projection variables live in the 'rnnlm' variable scope
with tf.variable_scope('rnnlm'):
    weights = tf.get_variable("weights", [RNNSize, vocabSize])
    biases = tf.get_variable("biases", [vocabSize])
# NOTE(review): embedding is created outside the 'rnnlm' scope; it is left there
# on purpose so the variable names stored in checkpoints stay unchanged
embedding = tf.get_variable("embedding", [vocabSize, RNNSize])
# Look up an embedding vector for every input character, then turn the
# [batchSize, sequenceLength, RNNSize] tensor into a list of sequenceLength
# tensors of shape [batchSize, RNNSize] (one per time step)
inputs = tf.nn.embedding_lookup(embedding, inputData)
inputs = tf.split(inputs, sequenceLength, 1)
inputs = [tf.squeeze(input_, [1]) for input_ in inputs]
outputs, lastState = legacy_seq2seq.rnn_decoder(inputs, stateInitial, cell, loop_function=None, scope='rnnlm')
# Flatten the per-step outputs to [batchSize * sequenceLength, RNNSize] and
# project them onto the vocabulary
output = tf.reshape(tf.concat(outputs, 1), [-1, RNNSize])
logits = tf.matmul(output, weights) + biases
probabilities = tf.nn.softmax(logits)
# Define a cost function: per-character loss with every position weighted 1,
# averaged over all characters in the batch
loss = legacy_seq2seq.sequence_loss_by_example([logits], [tf.reshape(targets, [-1])],
                                               [tf.ones([batchSize * sequenceLength])])
with tf.name_scope('cost'):
    cost = tf.reduce_sum(loss) / batchSize / sequenceLength
finalState = lastState
# Learning rate is a non-trainable variable so it can be reassigned during training
lr = tf.Variable(0.0, trainable=False)
trainables = tf.trainable_variables()
# Clip the gradients by their global norm (max 5.0)
gradients, _ = tf.clip_by_global_norm(tf.gradients(cost, trainables), 5.0)
# Define the optimizer, AdamOptimizer in this case
with tf.name_scope('optimizer'):
    optimizer = tf.train.AdamOptimizer(lr)
# Define the train operation
trainOperation = optimizer.apply_gradients(zip(gradients, trainables))

#### ARGUMENTS

In [7]:
# Training arguments, set before the session is run:
# the base learning rate and the number of passes over the batches
learningRate = 0.002
epochCount = 5

## Session

In [8]:
# Run the session to train the model
# After training is done, save the model for the text generation part
decayRate = 0.97  # the learning rate is multiplied by this factor after every epoch
# Build the learning-rate update once and feed the new value each epoch;
# calling tf.assign inside the loop would add a new op to the graph every epoch
lrValue = tf.placeholder(tf.float32, shape=[])
lrUpdate = tf.assign(lr, lrValue)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables())
    # Index of the most recent training step; stays 0 if no step is executed,
    # so saving below never depends on leaked loop variables
    step = 0
    for e in range(epochCount):
        sess.run(lrUpdate, {lrValue: learningRate * (decayRate ** e)})
        # Every epoch starts from the all-zero RNN state
        state = sess.run(stateInitial)
        for b in range(batchCount):
            step = e * batchCount + b
            feed = {inputData: xBatches[b], targets: yBatches[b]}
            # Carry the final state of the previous batch over as the initial
            # state of this batch
            for i, (c, h) in enumerate(stateInitial):
                feed[c] = state[i].c
                feed[h] = state[i].h
            trainLoss, state, _ = sess.run([cost, finalState, trainOperation], feed)
            print("{}/{} - Loss : {:.5f}"
                  .format(step, epochCount * batchCount, trainLoss))
    checkPointPath = saveDirectory + '/model.ckpt'
    saver.save(sess, checkPointPath, global_step=step)
    print("model saved to {}".format(checkPointPath))

0/2055 - Loss : 3.96572
1/2055 - Loss : 3.92781
2/2055 - Loss : 3.82444
3/2055 - Loss : 3.58032
4/2055 - Loss : 3.45218
5/2055 - Loss : 3.32520
6/2055 - Loss : 3.28715
7/2055 - Loss : 3.26008
8/2055 - Loss : 3.21284
9/2055 - Loss : 3.19910
10/2055 - Loss : 3.17402
11/2055 - Loss : 3.15935
12/2055 - Loss : 3.18700
13/2055 - Loss : 3.16065
14/2055 - Loss : 3.12547
15/2055 - Loss : 3.17549
16/2055 - Loss : 3.14141
17/2055 - Loss : 3.13615
18/2055 - Loss : 3.14040
19/2055 - Loss : 3.12984
20/2055 - Loss : 3.14347
21/2055 - Loss : 3.16231
22/2055 - Loss : 3.14083
23/2055 - Loss : 3.16093
24/2055 - Loss : 3.13362
25/2055 - Loss : 3.11816
26/2055 - Loss : 3.10279
27/2055 - Loss : 3.10053
28/2055 - Loss : 3.13837
29/2055 - Loss : 3.11414
30/2055 - Loss : 3.10984
31/2055 - Loss : 3.10752
32/2055 - Loss : 3.13675
33/2055 - Loss : 3.10957
34/2055 - Loss : 3.13557
35/2055 - Loss : 3.10604
36/2055 - Loss : 3.11513
37/2055 - Loss : 3.10724
38/2055 - Loss : 3.11633
39/2055 - Loss : 3.11810
40/2055 - 

### Move on to "generateText.ipynb" to generate text