Data set: [wikidata](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/)

In [1]:
# import libs 
import tensorflow as tf 

# helper libs 
import numpy as np # matrix maths 
import random # for randomness 

In [2]:
# load dataset 

text = open('./wikitext-2/wiki.train.tokens', 'rb').read().decode('utf-8')
print("Lenght of text".format(len(text)))

# print first 1000 chars of test and train 
print("Text :", text[:1000])

Lenght of text
Text :  
 = Valkyria Chronicles III = 
 
 Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . <unk> the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . 
 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more <unk> for s

In [3]:
# list all the unique chars for the list 
chars = sorted(list(set(text)))
chars_size = len(chars)

# print lenght 
print("Lenght of chars", chars_size)
print(chars)

Lenght of chars 283
['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '~', '¡', '£', '¥', '§', '°', '±', '²', '³', 'µ', '·', '½', 'Á', 'Å', 'Æ', 'É', 'Í', 'Î', 'Ö', '×', 'Ø', 'Ú', 'Ü', 'Þ', 'à', 'á', 'â', 'ã', 'ä', 'å', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ñ', 'ò', 'ó', 'ô', 'ö', 'ø', 'ú', 'û', 'ü', 'Ā', 'ā', 'ă', 'ć', 'č', 'Đ', 'đ', 'ė', 'ī', 'Ł', 'ł', 'ń', 'Ō', 'ō', 'ś', 'ş', 'š', 'ū', 'ų', 'Ż', 'ž', 'ơ', 'ư', 'ʻ', 'ʿ', '̃', 'α', 'β', 'γ', 'κ', 'μ', 'С', 'а', 'в', 'е', 'к', 'о', 'с', 'т', 'я', 'ا', 'ح', 'ص', 'ل', 'ن', 'ه', '्', 'ก', 'ง', 'ณ', 'ต', 'ม', 'ย', 'ร', 'ล', 'ั', 'า

In [4]:
# chars to id mappind 
chars2id = dict((c, i) for i, c in enumerate(chars))
id2chars = dict((i, c) for i, c in enumerate(chars))

In [5]:
# def a helper function to generate probablities of next chars 
def sample(prediction):
    r = random.uniform(0,1)
    s = 0
    char_id = len(prediction)-1
    
    for i in range(len(prediction)):
        s += prediction[i]
        
        if s >= r:
            char_id = i
            break
    char_one_hot = np.zeros(shape=[chars_size])
    char_one_hot[char_id] = 1.0
    
    return char_one_hot

In [6]:
# vectorize our data 
len_per_section = 50
skip = 50
sections = []
next_chars = []

for i in range(0, 10000*100, skip):
    sections.append(text[i: i + len_per_section])
    next_chars.append(text[i + len_per_section])
    

# vectorize our chars 
X = np.zeros((len(sections), len_per_section, chars_size))
y = np.zeros((len(sections), chars_size))

for i, section in enumerate(sections):
    for j, char in enumerate(section):
        X[i, j, chars2id[char]] = 1
    y[i, chars2id[next_chars[i]]] = 1
print("X :", X)
print("y:", y)

X : [[[0. 1. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  ...
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]
y: [[0

In [7]:
# Deep Learning Part 
batch_size = 512
num_epochs = 1000
log_step = 100
save_model = 200
hidden_nodes = 1024
# start string for the test 
test_start = 'I am thinking'

checkpoint_dir = "ckpt"
# check if our model checkpoint exits
if tf.gfile.Exists(checkpoint_dir):
    tf.gfile.DeleteRecursively(checkpoint_dir)
tf.gfile.MakeDirs(checkpoint_dir)

print("Training data size ", len(X))
print("Approx steps per training epoch", len(X) // batch_size )

Training data size  20000
Approx steps per training epoch 39


In [25]:
# Start building our model 
graph = tf.Graph()

with graph.as_default():
    global_step = tf.Variable(0)

    # placeholder 
    data = tf.placeholder(tf.float32, [batch_size, len_per_section, chars_size])
    labels = tf.placeholder(tf.float32, [batch_size, chars_size])

    # defining all the gates 
    with tf.name_scope("input_gate"):
        w_ii = tf.Variable(tf.truncated_normal([chars_size, hidden_nodes], stddev=0.1), name="Input_Weights")
        w_io = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], stddev=0.1), name="Output_Weights")
        b_i = tf.Variable(tf.truncated_normal([1, hidden_nodes], stddev=0.1), name="Input_Biases")

    with tf.name_scope("forget_gate"):
        w_fi = tf.Variable(tf.truncated_normal([chars_size, hidden_nodes], stddev=0.1), name="Input_Weights")
        w_fo = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], stddev=0.1), name="Output_Weights")
        b_f = tf.Variable(tf.truncated_normal([1, hidden_nodes], stddev=0.1), name="Forget_Biases")

    with tf.name_scope("output_gate"):
        w_oi = tf.Variable(tf.truncated_normal([chars_size, hidden_nodes], stddev=0.1), name="Input_Weights")
        w_oo = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], stddev=0.1), name="Output_Weights")
        b_o = tf.Variable(tf.truncated_normal([1, hidden_nodes], stddev=0.1), name="Output_Biases")

    with tf.name_scope("memory_gate"):
        w_ci = tf.Variable(tf.truncated_normal([chars_size, hidden_nodes], stddev=0.1), name="Input_Weights")
        w_co = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], stddev=0.1), name="Output_Weights")
        b_c = tf.Variable(tf.truncated_normal([1, hidden_nodes], stddev=0.1), name="Memory_Biases")
        
    # helper function for the lstm cell 
    def lstm(i, o, state):
        input_gate = tf.sigmoid(tf.matmul(i, w_ii) + tf.matmul(o, w_io) + b_i)

        forget_gate = tf.sigmoid(tf.matmul(i, w_fi) + tf.matmul(o, w_fo) + b_f)

        output_gate = tf.sigmoid(tf.matmul(i, w_oi) + tf.matmul(o, w_oo) + b_o)

        memory_cell = tf.sigmoid(tf.matmul(i, w_ci) + tf.matmul(o, w_co) + b_c)


        state = forget_gate * state + memory_cell + input_gate

        output = output_gate * tf.tanh(state)

        return output, state
    
    output = tf.zeros([batch_size, hidden_nodes])
    state = tf.zeros([batch_size, hidden_nodes])
    
#     # unroll lstm layer 
#     for i in range(len_per_section):
#         output, state = lstm(data[:, i, :], output, state)
        
#         if i == 0:
#             output_all_i = output
#             labels_all_i = data[:, i+1, :]
#         output = tf.zeros([batch_size, hidden_nodes])
#     state = tf.zeros([batch_size, hidden_nodes])

    #unrolled LSTM loop
    #for each input set
    for i in range(len_per_section):
        #calculate state and output from LSTM
        output, state = lstm(data[:, i, :], output, state)
        #to start, 
        if i == 0:
            #store initial output and labels
            outputs_all_i = output
            labels_all_i = data[:, i+1, :]
        #for each new set, concat outputs and labels
        elif i != len_per_section - 1:
            #concatenates (combines) vectors along a dimension axis, not multiply
            outputs_all_i = tf.concat(axis=0, values=[outputs_all_i, output])
            labels_all_i = tf.concat(axis=0, values=[labels_all_i, data[:, i+1, :]])
        else:
            #final store
            outputs_all_i = tf.concat(axis=0, values=[outputs_all_i, output])
            labels_all_i = tf.concat(axis=0, values=[labels_all_i, labels])
            
    #Classifier
    w = tf.Variable(tf.truncated_normal([hidden_nodes, chars_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([chars_size]))
    
    logits = tf.matmul(outputs_all_i, w) + b
    
    
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels_all_i))

    #Optimizer
    #minimize loss with graident descent, learning rate 10,  keep track of batches
    optimizer = tf.train.GradientDescentOptimizer(10.).minimize(loss, global_step=global_step)

In [None]:
with tf.Session(graph=graph) as sess:
    #standard init step
    tf.global_variables_initializer().run()
    offset = 0
    saver = tf.train.Saver()
    
    #for each training step
    for step in range(num_epochs):
        
        #starts off as 0
        offset = offset % len(X)
        
        #calculate batch data and labels to feed model iteratively
        if offset <= (len(X) - batch_size):
            #first part
            batch_data = X[offset: offset + batch_size]
            batch_labels = y[offset: offset + batch_size]
            offset += batch_size
        #until when offset  = batch size, then we 
        else:
            #last part
            to_add = batch_size - (len(X) - offset)
            batch_data = np.concatenate((X[offset: len(X)], X[0: to_add]))
            batch_labels = np.concatenate((y[offset: len(X)], y[0: to_add]))
            offset = to_add
        
        #optimize!!
        _, training_loss = sess.run([optimizer, loss], feed_dict={data: batch_data, labels: batch_labels})
        
        if step % 10 == 0:
            print('training loss at step %d: %.2f' % (step, training_loss))

training loss at step 0: 6.64
