In [1]:
# Import dependencies

import numpy as np # vectorization
import random # generating text
import tensorflow as tf # ML
import datetime # clock training time

In [2]:
# Import the book

# Download the corpus (or reuse the locally cached copy) and get the path to the file on disk.
book = tf.keras.utils.get_file('iceAndFire', 'https://raw.githubusercontent.com/nihitx/game-of-thrones-/master/gameofthrones.txt')

# Read the raw bytes ('rb' = read, binary) and decode them explicitly as UTF-8.
# In binary mode newlines and non-ASCII characters are not translated, so the text is
# loaded exactly as stored. The `with` block closes the file handle once it has been read.
with open(book, 'rb') as book_file:
    text = book_file.read().decode(encoding='utf-8')

print('\n Text length in number of characters: ', len(text))
print('\n First 1000 characters of the book: \n', text[:1000])


 Text length in number of characters:  5662324

 First 1000 characters of the book: 
 

“We should start back,” Gared urged as the woods began to grow dark around them. “The wildlings are dead.”

“Do the dead frighten you?” Ser Waymar Royce asked with just the hint of a smile.

Gared did not rise to the bait. He was an old man, past fifty, and he had seen the lordlings come and go. “Dead is dead,” he said. “We have no business with the dead.”

“Are they dead?” Royce asked softly. “What proof have we?”

“Will saw them,” Gared said. “If he says they are dead, that’s proof enough for me.”

Will had known they would drag him into the quarrel sooner or later. He wished it had been later rather than sooner. “My mother told me that dead men sing no songs,” he put in.

“My wet nurse said the same thing, Will,” Royce replied. “Never believe anything you hear at a woman’s tit. There are things to be learned even from the dead.” His voice echoed, too loud in the twilit forest.

“We have a long r

In [3]:
# Build the vocabulary: every distinct character that occurs in the book, in sorted order

chars = list(set(text))
chars.sort()

# Vocabulary size; also the width of every one-hot vector used later on
char_size = len(chars)

print('Number of characters: ', char_size)
print(chars)

Number of characters:  86
['\n', ' ', '!', '(', ')', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}', 'é', 'ê', '—', '‘', '’', '“', '”', '…']


In [4]:
# Build lookup tables between characters and their integer ids.
# A character's id is its position in the sorted `chars` list.

# character -> id
char2id = {c: i for i, c in enumerate(chars)}
# id -> character. enumerate() yields (index, character) pairs, so the index must be the key
# here; unpacking the pair the other way round silently produces a second character -> id map.
id2char = {i: c for i, c in enumerate(chars)}

In [5]:
# Draw the next character from the model's predicted probability distribution
def sample(prediction):
    """Randomly pick one character id according to `prediction` and one-hot encode it.

    The id is drawn by walking the cumulative sum of the probabilities until it reaches
    a uniform random number in [0, 1]. Likely characters are therefore chosen more often,
    but the result is a random draw, not simply the most probable character.

    Args:
        prediction: 1-D sequence of probabilities, one entry per character id.

    Returns:
        1-D numpy float array with the same length as `prediction`: all zeros except
        for a 1.0 at the sampled id (e.g. [0, 0, 1, 0]).

    Raises:
        ValueError: if `prediction` is empty.
    """
    if len(prediction) == 0:
        raise ValueError('prediction must contain at least one probability')

    r = random.uniform(0, 1)
    s = 0  # running cumulative probability
    # Default to the last id in case rounding leaves the cumulative sum just below r.
    char_id = len(prediction) - 1
    for i in range(len(prediction)):
        s += prediction[i]
        if s >= r:
            char_id = i
            break

    # One-hot encode the chosen id. The vector is sized from the prediction itself so it
    # always lines up with the distribution it was drawn from.
    char_one_hot = np.zeros(shape=[len(prediction)])
    char_one_hot[char_id] = 1.0
    return char_one_hot

In [6]:
# Vectorize our data to feed it into the model

len_per_section = 50  # number of characters in each training sequence ("section")
skip = 10  # stride between the starts of consecutive sections: a new section begins every
           # 10 characters, so neighbouring sections overlap by len_per_section - skip characters.
           # A larger stride produces fewer sections, which helps when there is a lot of text.
sections = []    # input sequences, each len_per_section characters long
next_chars = []  # the single character that follows each section (the training label)


for i in range(0, len(text) - len_per_section, skip):
    sections.append(text[i: i + len_per_section])
    next_chars.append(text[i + len_per_section])

# Vectorize input and output as one-hot arrays.
# The arrays are allocated as float32: that is the dtype of the placeholders they are fed
# into, and it needs half the memory of numpy's default float64. This matters because
# x holds len(sections) * len_per_section * char_size values.

# x[i, j, k] == 1 when the j-th character of section i has id k
x = np.zeros((len(sections), len_per_section, char_size), dtype=np.float32)
# y[i, k] == 1 when the character that follows section i has id k
y = np.zeros((len(sections), char_size), dtype=np.float32)

# Set the single "hot" entry for every input character and for every label
for i, section in enumerate(sections):
    for j, char in enumerate(section):
        x[i, j, char2id[char]] = 1
    y[i, char2id[next_chars[i]]] = 1
print(y)


[[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [7]:
# Training configuration

# Samples per optimisation step. A larger batch uses more memory.
batch_size = 512
# Total number of optimisation steps to run
max_steps = 70000
# Intended logging / checkpointing intervals, in steps
log_every = 100
save_every = 6000
# Width of the LSTM's output and cell state
hidden_nodes = 1024

# Seed text for generation
test_start = 'You know nothing Jon Snow.'

# Where model checkpoints are written
checkpoint_directory = 'ckpt'

# Always start from an empty checkpoint directory: wipe any previous run, then recreate it
if tf.gfile.Exists(checkpoint_directory):
    tf.gfile.DeleteRecursively(checkpoint_directory)
tf.gfile.MakeDirs(checkpoint_directory)

num_training_samples = len(x)
print('training data size: ', num_training_samples)
print('approximate steps per epoch: ', num_training_samples // batch_size)


training data size:  566228
approximate steps per epoch:  1105


In [8]:
# build our model
graph = tf.Graph()
with graph.as_default():

    # Step counter; optimizer.minimize() below increments it once per training step.
    global_step = tf.Variable(0)

    # One-hot input sequences: [batch, time step, character id]
    data = tf.placeholder(tf.float32, [batch_size, len_per_section, char_size])
    # One-hot id of the character that follows each input sequence: [batch, character id]
    labels = tf.placeholder(tf.float32, [batch_size, char_size])

    # LSTM parameters. Each of the four components (input gate, forget gate, output gate,
    # candidate memory) has its own input weights, previous-output weights and bias,
    # and is computed independently of the others.
    #
    # NOTE(review): the positional arguments after the shape in tf.truncated_normal are
    # (mean, stddev), so these weights are drawn with mean=-0.1 and stddev=0.1, not from
    # the range [-0.1, 0.1]. Left unchanged here so the initialisation stays as it was.

    # Input gate - weights for input, previous output and bias
    w_ii = tf.Variable(tf.truncated_normal([char_size, hidden_nodes], -0.1, 0.1))
    w_io = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_i = tf.Variable(tf.zeros([1, hidden_nodes]))

    # Forget gate
    w_fi = tf.Variable(tf.truncated_normal([char_size, hidden_nodes], -0.1, 0.1))
    w_fo = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_f = tf.Variable(tf.zeros([1, hidden_nodes]))

    # Output gate
    w_oi = tf.Variable(tf.truncated_normal([char_size, hidden_nodes], -0.1, 0.1))
    w_oo = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_o = tf.Variable(tf.zeros([1, hidden_nodes]))

    # Memory cell (candidate values for the cell state)
    w_ci = tf.Variable(tf.truncated_normal([char_size, hidden_nodes], -0.1, 0.1))
    w_co = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_c = tf.Variable(tf.zeros([1, hidden_nodes]))

    def lstm(i, o, state):
        """Run the LSTM cell for a single time step.

        Args:
            i: input for this time step, shape [batch, char_size].
            o: output of the previous time step, shape [batch, hidden_nodes].
            state: cell state after the previous time step, shape [batch, hidden_nodes].

        Returns:
            Tuple (output, state) for this time step, each of shape [batch, hidden_nodes].
        """
        # Every component has the same form:
        # (input * input weights) + (previous output * previous-output weights) + bias
        input_gate = tf.sigmoid(tf.matmul(i, w_ii) + tf.matmul(o, w_io) + b_i)
        forget_gate = tf.sigmoid(tf.matmul(i, w_fi) + tf.matmul(o, w_fo) + b_f)
        output_gate = tf.sigmoid(tf.matmul(i, w_oi) + tf.matmul(o, w_oo) + b_o)

        # NOTE(review): the textbook LSTM squashes the candidate values with tanh; this
        # cell uses sigmoid. Kept as written so the model definition does not change.
        memory_cell = tf.sigmoid(tf.matmul(i, w_ci) + tf.matmul(o, w_co) + b_c)

        # Only now are the components combined: keep part of the old state (forget gate)
        # and add part of the candidate values (input gate).
        state = forget_gate * state + input_gate * memory_cell

        # Squash the state with tanh and let the output gate decide how much is exposed.
        output = output_gate * tf.tanh(state)
        return output, state

    ###########
    #Operation
    ###########
    #LSTM
    # Output and state both start as zeros; the LSTM fills them in step by step.
    output = tf.zeros([batch_size, hidden_nodes])
    state = tf.zeros([batch_size, hidden_nodes])

    # Unrolled LSTM loop: one cell application per character position. The output of
    # every step is collected together with its target, so the classifier is trained on
    # all positions of the sequence and not only on the last one.
    step_outputs = []
    step_labels = []
    for i in range(len_per_section):
        output, state = lstm(data[:, i, :], output, state)
        step_outputs.append(output)
        # The target at step i is the next input character; after the final step it is
        # the separately supplied label (the character following the whole section).
        if i < len_per_section - 1:
            step_labels.append(data[:, i + 1, :])
        else:
            step_labels.append(labels)

    # Stack all time steps along the batch axis (concatenation, not multiplication).
    outputs_all_i = tf.concat(step_outputs, 0)
    labels_all_i = tf.concat(step_labels, 0)

    #Classifier
    # Linear layer that maps an LSTM output to one unnormalised score per character.
    w = tf.Variable(tf.truncated_normal([hidden_nodes, char_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([char_size]))
    # Logits are the raw, unscaled scores: they are not probabilities and need not sum
    # to 1. The softmax is applied inside the loss.
    logits = tf.matmul(outputs_all_i, w) + b

    # Softmax cross-entropy between the scores and the one-hot targets (multi-class
    # classification), averaged over every position of every sequence in the batch.
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = labels_all_i, logits = logits))

    #Optimizer
    # Plain gradient descent with learning rate 10; global_step is incremented per update.
    optimizer = tf.train.GradientDescentOptimizer(10.).minimize(loss, global_step=global_step)

    ###########
    #Test
    ###########
    # Sampling graph used by the text-generation cell: batch of one, one character per run.
    test_data = tf.placeholder(tf.float32, shape=[1, char_size])

    # The recurrent output/state live in variables so they persist from one
    # session.run() call to the next while characters are fed in one at a time.
    saved_test_output = tf.Variable(tf.zeros([1, hidden_nodes]), trainable=False)
    saved_test_state = tf.Variable(tf.zeros([1, hidden_nodes]), trainable=False)

    #Reset at the beginning of each test
    reset_test_state = tf.group(saved_test_output.assign(tf.zeros([1, hidden_nodes])),
                                saved_test_state.assign(tf.zeros([1, hidden_nodes])))

    #LSTM
    test_output, test_state = lstm(test_data, saved_test_output, saved_test_state)
    # Evaluating test_prediction also writes the new output/state back into the
    # variables; without these assignments every character would be predicted from an
    # all-zero state.
    with tf.control_dependencies([saved_test_output.assign(test_output),
                                  saved_test_state.assign(test_state)]):
        test_prediction = tf.nn.softmax(tf.matmul(test_output, w) + b)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [None]:
# Time to train the model: open a session on the graph built above
with tf.Session(graph=graph) as sess:
    #standard init step
    tf.global_variables_initializer().run()
    offset = 0  # index into x / y where the next batch starts
    saver = tf.train.Saver()

    #for each training step
    for step in range(max_steps):

        # Keep the offset inside the data (it starts at 0)
        offset = offset % len(x)

        #calculate batch data and labels to feed model iteratively
        if offset <= (len(x) - batch_size):
            # A full batch fits before the end of the data
            batch_data = x[offset: offset + batch_size]
            batch_labels = y[offset: offset + batch_size]
            offset += batch_size
        else:
            # Fewer than batch_size samples remain: take them and wrap around to the
            # beginning of the data to fill up the batch
            to_add = batch_size - (len(x) - offset)
            batch_data = np.concatenate((x[offset: len(x)], x[0: to_add]))
            batch_labels = np.concatenate((y[offset: len(x)], y[0: to_add]))
            offset = to_add

        #optimize!!
        _, training_loss = sess.run([optimizer, loss], feed_dict={data: batch_data, labels: batch_labels})

        # NOTE(review): the logging interval is hard-coded to 10 steps here; the
        # log_every setting from the configuration cell is not used.
        if step % 10 == 0:
            print('training loss at step %d: %.2f (%s)' % (step, training_loss, datetime.datetime.now()))

        # Checkpointing is checked independently of logging, so it keeps working when
        # save_every is not a multiple of the logging interval.
        if step % save_every == 0:
            saver.save(sess, checkpoint_directory + '/model', global_step=step)

    # Save the fully trained model as well; otherwise the steps after the last periodic
    # checkpoint would be lost when the session closes.
    saver.save(sess, checkpoint_directory + '/model', global_step=max_steps)

training loss at step 0: 4.49 (2019-04-16 22:21:22.071485)
training loss at step 10: 4.29 (2019-04-16 22:22:07.153337)
training loss at step 20: 4.54 (2019-04-16 22:22:54.300467)
training loss at step 30: 3.25 (2019-04-16 22:23:39.741818)
training loss at step 40: 4.19 (2019-04-16 22:24:25.853301)
training loss at step 50: 3.32 (2019-04-16 22:25:10.793697)
training loss at step 60: 3.23 (2019-04-16 22:25:56.310326)
training loss at step 70: 3.12 (2019-04-16 22:26:43.853531)
training loss at step 80: 3.05 (2019-04-16 22:27:36.554555)
training loss at step 90: 3.00 (2019-04-16 22:28:27.076167)
training loss at step 100: 3.04 (2019-04-16 22:29:13.600818)
training loss at step 110: 3.48 (2019-04-16 22:30:02.959250)
training loss at step 120: 3.14 (2019-04-16 22:30:51.094072)
training loss at step 130: 2.92 (2019-04-16 22:31:39.439409)
training loss at step 140: 2.98 (2019-04-16 22:32:26.334369)
training loss at step 150: 2.98 (2019-04-16 22:33:13.602621)
training loss at step 160: 3.01 (20

training loss at step 1340: 2.87 (2019-04-17 00:08:03.371093)
training loss at step 1350: 2.81 (2019-04-17 00:08:46.316928)
training loss at step 1360: 2.82 (2019-04-17 00:09:28.866232)
training loss at step 1370: 2.80 (2019-04-17 00:10:11.386633)
training loss at step 1380: 2.80 (2019-04-17 00:10:54.084870)
training loss at step 1390: 2.76 (2019-04-17 00:11:36.594397)
training loss at step 1400: 2.88 (2019-04-17 00:12:19.166130)
training loss at step 1410: 2.84 (2019-04-17 00:13:01.787075)
training loss at step 1420: 2.79 (2019-04-17 00:13:44.204699)
training loss at step 1430: 2.82 (2019-04-17 00:14:25.699464)
training loss at step 1440: 2.84 (2019-04-17 00:15:07.267175)
training loss at step 1450: 2.85 (2019-04-17 00:15:48.695329)
training loss at step 1460: 2.82 (2019-04-17 00:16:30.181141)
training loss at step 1470: 2.80 (2019-04-17 00:17:11.514287)
training loss at step 1480: 2.83 (2019-04-17 00:17:52.907983)
training loss at step 1490: 2.81 (2019-04-17 00:18:34.570734)
training

training loss at step 2670: 2.39 (2019-04-17 01:43:16.148788)
training loss at step 2680: 2.33 (2019-04-17 01:43:59.269950)
training loss at step 2690: 2.37 (2019-04-17 01:44:41.977858)
training loss at step 2700: 2.35 (2019-04-17 01:45:24.528325)
training loss at step 2710: 2.41 (2019-04-17 01:46:07.209511)
training loss at step 2720: 2.33 (2019-04-17 01:46:49.482929)
training loss at step 2730: 2.39 (2019-04-17 01:47:31.889339)
training loss at step 2740: 2.35 (2019-04-17 01:48:14.213856)
training loss at step 2750: 2.32 (2019-04-17 01:48:57.165846)
training loss at step 2760: 2.32 (2019-04-17 01:49:39.767297)
training loss at step 2770: 2.27 (2019-04-17 01:50:22.228122)
training loss at step 2780: 2.29 (2019-04-17 01:51:04.967657)
training loss at step 2790: 2.23 (2019-04-17 01:51:48.351557)
training loss at step 2800: 2.30 (2019-04-17 01:52:31.698354)
training loss at step 2810: 2.35 (2019-04-17 01:53:15.329004)
training loss at step 2820: 2.30 (2019-04-17 01:53:58.193092)
training

training loss at step 4000: 2.04 (2019-04-17 03:18:17.868460)
training loss at step 4010: 2.52 (2019-04-17 03:19:00.594529)
training loss at step 4020: 2.08 (2019-04-17 03:19:43.221810)
training loss at step 4030: 2.05 (2019-04-17 03:20:26.211202)
training loss at step 4040: 2.04 (2019-04-17 03:21:09.133040)
training loss at step 4050: 2.06 (2019-04-17 03:21:52.077575)
training loss at step 4060: 2.01 (2019-04-17 03:22:34.810900)
training loss at step 4070: 1.95 (2019-04-17 03:23:17.252294)
training loss at step 4080: 2.08 (2019-04-17 03:24:00.304726)
training loss at step 4090: 2.07 (2019-04-17 03:24:42.688812)
training loss at step 4100: 2.02 (2019-04-17 03:25:25.052516)
training loss at step 4110: 1.92 (2019-04-17 03:26:07.310961)
training loss at step 4120: 2.01 (2019-04-17 03:26:49.472574)
training loss at step 4130: 1.98 (2019-04-17 03:27:31.974698)
training loss at step 4140: 2.07 (2019-04-17 03:28:14.442249)
training loss at step 4150: 2.02 (2019-04-17 03:28:57.029377)
training

training loss at step 5330: 1.86 (2019-04-17 04:53:06.404938)
training loss at step 5340: 1.95 (2019-04-17 04:53:49.044387)
training loss at step 5350: 1.86 (2019-04-17 04:54:31.695633)
training loss at step 5360: 1.97 (2019-04-17 04:55:14.322590)
training loss at step 5370: 1.94 (2019-04-17 04:55:56.835950)
training loss at step 5380: 1.81 (2019-04-17 04:56:39.483012)
training loss at step 5390: 1.89 (2019-04-17 04:57:22.052859)
training loss at step 5400: 1.93 (2019-04-17 04:58:04.484885)
training loss at step 5410: 1.88 (2019-04-17 04:58:47.464962)
training loss at step 5420: 1.93 (2019-04-17 04:59:30.005730)
training loss at step 5430: 1.89 (2019-04-17 05:00:12.646561)
training loss at step 5440: 1.85 (2019-04-17 05:00:55.165261)
training loss at step 5450: 1.91 (2019-04-17 05:01:37.658106)
training loss at step 5460: 1.90 (2019-04-17 05:02:20.498536)
training loss at step 5470: 1.90 (2019-04-17 05:03:03.343407)
training loss at step 5480: 1.86 (2019-04-17 05:03:46.119934)
training

training loss at step 6660: 1.84 (2019-04-17 06:27:57.975708)
training loss at step 6670: 1.69 (2019-04-17 06:28:40.931265)
training loss at step 6680: 1.82 (2019-04-17 06:29:23.286873)
training loss at step 6690: 1.79 (2019-04-17 06:30:05.927607)
training loss at step 6700: 1.79 (2019-04-17 06:30:48.719397)
training loss at step 6710: 1.75 (2019-04-17 06:31:31.511311)
training loss at step 6720: 1.83 (2019-04-17 06:32:14.955702)
training loss at step 6730: 1.81 (2019-04-17 06:32:57.865542)
training loss at step 6740: 1.83 (2019-04-17 06:33:40.511309)
training loss at step 6750: 1.73 (2019-04-17 06:34:23.192262)
training loss at step 6760: 1.78 (2019-04-17 06:35:06.007751)
training loss at step 6770: 1.82 (2019-04-17 06:35:48.597230)
training loss at step 6780: 1.83 (2019-04-17 06:36:31.350163)
training loss at step 6790: 1.76 (2019-04-17 06:37:14.249483)
training loss at step 6800: 1.79 (2019-04-17 06:37:56.893928)
training loss at step 6810: 1.77 (2019-04-17 06:38:40.323027)
training

In [None]:

test_start = 'You know nothing Jon Snow '

with tf.Session(graph=graph) as sess:
    # Initialise the variables, then overwrite them with the most recent checkpoint
    tf.global_variables_initializer().run()
    saver = tf.train.Saver()
    model = tf.train.latest_checkpoint(checkpoint_directory)
    saver.restore(sess, model)

    # Start generation from a clean recurrent state
    reset_test_state.run()

    # Warm-up: feed every seed character except the last one; the predictions are discarded
    for seed_char in test_start[:-1]:
        test_X = np.zeros((1, char_size))
        test_X[0, char2id[seed_char]] = 1.
        _ = sess.run(test_prediction, feed_dict={test_data: test_X})

    # The last seed character is the first input of the generation loop
    test_X = np.zeros((1, char_size))
    test_X[0, char2id[test_start[-1]]] = 1.

    # Generate 500 characters, feeding each sampled character back in as the next input
    generated_chars = []
    for _step in range(500):
        # probability of each character being next
        prediction = sess.run(test_prediction, feed_dict={test_data: test_X})[0]
        # draw one character id from that distribution (returned one-hot encoded)
        next_char_one_hot = sample(prediction)
        # the position of the 1 is the sampled id; look up its character
        next_char = id2char[np.argmax(next_char_one_hot)]
        generated_chars.append(next_char)
        # the sampled character becomes the next input
        test_X = next_char_one_hot.reshape((1, char_size))

    # Seed text followed by everything the model produced
    test_generated = test_start + ''.join(generated_chars)
    print(test_generated)