Process:<br>
1-Load the data<br>
2-Preprocess the data(tokenizing punctualtion, lower case except for names, split)<br>
3-Create dictionary from the words<br>
4-Build and train the model<br>
5-Generate the new text<br>

In [1]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import nltk
import string
import warnings
import re
from tensorflow.contrib import legacy_seq2seq


In [2]:
#Load data
f = open("HarryPotterCh1_SorcererStone.txt","r") 
textbook = f.read()
#print(textbook)
f.close()

Here, we create our own function for preprocessing, tokenization as well as creating index of the words.

In [3]:
def preprocess_text(string):
    #Tokenize the punctuations in order to consider them as words
    string = string.replace("\n", " nextline ")
    string = string.replace(".", " periodmark")
    string = string.replace(":", " colonmark")
    string = string.replace(";", " semicolonmark")
    string = string.replace(",", " commamark")
    string = string.replace("?", " questionmark")
    string = string.replace("!", " exclamationmark")
    string = string.replace("...", " 3dots")
    string = string.replace("--", " 2dashes")
    string = string.replace("\"", " quotemark")
    string = string.replace("(", " leftparan")
    string = string.replace(")", " rightparan")
    
    #Names to remain capitalized
    Names = ['Harry', 'Potter', 'James','Jim', 'Abbott','George','Hannah', 'Susan', 'Bones', 'Minerva','McGonagall','Professor',
             'Sprout','Malfoy','Draco','Voldemort','Rubeus','Percy','Snape','Weasley','Hagrid','Fred','Scabbers','Hedwig',
            'Sirius','Hermione','Granger','Ronald','Peeves','Vernon','Dursley','Mrs.','Mr.','Norris','Argus','Filch','Nick','Charlie','Neville',
             'Quirrell','Dumbledore','Filch','Flitwick','McGonagall','McGuffin','Ollivander','Baron','Pomfrey','Gryffindor','Slytherin',
             'Ravenclaw','Hufflepuff']

    toLower = lambda x: " ".join( a if a in Names else a.lower()
            for a in x.split() )

    string= toLower(string)
    return string

In [4]:
text=preprocess_text(textbook)
#print(text)
text_words=text.split()
text_len=len(text_words)

In [5]:
#Create dictionary from list of words in text
def dictionary(words):
    #create list of words without their dupications 
    words=set(words)
    #map word to index
    indx = {key: i for i, key in enumerate(words)}
    return indx


In [6]:
#Convert from index to words
def get_by_key_dict(indx_word,words_dict):
    for word, indx in words_dict.iteritems():    
        if indx == indx_word:
            return word

In [7]:
words_index=dictionary(text_words)
words_index

{"periodmark'\xe2\x80\x9c": 0,
 'wrought-iron': 1,
 'both': 2554,
 'foul': 15,
 'four': 3,
 'woods': 4,
 'spiders': 5,
 'ornate': 24,
 'wizardry': 7,
 'Ronald': 8,
 "fluffy's": 9,
 'lord': 10,
 'flicking': 11,
 'three-thirty': 12,
 'sinking': 13,
 'figg': 14,
 'yellow': 2,
 'bringing': 104,
 'disturb': 17,
 'basics': 18,
 'wooden': 19,
 'wednesday': 20,
 '(except': 21,
 'specially': 22,
 'tired': 23,
 'hanging': 6,
 'bacon': 25,
 'second': 26,
 'crisply': 27,
 'sailed': 28,
 'scraped': 29,
 'iron-gray': 30,
 'thunder': 31,
 'fingers': 32,
 '(how': 33,
 "'smatter": 34,
 'pawed': 35,
 'galleons': 36,
 'hero': 37,
 '-then': 38,
 "norris's": 39,
 'here': 40,
 'reported': 41,
 'ashen-faced': 42,
 'shriek': 43,
 'substance': 265,
 'climbed': 45,
 'reports': 46,
 "i'd": 47,
 'transfixed': 48,
 "i'm": 49,
 'golden': 50,
 'explained': 51,
 'brought': 52,
 'stern': 53,
 'cheating': 54,
 'spoke': 55,
 'music': 56,
 'therefore': 57,
 "wine's": 58,
 'until': 59,
 'relax': 60,
 'hurt': 61,
 'glass':

Create sequences of 10 length (given 10 words as inputs, predict 1 word for output added to the previos words)

In [8]:
seq_len=10

In [9]:
def  create_model_inputs(batch_size):
    '''Define model inputs'''
    
    #Model's placeholders for inputs
    inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    return inputs,targets,keep_prob

In [10]:
def  build_RNN(vocabulary_size,embedding_size,inputs,seq_len,num_hidden,lstm_layer_numbers,keep_prob,batch_size):
    '''Build RNN'''
    #Embedding Layer
    '''Intialize embeddings for the words. Embedding layer connects the words to the LSTM layers (words are embedded to the embedding_size vectors instead of vocabulary size vectors or one hot vectors). Here, provided by tensorflow, we used random_uniform distribution to create embeddings'''
    #tf.AUTO_REUSE for reuisng the same scope for generating as for traning
    with tf.variable_scope('rnn1', reuse=tf.AUTO_REUSE):
        embedding = tf.Variable(tf.random_uniform((vocabulary_size, embedding_size), -1, 1))
        embed = tf.nn.embedding_lookup(embedding, inputs)
        #Define LSTM layers
        lstms=[]
        for i in range(lstm_layer_numbers):
            lstms.append(tf.contrib.rnn.BasicLSTMCell(num_hidden))
        # Add regularization dropout to the LSTM cells
        drops = [tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob) for lstm in lstms]
        # Stack up multiple LSTM layers
        stacked_lstm = tf.contrib.rnn.MultiRNNCell(drops)
        # Getting the initial state
        initial_state = stacked_lstm.zero_state(batch_size, tf.float32)

        #outputs, final_state = tf.nn.dynamic_rnn(stacked_lstm, embed, initial_state=initial_state)
        #need to unstack the sequence of input into a list of tensors
        seq_input = [tf.squeeze(i,[1]) for i in tf.split(embed,seq_len,1)] 

        outputs, final_state = legacy_seq2seq.rnn_decoder(seq_input, initial_state, stacked_lstm, loop_function=None)

    return initial_state, outputs, final_state

In [11]:
def get_batches(text_words,text_len,seq_len, batch_size,number_of_words_in_one_batch,n_batches):
    '''Using generator to return batches'''
    
    #This makes the input data to be compatible with seq_len
    text_all_batches = text_words[:n_batches*number_of_words_in_one_batch]
    index_text_all_batches=[]
    for i in text_all_batches:
        if i in words_index:
            index_text_all_batches.append(words_index[i])
        
    #index_text_all_batches={v for k,v in words_index.items() if k in text_all_batches}
    #get word index for words for batches
    input_seq=list(index_text_all_batches)
    output_seq=input_seq
    output_seq.append(output_seq.pop(output_seq[0]))
    for ii in range(0, len(text_all_batches), number_of_words_in_one_batch):
        yield input_seq[ii:ii+number_of_words_in_one_batch], output_seq[ii:ii+number_of_words_in_one_batch]

In [12]:
#Define Parameters
# number of units
n_input= len(words_index)
num_hidden = 256
lstm_layer_numbers=3
embed_size=256
batch_size= 256
learning_rate=0.001

Create a graph for training

In [13]:
graph0 = tf.Graph()
#There exits a global default graph created by tenserflow, for new graphs we need to set them as a default graph
with graph0.as_default():
    inputs,targets,keep_prob=create_model_inputs(batch_size)
    initial_state, outputs, final_state = build_RNN(n_input,embed_size,inputs,seq_len,num_hidden,lstm_layer_numbers,keep_prob,batch_size)
    # Loss and optimizer
    logits = tf.contrib.layers.fully_connected(outputs, n_input, activation_fn=None)
    
    probs = tf.nn.softmax(logits, name='probs')
    print(probs.shape)
    cost =  tf.contrib.seq2seq.sequence_loss(
            logits,
            targets,
            tf.ones([batch_size, (seq_len)])    
        )                                   

    optimizer = tf.train.AdamOptimizer(learning_rate)
    
    # Using gradian clipping for exploding gradients
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients]
    train_op = optimizer.apply_gradients(capped_gradients) 
    
    init_op = tf.global_variables_initializer()
    saver = tf.train.Saver()

#Execute the graph for training
with tf.Session(graph=graph0) as sess:
    sess = tf.Session(graph=graph0)
    sess.run(init_op)
    number_of_words_in_one_batch= seq_len*batch_size
    n_batches = text_len//number_of_words_in_one_batch
    epochs = 5501
    for epoch in range(epochs):
        state = sess.run(initial_state)
        avg_cost_train = 0 
        avg_acc_train= 0
        for ii, (x, y) in enumerate(get_batches(text_words,text_len,seq_len,batch_size,number_of_words_in_one_batch,n_batches), 1):
            #need to reshape y to feed it to targets
            y = np.array(y).reshape(batch_size,(seq_len))
            x = np.array(x).reshape(batch_size,(seq_len))

            state, loss, _= sess.run([final_state, cost,train_op], feed_dict={inputs: x,
                                                            targets: y,keep_prob: 0.8,initial_state: state})

            avg_cost_train += loss / n_batches
        if(epoch%100==0):
            print(epoch)
            print("cost_train=", avg_cost_train) 
    #Save the model into a file 
    checkpoint="./model/savedmodel.ckpt"
    save_path = saver.save(sess, checkpoint)


(10, 256, 6270)
0
('cost_train=', 6.938602598089922)
100
('cost_train=', 4.6028806473079475)
200
('cost_train=', 3.816319227218627)
300
('cost_train=', 3.613472731489884)
400
('cost_train=', 3.4648596424805489)
500
('cost_train=', 3.3783038101698226)
600
('cost_train=', 3.2768038762243168)
700
('cost_train=', 3.2028994121049585)
800
('cost_train=', 3.1575366070396025)
900
('cost_train=', 3.1641998541982543)
1000
('cost_train=', 3.1053596609517147)
1100
('cost_train=', 3.0415823145916594)
1200
('cost_train=', 2.9972305548818485)
1300
('cost_train=', 3.0092376094115414)
1400
('cost_train=', 2.9771042936726619)
1500
('cost_train=', 2.9253121300747522)
1600
('cost_train=', 2.939032611094023)
1700
('cost_train=', 2.922185113555507)
1800
('cost_train=', 2.9919576770380925)
1900
('cost_train=', 2.9595746617568164)
2000
('cost_train=', 2.8673423842379928)
2100
('cost_train=', 2.8765910675651156)
2200
('cost_train=', 2.966701865196228)
2300
('cost_train=', 2.8635740091926176)
2400
('cost_train=

In [14]:
batch_size=1
#use the same sequence length as for trained model to generate the new words
seq_len=10

Craete a graph for generating text<br>
Based on the train model, each time it uses 10 previous words to generate the next word (therfore, first define 10 prime words to begin generating 11th word and then consider 2th to 11th words for generating the 12th word and so on)

In [15]:
tf.reset_default_graph()
graph1 = tf.Graph()

with graph1.as_default():
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    inputs = tf.placeholder(tf.int32, [batch_size, None], name='inputs')
    initial_state, outputs, final_state = build_RNN(n_input,embed_size,inputs,seq_len,num_hidden,lstm_layer_numbers,keep_prob,batch_size)
    logits = tf.contrib.layers.fully_connected(outputs, n_input, activation_fn=None)
    probs = tf.nn.softmax(logits, name='probs')
    init_op1 = tf.global_variables_initializer()
    saver = tf.train.Saver()
    
#Execute the graph1 for generating text
with tf.Session(graph=graph1) as sess2:
    #This part to compare varibles in checkpoints with what we have
    var_name_list = [v.name for v in tf.trainable_variables()]
    #print(var_name_list)
    from tensorflow.python import pywrap_tensorflow

    reader = pywrap_tensorflow.NewCheckpointReader(checkpoint)
    var_to_shape_map = reader.get_variable_to_shape_map()
    #print(var_to_shape_map)
    
    #Execute the graph to generate the text
    sess2.run(init_op1)
    
    #Number of words to generate 
    num_gen=1000
    
    # Load the model
    saved = tf.train.import_meta_graph('./model/savedmodel.ckpt.meta')
    saved.restore(sess2, checkpoint)
    
    #Just for the record 
    saved_dict = {}
    for x in tf.trainable_variables():
          saved_dict[x.name] = x
    
    #10 first words to begin with
    start_word="Harry Potter went to see the street even it was"
    start_words=start_word.split(" ")
    print(start_words)
    
    #The sentence of text we have so far as a list of words' indexes
    genertaed_sentence=[words_index[w] for w in start_words]
    genertaed_sentence=[genertaed_sentence]
    
    state = sess2.run(initial_state)
    #Choose the last 10 words we have from text to predict the next word
    last_words=[(genertaed_sentence[0])[-10:]]
    
    
    for i in range(0,num_gen):
        #seq_len is 10
        seq_len=len(last_words[0])
        
        #print("last_words",last_words)
        
        next_word = np.zeros((1,seq_len))
        next_word = [w for w in last_words]
        #print(np.array(next_word).shape,inputs.shape,type(next_word),next_word)
        next_word = np.asarray(next_word) 

        #next_word = next_word.reshape(batch_size,seq_len)
            
        #print("next_word",next_word)
         #next_word = np.array(next_word).reshape(batch_size,(seq_len))
        prediction,state= sess2.run([probs,final_state], feed_dict={inputs: next_word,
                                                                    keep_prob: 0.8,initial_state: state})
        #print("Prediction's shape",prediction.shape," Prediction:",prediction)
        #print("Element we choose for prediction: ",prediction[len(last_words[0])-1,0])
        #Based on prediction's shape still not sure about part len(last_words[0])-1, which element to choose
        
        #Next predicted word by choosing the word with max probability
        next_predicted_word = np.argmax(prediction[len(last_words[0])-1,0])
        
        #append the new word to the previous sentences
        genertaed_sentence[0].append(next_predicted_word)
        #save in last_word to use it in for loop
        last_words=[(genertaed_sentence[0])[-10:]]

#Conver index to words
list_gen=[get_by_key_dict(word_int,words_index) for word_int in genertaed_sentence[0]]
sen=' '.join(list_gen)
#Convert back the tokens for punctuations
sen=sen.replace("nextline", "\n")
sen=sen.replace("periodmark", ".")
sen=sen.replace("colonmark", ":")
sen=sen.replace("commamark", ",")
sen=sen.replace("semicommamark", ";")
sen=sen.replace("questionmark", "?")
sen=sen.replace("exclamationmark", "!")
sen=sen.replace("3dots", "...")
sen=sen.replace("quotemark", "\"")
sen=sen.replace("leftparan", "(")
sen=sen.replace("rightparan", ")")
#Print the whole text
print(sen)

INFO:tensorflow:Restoring parameters from ./model/savedmodel.ckpt
['Harry', 'Potter', 'went', 'to', 'see', 'the', 'street', 'even', 'it', 'was']
Harry Potter went to see the street even it was however went the find was for the that , down boys . . . give . seven have . . grunt pile 
 door relief , mirror , . , . strong , . , open . and . white want , . no . head air . . about and . one 
 coming except . . it minute felt , . , time . want behind said . , , . have . , , few speed ones Harry , the very . reached . except started said 
 , to usual youngest . not , only money . plant , , quidditch . light , don't . . . . , lot and fell get off nose wanted and insisting sorry . . us . other neville , what curiously . drew however your going , , . like . expelled have a what muffled . . , , give 
 who , , , 
 golden disappeared the . . there remember extra 
 stolen won . 
 , 
 is 
 
 mark 
 reason in before 
 . . 
 . , , it , . ron's . , knuts move 
 , he tune go , you'll 
 
 . . us . almost 