In [2]:
# This is a pretty straightforward implementation of a character-level
# text generator model. In this implementation I am going to observe the
# memory feature of a Recurrent Neural Network with GRU cells.

In [1]:
import tensorflow as tf
import re
import collections
import numpy as np

In [2]:
# Read the whole training corpus into memory, then lower-case it so that
# upper- and lower-case letters are treated as the same character.
with open('text_data/science.txt', 'r') as f:
    raw_text = f.read()
text = raw_text.lower()

# The vocabulary is the collection of all distinct characters that occur in the text, for example letters, digits, punctuation, etc.

In [3]:
# Every distinct character that occurs in the corpus. This is a set, so it
# has no guaranteed iteration order.
vocab = {symbol for symbol in text}

# char2id is a dictionary that maps each character to a unique numeric id which can be fed to the model, and id2char is a dictionary that maps a numeric id back to its character, which is used to generate characters from the model's predictions.

In [4]:
# Fix the character order before numbering it. Iterating a set directly
# yields an order that may differ between interpreter runs (string hashing
# is randomized), which would silently change every id after a kernel
# restart. Sorting makes the mapping reproducible.
_ordered_vocab = sorted(vocab)

# char2id: character -> integer id that is fed to the model.
char2id = {c: i for i, c in enumerate(_ordered_vocab)}
# id2char: integer id -> character, used to decode the model's predictions.
id2char = {i: c for i, c in enumerate(_ordered_vocab)}

In [3]:
def id_char(char):
    """Map a sequence of characters to a column vector of vocabulary ids.

    Args:
        char: a string or list of single characters.

    Returns:
        A float numpy array of shape ``(len(char), 1)`` whose i-th row holds
        ``char2id[char[i]]``. Characters that are missing from ``char2id``
        are encoded as 0.
    """
    ids = [char2id.get(symbol, 0) for symbol in char]
    return np.array(ids, dtype=float).reshape(len(char), 1)

def vectorize_char(char):
    """One-hot encode a sequence of characters.

    Args:
        char: a string or list of single characters.

    Returns:
        A float numpy array of shape ``(len(char), len(vocab))``. Row i has a
        single 1 in column ``char2id[char[i]]``; rows for characters that are
        missing from ``char2id`` stay all zero.
    """
    one_hot = np.zeros((len(char), len(vocab)), dtype=float)
    for row, symbol in enumerate(char):
        if symbol in char2id:
            one_hot[row, char2id[symbol]] = 1
    return one_hot

def vector_characterize(vecs):
    """Decode a 2-D array of one-hot rows back into characters.

    Args:
        vecs: 2-D numpy array; row i is expected to contain a 1 in the column
            of its character id, or to be all zeros for an unknown character.

    Returns:
        A list with one entry per row of ``vecs``. Rows without a 1, or whose
        column is not a key of ``id2char``, are reported as ``'<UKN>'``.
    """
    chars = ['<UKN>'] * vecs.shape[0]
    # np.where yields matching (row, column) pairs. Writing to the row index
    # keeps every character aligned with its own row even when some rows are
    # all zero; counting the hits instead would shift later characters.
    rows, cols = np.where(vecs == 1)
    for row, col in zip(rows.tolist(), cols.tolist()):
        if col in id2char:
            chars[row] = id2char[col]
    return chars

# Modelling the architecture

In [4]:
# this model has 2 hidden layers each with 256 GRU Cells

# The Rnn is truncated at every 100th time step i.e. the network
# has the memory of last 100 time steps and predicts the next character
# on the basis of that.

# the id of each character (scaled by the vocabulary size) is fed to the
# network, which outputs one score per vocabulary character; the character
# with the highest score is taken as the prediction.

In [6]:
# Input / output sizes.
input_dim = 1              # one scalar (the character id) per time step
output_dim = len(vocab)    # one output per vocabulary character

# Network and batching hyper-parameters.
n_hidden = 256             # units in each recurrent layer
time_step = 100            # characters of context per training example
batch_size = 128           # examples per batch

In [7]:
# X: batch of input sequences, one scalar per time step.
X = tf.placeholder(tf.float32, shape=[None, time_step, input_dim])
# Y: one target vector of length output_dim per sequence.
Y = tf.placeholder(tf.float32, shape=[None, output_dim])

In [18]:
def _weight(n_rows, n_cols):
    """Create a [n_rows, n_cols] variable drawn from a narrow truncated normal."""
    initial = tf.truncated_normal(shape=[n_rows, n_cols], dtype=tf.float32,
                                  stddev=0.001)
    return tf.Variable(initial)


def _bias(size):
    """Create a bias variable of length `size`, initialised to 0.001."""
    return tf.Variable(tf.constant(0.001, shape=[size]))


# Input projection: input_dim -> n_hidden.
w_in = _weight(input_dim, n_hidden)
b_in = _bias(n_hidden)
# Hidden-to-hidden parameters (not referenced by the graph built below).
w_h = _weight(n_hidden, n_hidden)
b_h = _bias(n_hidden)
# Output projection: n_hidden -> output_dim.
w_out = _weight(n_hidden, output_dim)
b_out = _bias(output_dim)

In [9]:
# Project every time step of every sequence from input_dim to n_hidden:
# flatten to 2-D for the matmul, then restore the (batch, time, hidden) layout.
flat_inputs = tf.reshape(X, (-1, input_dim))
projected = tf.matmul(flat_inputs, w_in) + b_in
X_in = tf.reshape(projected, (-1, time_step, n_hidden))

In [10]:
# Build one GRUCell object per layer. Passing the *same* cell object twice to
# MultiRNNCell makes both layers refer to one cell, which (depending on the
# TensorFlow 1.x release) either raises a variable-scope reuse error or ties
# the two layers' weights together.
n_layers = 2
layer_cells = [tf.contrib.rnn.GRUCell(n_hidden) for _ in range(n_layers)]
cells = layer_cells[0]  # kept so existing references to `cells` still resolve
multilayer_cell = tf.contrib.rnn.MultiRNNCell(layer_cells, state_is_tuple=True)

# The zero state is created for exactly `batch_size` sequences, so every batch
# fed to the graph must contain batch_size rows.
init_state = multilayer_cell.zero_state(batch_size, dtype=tf.float32)

In [11]:
# Unroll the stacked cell over the projected inputs, starting from the
# all-zero state.
output, state = tf.nn.dynamic_rnn(
    cell=multilayer_cell,
    inputs=X_in,
    initial_state=init_state,
)

In [12]:
# Swap the first two axes so that time comes first, then split along that
# axis into a Python list with one tensor per time step.
# Note: this rebinds `output`, so the cell is not idempotent when re-run alone.
time_major = tf.transpose(output, perm=(1, 0, 2))
output = tf.unstack(time_major)

In [13]:
# Only the last time step's output is projected to vocabulary-sized logits.
last_step = output[-1]
y_ = tf.matmul(last_step, w_out) + b_out

In [14]:
# Softmax cross-entropy between the logits and the one-hot targets,
# averaged over the batch.
per_example_loss = tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=y_)
loss = tf.reduce_mean(per_example_loss)

# Adam with a learning rate of 0.001.
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
optimize = optimizer.minimize(loss)

In [19]:
# Open a session and initialise every variable defined so far.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# Training Data Preparation

In [21]:
# Slide a window of 101 characters over the text, one character at a time.
# In each window the first 100 characters are the model input and the last
# character is the label to predict.
# NOTE(review): assumes the corpus has at least 1,000,100 characters; windows
# that run past the end of `text` would come out shorter than 101 -- confirm.
data = [text[start:start + 101] for start in range(1000000)]

# Training the model for 100 epochs (5000 batches per epoch)

In [None]:
# Train for n_epochs passes; each pass walks the first
# n_batches * batch_size windows of `data` in order.
n_epochs = 100
n_batches = 5000

for epoch in range(n_epochs):
    total_loss = 0
    for step in range(n_batches):
        # Use separate names for the batch counter and the per-sample
        # variable: reusing one name breaks on Python 2, where
        # list-comprehension variables leak into the enclosing scope and
        # would overwrite the counter.
        window = data[batch_size * step:batch_size * (step + 1)]

        # Inputs: ids of all but the last character, scaled by the
        # vocabulary size.
        x_batch = np.array([id_char(sample[:-1]) for sample in window]) / len(vocab)
        # Targets: one-hot vector of the last character of each window.
        y_batch = np.array([vectorize_char(sample[-1])[0] for sample in window])

        cost, _ = sess.run([loss, optimize], feed_dict={X: x_batch,
                                                       Y: y_batch})
        total_loss += cost
        print(epoch, step, cost)

    # Mean batch loss of this epoch.
    print(epoch, total_loss / n_batches)

# in_text is the seed text fed to the model; the model extends it by generating the next 500 characters one at a time.

In [44]:
in_text = 'on the basis of scientific evidence, this discovery can '
for _step in range(500):
    # Model input: the last `time_step` characters of the text so far,
    # left-padded with spaces when the text is still shorter than that.
    in_ = in_text[-time_step:].rjust(time_step)

    # The graph's initial state is built for exactly `batch_size` sequences,
    # so feed a full batch and use only its first row.
    x_batch = np.zeros(shape=(batch_size, time_step, input_dim))
    x_batch[0] = id_char(in_) / len(vocab)
    y_pred = sess.run(y_, feed_dict={X: x_batch})

    # Take the arg-max with NumPy: calling tf.argmax here would add a new op
    # to the graph on every iteration.
    index = int(np.argmax(y_pred[0]))
    char = id2char[index]
    in_text += char
print(in_text)

on the basis of scientific evidence, this discovery can be an invested the study says says the study in the mass of the same traditions in the same protein the same traditions in the same protein the same traditions in the same protein the same traditions in the same protein the same traditions in the same protein the same traditions in the same protein the same traditions in the same protein the same traditions in the same protein the same traditions in the same protein the same traditions in the same protein the same traditions in the same protein 


In [5]:
# Note that the characters predicted by the model combine into actual
# English words. Although the sentences don't make sense yet, training
# the network for longer and tuning the hyperparameters will definitely
# improve the model.

# Another important thing that can be noted from the prediction is
# that the theme of the predicted text is roughly scientific, which
# is the same as the theme of the training data. Thus we could use this
# model to generate text of a specific theme and style. For example,
# Shakespeare-style text can be generated using this model simply by
# training it on a corpus of Shakespeare's writings.