# Objective : Train a NN on "Alice in Wonderland" to generate text 

### Character level RNN for text generation

### 1. Package Mangement

In [194]:
from keras.layers import Dense, Activation, SimpleRNN, Flatten

In [21]:
from keras.models import Sequential

In [103]:
from keras.utils.vis_utils import plot_model ## Recently renamed from plot to plot_model.

In [104]:
import numpy as np

### 2. Data Processing
Load the data and get rid of line breaks etc

In [76]:
fin=open("./data/alice.txt",'rb')

# Preprocessing

lines=[]
for line in fin:
    line=line.strip().lower()
    line= line.decode('ascii','ignore')
    if len(line)==0:
        continue
    lines.append(line)
   
fin.close()
text=" ".join(lines) ## Returns a character sequence


Lets check a sample from the data

In [77]:
lines

['project gutenbergs alices adventures in wonderland, by lewis carroll',
 'this ebook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  you may copy it, give it away or',
 're-use it under the terms of the project gutenberg license included',
 'with this ebook or online at www.gutenberg.org',
 'title: alices adventures in wonderland',
 'author: lewis carroll',
 'posting date: june 25, 2008 [ebook #11]',
 'release date: march, 1994',
 'last updated: october 6, 2016',
 'language: english',
 'character set encoding: utf-8',
 '*** start of this project gutenberg ebook alices adventures in wonderland ***',
 'alices adventures in wonderland',
 'lewis carroll',
 'the millennium fulcrum edition 3.0',
 'chapter i. down the rabbit-hole',
 'alice was beginning to get very tired of sitting by her sister on the',
 'bank, and of having nothing to do: once or twice she had peeped into the',
 'book her sister was reading, but it had no pictures or conversati

In [79]:
text[:2000] ## Char sequence

'project gutenbergs alices adventures in wonderland, by lewis carroll this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever.  you may copy it, give it away or re-use it under the terms of the project gutenberg license included with this ebook or online at www.gutenberg.org title: alices adventures in wonderland author: lewis carroll posting date: june 25, 2008 [ebook #11] release date: march, 1994 last updated: october 6, 2016 language: english character set encoding: utf-8 *** start of this project gutenberg ebook alices adventures in wonderland *** alices adventures in wonderland lewis carroll the millennium fulcrum edition 3.0 chapter i. down the rabbit-hole alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, and what is the use of a book, thought alice without pictures or c

Note that text is a char sequence while lines is an array

In [80]:
type(text)

str

In [81]:
type(lines)

list

### 3. Build Vocabulary

For Char level RNNs we need to build a list of all characters present in the text
Next up we create look up tables with numbers assigned to each character. 

In [210]:
# Create list of chars in the text

chars=set([c for c in text])
## Get the number of unique chars in vocab
num_chars=len(chars)
num_chars

55

In [90]:
print(chars)

{'_', '5', 'f', '9', ')', '-', 'h', '4', ';', '/', 'c', '.', ' ', 'm', 'l', '0', '7', '1', '8', '!', 'g', 'e', 'j', 'q', 'z', 'b', '#', '%', 'w', '3', '(', 'k', 't', 's', ']', ',', 'd', 'u', 'i', ':', '[', '6', 'r', 'v', '*', '$', 'x', '2', 'y', '@', 'a', '?', 'p', 'n', 'o'}


In [93]:
# Char to Index vocab
char2index={c:i for i,c in enumerate(chars)}
print(char2index)

{'_': 0, '5': 1, 'f': 2, '9': 3, ')': 4, '-': 5, 'h': 6, '4': 7, ';': 8, '/': 9, 'c': 10, '.': 11, ' ': 12, 'm': 13, 'l': 14, '0': 15, '7': 16, '1': 17, '8': 18, '!': 19, 'g': 20, 'e': 21, 'j': 22, 'q': 23, 'z': 24, 'b': 25, '#': 26, '%': 27, 'w': 28, '3': 29, '(': 30, 'k': 31, 't': 32, 's': 33, ']': 34, ',': 35, 'd': 36, 'u': 37, 'i': 38, ':': 39, '[': 40, '6': 41, 'r': 42, 'v': 43, '*': 44, '$': 45, 'x': 46, '2': 47, 'y': 48, '@': 49, 'a': 50, '?': 51, 'p': 52, 'n': 53, 'o': 54}


In [94]:
#index to char vocab

index2char={i:c for i,c in enumerate(chars)}
print(index2char)

{0: '_', 1: '5', 2: 'f', 3: '9', 4: ')', 5: '-', 6: 'h', 7: '4', 8: ';', 9: '/', 10: 'c', 11: '.', 12: ' ', 13: 'm', 14: 'l', 15: '0', 16: '7', 17: '1', 18: '8', 19: '!', 20: 'g', 21: 'e', 22: 'j', 23: 'q', 24: 'z', 25: 'b', 26: '#', 27: '%', 28: 'w', 29: '3', 30: '(', 31: 'k', 32: 't', 33: 's', 34: ']', 35: ',', 36: 'd', 37: 'u', 38: 'i', 39: ':', 40: '[', 41: '6', 42: 'r', 43: 'v', 44: '*', 45: '$', 46: 'x', 47: '2', 48: 'y', 49: '@', 50: 'a', 51: '?', 52: 'p', 53: 'n', 54: 'o'}


### 4. Create input and target text


In [100]:
# Define params

print("Creating Input and label text")
SEQLEN = 10
STEP = 1

input_chars=[]
label_chars=[]

for i in range(0,len(text)-SEQLEN, STEP):
    input_chars.append(text[i:i+SEQLEN])
    label_chars.append(text[i+SEQLEN])
    


Creating Input and label text


In [101]:
input_chars[:20]

['project gu',
 'roject gut',
 'oject gute',
 'ject guten',
 'ect gutenb',
 'ct gutenbe',
 't gutenber',
 ' gutenberg',
 'gutenbergs',
 'utenbergs ',
 'tenbergs a',
 'enbergs al',
 'nbergs ali',
 'bergs alic',
 'ergs alice',
 'rgs alices',
 'gs alices ',
 's alices a',
 ' alices ad',
 'alices adv']

In [102]:
label_chars[:20]

['t',
 'e',
 'n',
 'b',
 'e',
 'r',
 'g',
 's',
 ' ',
 'a',
 'l',
 'i',
 'c',
 'e',
 's',
 ' ',
 'a',
 'd',
 'v',
 'e']

In [113]:
## Both these char vectors are of equal size

len(input_chars)==len(label_chars)

True

### 5. Vectorize the input and label chars 

In [211]:
print("###Vectorizing Input chars####")

print(" \nInput Sample ")
# Input size
# Total no of rows in input , size of each row = SEQLEN, Each char represented as a one hot encoding of the vocab size = num_chars
x= np.zeros((len(input_chars),SEQLEN,num_chars),dtype=np.bool)
print(x[:2]) # Sample

###Vectorizing Input chars####
 
Input Sample 
[[[False False False ..., False False False]
  [False False False ..., False False False]
  [False False False ..., False False False]
  ..., 
  [False False False ..., False False False]
  [False False False ..., False False False]
  [False False False ..., False False False]]

 [[False False False ..., False False False]
  [False False False ..., False False False]
  [False False False ..., False False False]
  ..., 
  [False False False ..., False False False]
  [False False False ..., False False False]
  [False False False ..., False False False]]]


In [212]:
print ("\n Output Sample")

# label\target\output size = No of rows in output(len(label_chars))  , encoding size = num of chars

y = np.zeros((len(label_chars), num_chars),dtype=np.bool)
y.shape


 Output Sample


(158773, 55)

In [205]:
print("Vectorizing the input now...")

for i, input_char in enumerate(input_chars):
    for j, ch in enumerate(input_char):
        x[i, j, char2index[ch]] = 1
    y[i, char2index[label_chars[i]]] = 1
    
print("Vectorization Done")

Vectorizing the input now...
Vectorization Done


In [147]:
# View a sample row
print(x[1])

[[False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False False  True False False False False False
  False False False False False False False]
 [False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False False  True]
 [False False False False False False False False False False False False
  False False False False False False False False False False  True False
  False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False Fa

In [140]:
x.shape

(158773, 10, 55)

### 6. Build the RNN

In [191]:
# Model structure and  parameters

# RNN Layer Output size
HIDDEN_SIZE=128

BATCH_SIZE= 128
NUM_ITERATIONS=25
NUM_EPOCHS_PER_ITERATION=1
NUM_PREDS_PER_EPOCH=100

In [213]:
# Actual model construction starts here

model=Sequential()

# Add layers

#RNN -> No of Cells, Will the op be a seq or a single char, input shape , unroll -> performance benefit on tensorflow
model.add(SimpleRNN(HIDDEN_SIZE,return_sequences=False,input_shape=(SEQLEN,num_chars),unroll=True))
          
    
    
# Fully connected layer with size info          
model.add(Dense(num_chars))
          
# Softmax actication for classification          
model.add(Activation('softmax'))
          
# Define loss func and optimizer          
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

### 7. Train the Model

In [215]:
# Train the model on every iteration and also test it.. We do this since we dont have any labeled data

for iteration in range(NUM_ITERATIONS):
    # Train
    print("*"*50)
    print("Iteration Num : %d" % (iteration))
    
    model.fit(x,y,batch_size=BATCH_SIZE,epochs=NUM_EPOCHS_PER_ITERATION)
    
    ### Test
    ## CHoose a random row from input_chars; use it to predict next 100 chars
    
    test_idx=np.random.randint(len(input_chars))
    seed_chars=input_chars[test_idx]
    print("Generating from seed --> %s" % (seed_chars))
    print(seed_chars,end="")
    
    
    for i in range(NUM_PREDS_PER_EPOCH):
        xtest=np.zeros((1,SEQLEN,num_chars))
        for i,c in enumerate(seed_chars):
            xtest[0,i,char2index[c]]=1
        pred= model.predict(xtest,verbose=0)[0]
        ypred=index2char[np.argmax(pred)]
        print(ypred,end='')
        
        # Move forward
        seed_chars=seed_chars[1:]+ypred

**************************************************
Iteration Num : 0
Epoch 1/1
Generating from seed --> ut a great
ut a great-k, .vx24d@2e:)cbx)eqx20*-7*n6gq,f.bxv48:2,%)x(88@af65-d7*n:.sm2yv4z:bb,1$[$;x)38q7fnzc(/:%,1)hu9-95**************************************************
Iteration Num : 1
Epoch 1/1
Generating from seed --> you may do
you may do5%$:88x2-f*v9:ms[std0969z[bqiesfn;go7*6uy2 c.t1%)*1xxo1-6@e,)2ko2j/66s42hn-r3k.ss46v;[:5be48h@2-%)1]**************************************************
Iteration Num : 2
Epoch 1/1
Generating from seed --> le was a p
le was a p5:5o9#xd-2?%fnk%q1v6u0d7u6n(]7xn#/x%th;sz.z9,/i-5f5kovw8:c,?$q14)h0q7xo9?,/p%td-eresf2q,vo;ysmwr?,c***************************************************
Iteration Num : 3
Epoch 1/1
Generating from seed --> oject gute
oject gute/ %#p5k*v7n:%-%n:5smfbv48.%0,8p@;sawth)*-%n:5s/[hv-r %.2j%)95q-#xet2jkbskgiv7xzrb/ql;;;:oy#6g92#0u7s**************************************************
Iteration Num : 4
Epoch 1/1
Generating from seed 