### I. Imports

In [262]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, LSTM, GRU, TimeDistributed, Dense, Activation
from collections import Counter
from functools import reduce
import re

### II. Data Preparation

In [231]:
# Read limericks.csv downloaded from https://github.com/sballas8/PoetRNN/blob/master/data/limericks.csv
df_limericks = pd.read_csv('limericks.csv', names=['limericks'], nrows=10000)

In [232]:
df_limericks['limericks'] = df_limericks.limericks.apply((lambda x: x.replace('\r','')))
df_limericks.head()

Unnamed: 0,limericks
0,cap'n jack was washed over the side.\nhis crew...
1,"ablactation, to wean off the breast,\nshould w..."
2,"as a soup, bisque is best when served hot.\nma..."
3,simply add to the grasp of a rhesus\nthe antit...
4,"abed's where you sleep in the night,\nunless y..."


In [233]:
letter_count = dict(reduce((lambda x, y: x + y),list(map(Counter, df_limericks['limericks'].tolist()))))

In [234]:
infrequent_chars = [k for k, v in letter_count.items() if v < 1000]
print(''.join(infrequent_chars))

25134=+0987*/6	&[]_#@%$


In [235]:
df_limericks['limericks_clean'] = df_limericks['limericks'].str.replace(r'['+ re.escape(''.join(infrequent_chars))+ r']', '?', regex=True)
df_limericks.head()

Unnamed: 0,limericks,limericks_clean
0,cap'n jack was washed over the side.\nhis crew...,cap'n jack was washed over the side.\nhis crew...
1,"ablactation, to wean off the breast,\nshould w...","ablactation, to wean off the breast,\nshould w..."
2,"as a soup, bisque is best when served hot.\nma...","as a soup, bisque is best when served hot.\nma..."
3,simply add to the grasp of a rhesus\nthe antit...,simply add to the grasp of a rhesus\nthe antit...
4,"abed's where you sleep in the night,\nunless y...","abed's where you sleep in the night,\nunless y..."


In [236]:
valid_characters = ['0', '^', '$'] + sorted([k for k, v in letter_count.items() if k not in infrequent_chars])
print(valid_characters)

['0', '^', '$', '\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [245]:
padded_lims = np.array([[valid_characters.index(car) for car in '^'+x[:202]+'0'*(202-len(x[:202]))+'$' if car in valid_characters] for x in df_limericks['limericks_clean'].tolist()])
X = padded_lims[:,:203]
X.shape

(10000, 203)

In [246]:
Y = padded_lims[:,1:204]
Y = to_categorical(Y, num_classes=len(valid_characters))
Y.shape

(10000, 203, 42)

### III. Build a Model

In [257]:
embedding_dim = 50
hidden_size = 100
voc_size = len(valid_characters)

model = Sequential()
model.add(Embedding(voc_size, embedding_dim)) # E is embedding dimension, V is vocabulary size
model.add(LSTM(hidden_size, return_sequences=True))  
model.add(TimeDistributed(Dense(voc_size)))
model.add(Activation('softmax'))   
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy']) 
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, None, 50)          2100      
_________________________________________________________________
lstm_9 (LSTM)                (None, None, 100)         60400     
_________________________________________________________________
time_distributed_9 (TimeDist (None, None, 42)          4242      
_________________________________________________________________
activation_9 (Activation)    (None, None, 42)          0         
Total params: 66,742
Trainable params: 66,742
Non-trainable params: 0
_________________________________________________________________


In [260]:
model.fit(X,Y, batch_size=256, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x14398ac88>

### IV. Text Generation

In [261]:
for j in range(2):
    sentence = []
    letter = [valid_characters.index('^')] #choose a random letter
    for i in range(200):
        sentence.append(valid_characters[letter[-1]])
        if sentence[-1]=='$' or sentence[-1]=='0':
            break
        p = model.predict(np.array(letter)[None,:])
        letter.append(np.random.choice(len(valid_characters),1,p=p[0][-1])[0])

    print(''.join(sentence))
    print('='*100)

^lyieg the ellation tiflout,
what in chammer we'nd lengine.
lit tames, ind't soise.
youh ax there,
and dom taking, "quim, "still,
actaredan wed whes letr, ti'soe.
0
^"wemall, busess are corlocty
that'd lake cae borey, whing a farat!
0


### BONUS - Use a GRU layer

In [263]:
embedding_dim = 50
hidden_size = 100
voc_size = len(valid_characters)

model = Sequential()
model.add(Embedding(voc_size, embedding_dim)) # E is embedding dimension, V is vocabulary size
model.add(GRU(hidden_size, input_shape = X.shape, return_sequences=True))  
model.add(TimeDistributed(Dense(voc_size)))
model.add(Activation('softmax'))   
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy']) 
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, None, 50)          2100      
_________________________________________________________________
gru_1 (GRU)                  (None, None, 100)         45300     
_________________________________________________________________
time_distributed_10 (TimeDis (None, None, 42)          4242      
_________________________________________________________________
activation_10 (Activation)   (None, None, 42)          0         
Total params: 51,642
Trainable params: 51,642
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(X,Y, batch_size=256, epochs=20)

In [None]:
for j in range(2):
    sentence = []
    letter = [valid_characters.index('^')] #choose a random letter
    for i in range(200):
        sentence.append(valid_characters[letter[-1]])
        if sentence[-1]=='$' or sentence[-1]=='0':
            break
        p = model.predict(np.array(letter)[None,:])
        letter.append(np.random.choice(len(valid_characters),1,p=p[0][-1])[0])

    print(''.join(sentence))
    print('='*100)