# Char-RNN

In [1]:
import random
import numpy as np

from itertools import chain
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence, one_hot
from keras.utils import to_categorical, print_summary, plot_model, Sequence
from keras.layers import LSTM, CuDNNLSTM, Dense, TimeDistributed, Activation

Using TensorFlow backend.


## Args

In [2]:
GPU_ACTIVE = False

## Load

In [3]:
%store -r descs
len(descs)

171161

In [4]:
random.sample(descs, 10)

['See if there is an snapshot present that should be removed.',
 'Get the latest data and update state.',
 'Mark the filter as being ordered if search has occurred.',
 "Retrieve the distance parameters for the given geometry field, DCNL distance lookup value, and the distance lookup type. DCNL This is the most complex implementation of the spatial backends due to DCNL what is supported on geodetic geometry columns vs. what\\'s available on DCNL projected geometry columns.  In addition, it has to take into account DCNL the newly introduced geography column type introudced in PostGIS 1.5.",
 'Reset the marker to the first line.',
 'Called to stop continuous host logging.',
 'can verify that a location exists, using ccx block usage key',
 'Decoding iterator. DCNL Decodes the input strings from the iterator using an IncrementalDecoder. DCNL errors and kwargs are passed through to the IncrementalDecoder DCNL constructor.',
 'version known to trigger an INFO response message.',
 'Evaluates t

In [5]:
MAX_SEQ_LEN = max(len(desc) for desc in descs)
MAX_SEQ_LEN

500

## Preprocessing

In [6]:
def chars_split(descs):
    chars = list(set(chain.from_iterable(desc for desc in descs)))
    char_ix = {char:ix for ix, char in enumerate(chars)}
    ix_char = {ix:char for ix, char in enumerate(chars)}
    descs = [[char_ix[char] for char in desc] for desc in descs]
    return descs, char_ix, ix_char

In [7]:
%time descs, dir_map, rev_map = chars_split(descs)
list(dir_map.items())[:5], list(rev_map.items())[:5]

CPU times: user 1.56 s, sys: 60 ms, total: 1.62 s
Wall time: 1.63 s


([('|', 0), ('6', 1), ('<', 2), ('l', 3), ('-', 4)],
 [(0, '|'), (1, '6'), (2, '<'), (3, 'l'), (4, '-')])

In [8]:
VOCAB_SIZE = len(dir_map)
VOCAB_SIZE

95

## TT prepare

In [9]:
BATCH_SIZE = 128

In [46]:
class TTSequence(Sequence):
    def __init__(self):
        self.on_epoch_end()
    
    def __len__(self):
        return int(np.floor(len(descs) / BATCH_SIZE))
    
    def __getitem__(self, idx):
        X = np.zeros((BATCH_SIZE, MAX_SEQ_LEN, VOCAB_SIZE))
        
        batch_ids = self._ids[idx * BATCH_SIZE: (idx + 1) * BATCH_SIZE]
        for bi, di in enumerate(batch_ids):
            for pi, wi in enumerate(descs[di]):
                X[bi, pi, wi] = 1
        
        return X
    
    def on_epoch_end(self):
        """Generate new shuffle in between epochs."""
        self._ids = np.random.permutation(len(descs))

In [55]:
tts = TTSequence()
len(tts), tts[0].shape

(1338, (128, 500, 95))

## Model

In [11]:
HIDDEN_DIM = 300
N_LAYERS = 2
LSTM_CLASS = LSTM if not GPU_ACTIVE else CuDNNLSTM

In [12]:
model = Sequential()
model.add(LSTM_CLASS(HIDDEN_DIM, input_shape=(None, VOCAB_SIZE), return_sequences=True))
for i in range(N_LAYERS - 1):
    model.add(LSTM_CLASS(HIDDEN_DIM, return_sequences=True))
model.add(TimeDistributed(Dense(VOCAB_SIZE, activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
print_summary(model)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, None, 300)         475200    
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 300)         721200    
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 95)          28595     
Total params: 1,224,995
Trainable params: 1,224,995
Non-trainable params: 0
_________________________________________________________________


In [24]:
# plot_model(model)

## Learn

In [12]:
...

Ellipsis