# Char-RNN

In [1]:
import sys
import random
import numpy as np

from itertools import chain
from hyperdash import Experiment
from contextlib import redirect_stdout, redirect_stderr

In [2]:
from keras.models import Sequential
from keras.callbacks import Callback
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence, one_hot
from keras.utils import to_categorical, print_summary, plot_model, Sequence
from keras.layers import LSTM, CuDNNLSTM, Dense, TimeDistributed, Activation, GRU

Using TensorFlow backend.


## Args

In [3]:
GPU_ACTIVE = False

## Load

In [4]:
%store -r descs
len(descs)

171161

In [5]:
random.sample(descs, 10)

['Save current cursor selection and return position bounds',
 'Create a :class:`GoogleAPICallError` from a :class:`requests.Response`. DCNL Args: DCNL response (requests.Response): The HTTP response. DCNL Returns: DCNL GoogleAPICallError: An instance of the appropriate subclass of DCNL :class:`GoogleAPICallError`, with the message and errors populated DCNL from the response.',
 'Test the error response when the activity_create API is called DCNL with an authorization header for a user who is not authorized to DCNL create activities.',
 'Get a permitted action by its dict key or action name',
 'Called when a moderator accepts a comment. After the method is DCNL called the comment should be displayed to all users. DCNL :param comment_id: The id of the comment being accepted.',
 'increment the current intensity and reset counter',
 'Executes the FTP NLST command on the given path.',
 'Return an iterator over the values in the dictionary.  Values are DCNL iterated over in sorted order of t

In [6]:
MAX_SEQ_LEN = max(len(desc) for desc in descs)
MAX_SEQ_LEN

500

## Preprocessing

In [7]:
def chars_split(descs):
    chars = list(set(chain.from_iterable(desc for desc in descs)))
    char_ix = {char:ix for ix, char in enumerate(chars)}
    ix_char = {ix:char for ix, char in enumerate(chars)}
    descs = [[char_ix[char] for char in desc] for desc in descs]
    return descs, char_ix, ix_char

In [8]:
%time descs, dir_map, rev_map = chars_split(descs)
list(dir_map.items())[:5], list(rev_map.items())[:5]

CPU times: user 1.67 s, sys: 60 ms, total: 1.73 s
Wall time: 1.73 s


([('o', 0), ('*', 1), ('m', 2), ('(', 3), ('z', 4)],
 [(0, 'o'), (1, '*'), (2, 'm'), (3, '('), (4, 'z')])

In [9]:
VOCAB_SIZE = len(dir_map)
VOCAB_SIZE

95

## TT prepare

In [10]:
BATCH_SIZE = 64

In [11]:
class TTSequence(Sequence):
    def __init__(self):
        self.on_epoch_end()
    
    def __len__(self):
        return int(np.floor(len(descs) / BATCH_SIZE))
    
    def __getitem__(self, idx):
        batch_ids = self._ids[idx * BATCH_SIZE: (idx + 1) * BATCH_SIZE]
        seq_len = max(len(descs[di]) for di in batch_ids)
        X = np.zeros((BATCH_SIZE, seq_len, VOCAB_SIZE))
        y = np.zeros_like(X)
        
        for bi, di in enumerate(batch_ids):
            for pi, wi in enumerate(descs[di]):
                X[bi, pi, wi] = 1
            
            for pi, wi in enumerate(descs[di][1:]):
                y[bi, pi, wi] = 1
        
        return X, y
    
    def on_epoch_end(self):
        """Generate new shuffle in between epochs."""
        self._ids = np.random.permutation(len(descs))

In [12]:
tts = TTSequence()
len(tts), tts[0][0].shape, tts[0][1].shape

(2674, (64, 452, 95), (64, 452, 95))

## Model

In [13]:
HIDDEN_DIM = 100
N_LAYERS = 1
LSTM_CLASS = GRU  # LSTM if not GPU_ACTIVE else CuDNNLSTM

In [14]:
model = Sequential()
model.add(LSTM_CLASS(HIDDEN_DIM, input_shape=(None, VOCAB_SIZE), 
                     dropout=0.3, return_sequences=True))
for i in range(N_LAYERS - 1):
    model.add(LSTM_CLASS(HIDDEN_DIM, return_sequences=True))
model.add(TimeDistributed(Dense(VOCAB_SIZE, activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
print_summary(model)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, None, 100)         58800     
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 95)          9595      
Total params: 68,395
Trainable params: 68,395
Non-trainable params: 0
_________________________________________________________________


In [15]:
# plot_model(model)

## Learn

In [16]:
class HDLoss(Callback):
    def on_train_begin(self, logs={}):
        self.exp = Experiment('2.1.1: ChaRNN convergence', capture_io=False)
        
        # SUPER-hacky, but it's work (needed to supress hd output)
        self.exp._hd.out_buf.write = lambda _: _
    
    def on_train_end(self, logs={}):
        self.exp.end()

    def on_batch_end(self, n_batch, logs={}):
        self.exp.metric('n_batch', n_batch)
        self.exp.metric('loss', logs.get('loss'))

In [17]:
model.fit_generator(TTSequence(), verbose=1, epochs=5,
                    callbacks=[HDLoss()],
                    use_multiprocessing=True);

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Test

In [20]:
def generate_text(length):
    """Generate text with specific length."""
    assert length >= 1
    
    ix = [np.random.randint(VOCAB_SIZE)]
    ys = [rev_map[ix[-1]]]
    X = np.zeros((1, length, VOCAB_SIZE))
    
    for i in range(length - 1):
        X[0, i, ix[-1]] = 1
        ix = np.argmax(model.predict(X[:, :i + 1, :])[0], 1)
        ys.append(rev_map[ix[-1]])
    
    return ''.join(ys)

In [44]:
generate_text(45)

'Query and returns a service is a service DCNL'