In [1]:
from __future__ import print_function
import numpy as np


def generate_text(model, length, vocab_size, ix_to_char):
    """Generate text from a character-level model by greedy decoding.

    A random character index is drawn as the seed. At every step the
    current character is one-hot encoded into the input buffer, echoed to
    stdout, and the model is asked to predict on the prefix built so far;
    the arg-max of the prediction at the last time step becomes the next
    character.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)``. The first element of its result
        is reduced with ``np.argmax(..., 1)``, so it must be 2-D per sample
        (one row per time step, one column per vocabulary entry).
    length : int
        Number of generation steps (characters echoed to stdout).
    vocab_size : int
        Number of distinct characters; width of the one-hot encoding.
    ix_to_char : mapping
        Maps an integer index to its character.

    Returns
    -------
    str
        The seed character followed by ``length`` predicted characters,
        i.e. ``length + 1`` characters. The final predicted character is
        part of the return value but is never echoed or fed back.
    """
    # Seed with a uniformly random character index.
    current_ix = np.random.randint(vocab_size)
    generated = [ix_to_char[current_ix]]

    # One-hot input buffer for the full sequence, filled step by step.
    X = np.zeros((1, length, vocab_size))
    for step in range(length):
        X[0, step, current_ix] = 1
        print(ix_to_char[current_ix], end="")

        # Predict on the prefix only and keep the arg-max of the last step.
        step_scores = model.predict(X[:, :step + 1, :])[0]
        current_ix = int(np.argmax(step_scores, 1)[-1])
        generated.append(ix_to_char[current_ix])

    # Terminate the streamed line so that whatever is printed next (for
    # example a training progress log) starts on a fresh line.
    print()
    return ''.join(generated)

# method for preparing the training data
def load_data(data_dir, seq_length):
    """Read a text file and build one-hot input/target sequence tensors.

    The text is cut into consecutive, non-overlapping windows of
    ``seq_length`` characters. For every input window the target window is
    the same span shifted one character to the right, so each window needs
    ``seq_length + 1`` characters of text. Trailing characters that do not
    fill a complete window are dropped.

    Parameters
    ----------
    data_dir : str
        Path of the text file to read (despite the name, a file path).
    seq_length : int
        Number of characters per sequence; must be at least 1.

    Returns
    -------
    X : numpy.ndarray
        One-hot inputs of shape ``(num_sequences, seq_length, VOCAB_SIZE)``.
    y : numpy.ndarray
        One-hot targets with the same shape as ``X``.
    VOCAB_SIZE : int
        Number of distinct characters found in the file.
    ix_to_char : dict
        Maps each integer index to its character.

    Raises
    ------
    ValueError
        If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError('seq_length must be a positive integer')

    # Context manager so the file handle is closed deterministically.
    with open(data_dir, 'r') as handle:
        data = handle.read()

    # Sorted so the index <-> character mapping is the same on every run
    # (plain set iteration order is arbitrary).
    chars = sorted(set(data))
    VOCAB_SIZE = len(chars)

    print('Data length: {} characters'.format(len(data)))
    print('Vocabulary size: {} characters'.format(VOCAB_SIZE))

    ix_to_char = dict(enumerate(chars))
    char_to_ix = {char: ix for ix, char in enumerate(chars)}

    # Floor division keeps the count an int on Python 3. One character is
    # reserved because the last target window reaches one position past the
    # last input window; without it a text whose length is an exact multiple
    # of seq_length would yield a target window that is one character short.
    num_sequences = max((len(data) - 1) // seq_length, 0)

    X = np.zeros((num_sequences, seq_length, VOCAB_SIZE))
    y = np.zeros((num_sequences, seq_length, VOCAB_SIZE))
    steps = np.arange(seq_length)
    for i in range(num_sequences):
        start = i * seq_length
        X_sequence_ix = [char_to_ix[value] for value in data[start:start + seq_length]]
        y_sequence_ix = [char_to_ix[value] for value in data[start + 1:start + seq_length + 1]]
        # Paired integer indexing sets one cell per time step (one-hot).
        X[i, steps, X_sequence_ix] = 1.
        y[i, steps, y_sequence_ix] = 1.
    return X, y, VOCAB_SIZE, ix_to_char

In [2]:

import matplotlib.pyplot as plt

import time
import csv
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM, SimpleRNN
from keras.layers.wrappers import TimeDistributed

Using TensorFlow backend.


In [3]:
# Run configuration: corpus location and the sizes used for data preparation,
# the model and training.
# NOTE(review): DATA_DIR is an absolute, user-specific path, so this notebook
# only runs where that exact file exists — consider a relative or
# configurable location.
DATA_DIR = '/home/hrituraj/Dataset.txt'
BATCH_SIZE = 50  # passed to model.fit as batch_size
HIDDEN_DIM = 500  # first argument of every LSTM layer in the model
SEQ_LENGTH = 50  # characters per training sequence (passed to load_data)

In [4]:
GENERATE_LENGTH = 50  # number of generation steps passed to generate_text
LAYER_NUM = 2  # total LSTM layers: one input layer plus LAYER_NUM - 1 more

In [5]:
# Build the one-hot input/target tensors from the corpus. VOCAB_SIZE and
# ix_to_char come from the data itself and are reused by the model and by
# generate_text below.
X, y, VOCAB_SIZE, ix_to_char = load_data(DATA_DIR, SEQ_LENGTH)

Data length: 137629 characters
Vocabulary size: 81 characters


In [6]:
# Stacked LSTM model: LAYER_NUM recurrent layers followed by a per-time-step
# Dense layer and a softmax over the vocabulary.
model = Sequential()
# The time dimension is left as None so sequences of any length are accepted
# (generate_text feeds prefixes of growing length).
model.add(LSTM(HIDDEN_DIM, input_shape=(None, VOCAB_SIZE), return_sequences=True))
# Remaining recurrent layers; the loop variable itself is unused.
for i in range(LAYER_NUM - 1):
    model.add(LSTM(HIDDEN_DIM, return_sequences=True))
# return_sequences=True above keeps an output for every time step, which the
# TimeDistributed Dense then maps to VOCAB_SIZE scores per step.
model.add(TimeDistributed(Dense(VOCAB_SIZE)))
model.add(Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

In [7]:
# Sample from the model before the training cell below has run (in notebook
# order), as a baseline. The text is streamed to stdout by generate_text and
# the returned string is also shown as this cell's result.
generate_text(model, GENERATE_LENGTH, VOCAB_SIZE, ix_to_char)

1��SSk33SS;;JJPf/ffBBGGGGG--mm-SwwwCCCCWErrrrrrgg5

'1\xa7\xa7SSk33SS;;JJPf/ffBBGGGGG--mm-SwwwCCCCWErrrrrrgg55'

In [None]:
# Train one epoch per iteration and print a generated sample after each, so
# progress can be judged from the text itself.
# NOTE(review): `nb_epoch` looks like the older Keras spelling of this
# keyword — confirm against the installed Keras version.
epochs = 10
for i in range(epochs):
    model.fit(X, y, batch_size=BATCH_SIZE, verbose=1, nb_epoch=1)
    generate_text(model, GENERATE_LENGTH, VOCAB_SIZE, ix_to_char)



Epoch 1/1
 he             h                                 Epoch 1/1
he therererererererererererererererererererererereEpoch 1/1
� ther ther ther ther ther ther ther ther ther theEpoch 1/1
; the soone the soone the soone the soone the soonEpoch 1/1
" that her torter to that to the roor that to the Epoch 1/1
Ne the hard her hard her hard her hard her hard heEpoch 1/1