**Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, GRU
from keras.callbacks import ModelCheckpoint

**Pre-processing the Text Data**

In [6]:
import re

def text_cleaner(text):
    # lower case text
    newString = text.lower()
    newString = re.sub(r"'s\b","",newString)
    # remove punctuations
    newString = re.sub("[^a-zA-Z]", " ", newString)
    long_words=[]
    # remove short word
    for i in newString.split():
        if len(i)>=3:                  
            long_words.append(i)
    return (" ".join(long_words)).strip()

In [9]:
data = """Once upon a TIME, in a LAND far, FAR away—there LIVED a smAll but COURAGEOUS cat named WhiskerS! ShE would ofTen sPriNg and Pounce, chasing dappled shadows & curious LIGHTS: illuminating a PATH towards hidden TREASURES. In her AdVenturEs, she'd encounter dArK creatures, mystical SPELLS, and ALLY unexpected, LIGHTNING-FAST and slow-moving, one step at a time, WhiskerS wOuld OVERCOME each trial. Victorious and UNYIELDING, she'd return to her cozy abode, MAJESTICALLY PURRING with contentment 10/10"""

# preprocess the text
data_new = text_cleaner(data)
print(data_new)

once upon time land far far away there lived small but courageous cat named whiskers she would often spring and pounce chasing dappled shadows curious lights illuminating path towards hidden treasures her adventures she encounter dark creatures mystical spells and ally unexpected lightning fast and slow moving one step time whiskers would overcome each trial victorious and unyielding she return her cozy abode majestically purring with contentment


### Once the sequences are generated, the next step is to encode each character.

**Encoding Sequences**

In [11]:
# Create a character mapping index
chars = sorted(list(set(data_new)))
mapping = dict((c, i) for i, c in enumerate(chars))
print(mapping)

def encode_seq(seq):
    sequences = list()
    for line in seq:
        # integer encode line
        encoded_seq = [mapping[char] for char in line]
        # store
        sequences.append(encoded_seq)
    return sequences

# encode the sequences
sequences = encode_seq(data_new)
print(sequences)

{' ': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25}
[[15], [14], [3], [5], [0], [20], [16], [15], [14], [0], [19], [9], [13], [5], [0], [12], [1], [14], [4], [0], [6], [1], [17], [0], [6], [1], [17], [0], [1], [22], [1], [24], [0], [19], [8], [5], [17], [5], [0], [12], [9], [21], [5], [4], [0], [18], [13], [1], [12], [12], [0], [2], [20], [19], [0], [3], [15], [20], [17], [1], [7], [5], [15], [20], [18], [0], [3], [1], [19], [0], [14], [1], [13], [5], [4], [0], [22], [8], [9], [18], [11], [5], [17], [18], [0], [18], [8], [5], [0], [22], [15], [20], [12], [4], [0], [15], [6], [19], [5], [14], [0], [18], [16], [17], [9], [14], [7], [0], [1], [14], [4], [0], [16], [15], [20], [14], [3], [5], [0], [3], [8], [1], [18], [9], [14], [7], [0], [4], [1], [16], [16], [12], [5], [4], [0], [18], [8], [1], [4], [15], [22], [18], 

### Building a Model

**We will use the embedding layer of Keras to learn a 50 dimension embedding for each character. This helps the model in understanding
complex relationships between characters. We will also use a GRU layer as the base model, which has 150 time steps.
Finally,  a dense layer with a Softmax activation for prediction.**

In [12]:
# Define vocabulary size
vocab = len(chars)

# Define model
model = Sequential()
model.add(Embedding(vocab, 50, input_length=30, trainable=True))
model.add(GRU(150, recurrent_dropout=0.1, dropout=0.1))
model.add(Dense(vocab, activation='softmax'))
print(model.summary())


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 50)            1300      
                                                                 
 gru (GRU)                   (None, 150)               90900     
                                                                 
 dense (Dense)               (None, 26)                3926      
                                                                 
Total params: 96126 (375.49 KB)
Trainable params: 96126 (375.49 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [13]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# # Fitting the model
# model.fit(X, y, epochs=100, verbose=2)

**Once the model has finished training, we can generate text from the model given an input sequence**

In [14]:
# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_chars):
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # one hot encode
        encoded = to_categorical(encoded, num_classes=len(mapping))
        # predict character
        yhat = model.predict_classes(encoded, verbose=0)
        # reverse map integer to character
        out_char = ''
        for char, index in mapping.items():
            if index == yhat:
                out_char = char
                break
        # append to input
        in_text += char
    return in_text