In [2]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dropout
import keras.utils as ku 
import numpy as np
import pandas as pd
from pandas import DataFrame

ModuleNotFoundError: No module named 'keras'

In [49]:
data = """The cat and her kittens
They put on their mittens,
To eat a Christmas pie.
The poor little kittens."""


In [108]:
len(data)

99

There will be three main parts of the code: dataset preparation, model training, and generating prediction. The boiler plate code of this architecture is following:



In dataset preparation step, we will first perform Tokenization. Tokenization is a process of 
extracting tokens (terms / words) from a corpus. Python’s library Keras has inbuilt model for tokenization 
which can be used to obtain the tokens and their index in the corpus.

In [50]:
tokenizer = Tokenizer()

In [51]:
#a = "The cat and her kittens,They put on their mittens,To eat a Christmas pie. The poor little kittens,They lost their mittens, And then they began to cry.mother dear, we sadly fear,We cannot go to-day,For we have lost our mittens.If it be so, ye shall not go,For ye are naughty kittens."

In [73]:
def dataset_preparation(data):
    corpus = data.lower().split("\n")    
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    return corpus,total_words

In [74]:
corpus,total_words = dataset_preparation(data)

In [75]:
corpus

['the cat and her kittens',
 'they put on their mittens,',
 'to eat a christmas pie.',
 'the poor little kittens.']

In [55]:
total_words

18

Next, we need to convert the corpus into a flat dataset of sentence sequences.

In [56]:
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(0, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)


In [57]:
token_l = tokenizer.texts_to_sequences([data])[0]
len(token_l)    

19

In [78]:
input_sequences[18]

array([ 0,  1, 16, 17,  2], dtype=int32)

In [80]:
input_sequences.shape

(19, 5)

In [81]:
input_sequences[:5]

array([[0, 0, 0, 0, 1],
       [0, 0, 0, 1, 3],
       [0, 0, 1, 3, 4],
       [0, 1, 3, 4, 5],
       [1, 3, 4, 5, 2]], dtype=int32)

Now that we have generated a data-set which contains sequence of tokens, it is possible that different sequences 
have different lengths. Before starting training the model, we need to pad the sequences and make their lengths equal. 
We can use pad_sequence function of Kears for this purpose.

In [82]:
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences,maxlen=max_sequence_len, padding='pre'))

In [83]:
input_sequences.shape

(19, 5)

In [84]:
max_sequence_len

5

In [111]:
input_sequences

array([[ 0,  0,  0,  0,  1],
       [ 0,  0,  0,  1,  3],
       [ 0,  0,  1,  3,  4],
       [ 0,  1,  3,  4,  5],
       [ 1,  3,  4,  5,  2],
       [ 0,  0,  0,  0,  6],
       [ 0,  0,  0,  6,  7],
       [ 0,  0,  6,  7,  8],
       [ 0,  6,  7,  8,  9],
       [ 6,  7,  8,  9, 10],
       [ 0,  0,  0,  0, 11],
       [ 0,  0,  0, 11, 12],
       [ 0,  0, 11, 12, 13],
       [ 0, 11, 12, 13, 14],
       [11, 12, 13, 14, 15],
       [ 0,  0,  0,  0,  1],
       [ 0,  0,  0,  1, 16],
       [ 0,  0,  1, 16, 17],
       [ 0,  1, 16, 17,  2]], dtype=int32)

To input this data into a learning model, we need to create predictors and label.
We will create N-grams sequence as predictors and 
the next word of the N-gram as label.

"""
Sentence: "we are doing great in Styfi"
PREDICTORS             | LABEL
we                     | are
we are                 | doing
we are doing           | great
we are doing great     | in
we are doing great in  | styfi
"""

In [127]:
a = np.array([[2,3],[4,5]])
a[:,-1]

array([3, 5])

In [114]:
predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
#

In [128]:
label

array([ 1,  3,  4,  5,  2,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  1, 16,
       17,  2], dtype=int32)

In [115]:
input_sequences,predictors

(array([[ 0,  0,  0,  0,  1],
        [ 0,  0,  0,  1,  3],
        [ 0,  0,  1,  3,  4],
        [ 0,  1,  3,  4,  5],
        [ 1,  3,  4,  5,  2],
        [ 0,  0,  0,  0,  6],
        [ 0,  0,  0,  6,  7],
        [ 0,  0,  6,  7,  8],
        [ 0,  6,  7,  8,  9],
        [ 6,  7,  8,  9, 10],
        [ 0,  0,  0,  0, 11],
        [ 0,  0,  0, 11, 12],
        [ 0,  0, 11, 12, 13],
        [ 0, 11, 12, 13, 14],
        [11, 12, 13, 14, 15],
        [ 0,  0,  0,  0,  1],
        [ 0,  0,  0,  1, 16],
        [ 0,  0,  1, 16, 17],
        [ 0,  1, 16, 17,  2]], dtype=int32), array([[ 0,  0,  0,  0],
        [ 0,  0,  0,  1],
        [ 0,  0,  1,  3],
        [ 0,  1,  3,  4],
        [ 1,  3,  4,  5],
        [ 0,  0,  0,  0],
        [ 0,  0,  0,  6],
        [ 0,  0,  6,  7],
        [ 0,  6,  7,  8],
        [ 6,  7,  8,  9],
        [ 0,  0,  0,  0],
        [ 0,  0,  0, 11],
        [ 0,  0, 11, 12],
        [ 0, 11, 12, 13],
        [11, 12, 13, 14],
        [ 0,  0,  0,  0],


In [116]:
label.shape

(19,)

In [105]:
label = ku.to_categorical(label, num_classes=total_words)

In [107]:
label.shape

(19, 18)

In [17]:
predictors.shape

(48, 7)

In [18]:
label[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [19]:
label.shape

(48, 43)

Lets architecture a LSTM model in our code. I have added total three layers in the model.

Input Layer : Takes the sequence of words as input
LSTM Layer : Computes the output using LSTM units. I have added 100 units in the layer, but this number can be fine tuned later.
Dropout Layer : A regularisation layer which randomly turns-off the activations of some neurons in the LSTM layer. It helps in preventing over fitting.
Output Layer : Computes the probability of the best possible next word as output

In [20]:
max_sequence_len

8

In [21]:
total_words

43

In [22]:
def create_model(predictors, label, max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    model.add(Embedding(total_words, 10`, input_length=input_len))
    model.add(LSTM(1500))
    model.add(Dropout(0.1))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    model.fit(predictors, label, epochs=100, verbose=1)
    return model

In [23]:
total_words

43

In [24]:
print(Embedding(total_words, 10, input_length=7))

<keras.layers.embeddings.Embedding object at 0x7f93532b21d0>


In [25]:
def generate_text(seed_text, next_words, max_sequence_len, model):
    for j in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen= 
                             max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
  
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

In [26]:
X = predictors
Y = label

In [27]:
predictors.shape

(48, 7)

In [28]:
Y.shape

(48, 43)

In [29]:
#create_model(X,Y,max_sequence_len,total_words)

In [31]:
from keras.models import load_model
# #model = load_model('/home/leena/Sohel/weights-improvement-56-3.60.hdf5')
model = load_model('/home/leena/Sohel/weights-improvement-16-4.51.hdf5')

In [32]:
text = generate_text("we naughty", 3, max_sequence_len, model)
#print text

ValueError: Error when checking input: expected embedding_1_input to have shape (108,) but got array with shape (7,)

In [6]:
import keras
from keras.layers import RNN

In [7]:
# First, let's define a RNN Cell, as a layer subclass.

class MinimalRNNCell(keras.layers.Layer):

    def __init__(self, units, **kwargs):
        self.units = units
        self.state_size = units
        super(MinimalRNNCell, self).__init__(**kwargs)

    def build(self, input_shape):
        self.kernel = self.add_weight(shape=(input_shape[-1], self.units),
                                      initializer='uniform',
                                      name='kernel')
        self.recurrent_kernel = self.add_weight(
            shape=(self.units, self.units),
            initializer='uniform',
            name='recurrent_kernel')
        self.built = True

    def call(self, inputs, states):
        prev_output = states[0]
        h = K.dot(inputs, self.kernel)
        output = h + K.dot(prev_output, self.recurrent_kernel)
        return output, [output]

# Let's use this cell in a RNN layer:

cell = MinimalRNNCell(32)
x = keras.Input((None, 5))
layer = RNN(cell)
y = layer(x)

# Here's how to use the cell to build a stacked RNN:

cells = [MinimalRNNCell(32), MinimalRNNCell(64)]
x = keras.Input((None, 5))
layer = RNN(cells)
y = layer(x)


NameError: name 'K' is not defined

In [129]:
e = Embedding(5, 2, input_length=5)

<keras.layers.embeddings.Embedding at 0x7f597b92f320>