#  Character-Based Neural Language Model in Keras

A language model predicts the next word in a sequence based on the specific words that have come before it. It is also possible to develop language models at the character level using neural networks. The benefit of character-based language models is their small vocabulary and their flexibility in handling any words, punctuation, and other document structure. This comes at the cost of requiring larger models that are slower to train. Nevertheless, in the field of neural language models, character-based models offer a lot of promise for a general, flexible and powerful approach to language modeling. After completing this section you will know:

- How to prepare text for character-based language modeling.
- How to develop a character-based language model using LSTMs.
- How to use a trained character-based language model to generate text.


# Data Preparation

The first step is to prepare the text data. We will start by defining the type of language model.

In [12]:
from numpy import array
import tensorflow as tf
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

# load doc into memory
def load_doc(filename):
    """Return the entire contents of the text file ``filename``.

    The file is opened read-only and decoded as UTF-8.

    Args:
        filename: Path of the file to read.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # NOTE(review): the mapping printed at the end of this notebook contains
    # '\ufeff', which suggests the corpus starts with a byte-order mark.
    # Decoding with 'utf-8-sig' would strip it, but that changes the
    # vocabulary size -- confirm before switching.
    # The context manager closes the handle even when read() raises.
    with open(filename, mode='r', encoding='utf-8') as file:
        return file.read()

# Clean Text 
Next, we need to clean the loaded text. We will not do much to it in this example. Specifically,
we will strip all of the newline characters so that we have one long sequence of characters
separated only by single spaces.



In [13]:
# save tokens to file, one dialog per line
# -*- coding: utf-8 -*-
def save_doc(lines, filename):
    """Write ``lines`` to ``filename``, one entry per line.

    The entries are joined with a single newline; no trailing newline is
    written. The file is opened in text mode with the platform's default
    encoding and any existing content is overwritten.

    Args:
        lines: Iterable of strings to store.
        filename: Path of the file to write.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    data = '\n'.join(lines)
    # The context manager flushes and closes the handle even when write() raises.
    with open(filename, 'w') as file:
        file.write(data)

# load text
raw_text = load_doc('hojarasca1.txt')
# Other corpora tried during experimentation:
#raw_text = load_doc('/floyd/input/dataset2/el_quijote.txt')
#raw_text = load_doc('rhyme.txt')

# clean: split on every run of whitespace (spaces, tabs, newlines) and
# re-join with single spaces, so the corpus becomes one long line of text
tokens = raw_text.split()
raw_text = ' '.join(tokens)


# Create Sequences
Now that we have a long list of characters, we can create our input-output sequences used to
train the model. Each input sequence will be 3 characters with one output character, making
each sequence 4 characters long. We can create the sequences by enumerating the characters
in the text, starting at the 4th character at index 3. The sequences are saved to a file with
the function save_doc().

In [14]:
# organize into sequences of characters
length = 3
# Slide a window over the text: each entry holds `length` input characters
# followed by the single character the model has to predict.
sequences = [
    raw_text[end - length:end + 1]
    for end in range(length, len(raw_text))
]
print('Total Sequences: %d' % len(sequences))
# persist the windows, one per line, for the training step
out_filename = 'char_sequences.txt'
save_doc(sequences, out_filename)

Total Sequences: 194958


# Train Language Model

The model will read encoded characters and predict the next character in the sequence. The first step is to load the prepared character sequence data from char_sequences.txt.

In [15]:

# load doc into memory
def load_doc(filename):
  """Return the entire contents of the text file ``filename``.

  This redefines the loader from the data-preparation step. The file is
  opened read-only in text mode with the platform's default encoding,
  which matches how ``save_doc`` writes it.

  Args:
    filename: Path of the file to read.

  Returns:
    The file contents as a single string.

  Raises:
    OSError: If the file cannot be opened.
  """
  # The context manager closes the handle even when read() raises.
  with open(filename, 'r') as file:
    return file.read()
# load the character sequences written during data preparation
in_filename = 'char_sequences.txt'
raw_text = load_doc(in_filename)
# one sequence per line; raw_text itself is kept because the vocabulary
# is derived from it in the next step
lines = raw_text.split( '\n' )


# Dictionary Mapping
We can create the mapping given a sorted set of unique characters in the
raw input data. The mapping is a dictionary of character values to integer values.
 

In [16]:
# integer encode sequences of characters
# Vocabulary: every distinct character of the loaded file, in sorted order,
# each paired with its position in that order.
# NOTE(review): raw_text still contains the '\n' separators between
# sequences, so newline receives an index although no line can contain it.
chars = sorted(set(raw_text))
mapping = {symbol: index for index, symbol in enumerate(chars)}
# Replace every character of every line by its integer code.
sequences = [[mapping[symbol] for symbol in line] for line in lines]
# vocabulary size
vocab_size = len(mapping)
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 72


# Encode Sequences
The sequences of characters must be encoded as integers. This means that each unique character
will be assigned a specific integer value and each sequence of characters will be encoded as a
sequence of integers. We can separate the columns into input and
output sequences of characters. We can do this using a simple array slice.

In [17]:
# separate into input and output
import tensorflow as tf
import numpy as np

# Ask TensorFlow to grow GPU memory on demand rather than reserving it all.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

with tf.device('/device:GPU:0'):
    # Assumes every encoded sequence has the same length, so the nested list
    # becomes a 2-D integer tensor -- TODO confirm for other corpora.
    sequences = tf.convert_to_tensor(np.array(sequences))
    # All columns but the last are the input characters ...
    X = sequences[:, :-1]
    # ... and the last column is the character to predict.
    y = sequences[:, -1]

# Evaluate both slices once to inspect them. The context manager closes the
# session even when evaluation fails (for example if the requested device
# is not available).
with tf.Session(config=config) as sess:
    print (sess.run(X))
    print (sess.run(y))



[[71  1 33]
 [ 1 33 55]
 [33 55 48]
 ...
 [ 1 12 10]
 [12 10 11]
 [10 11 14]]
[55 48 63 ... 11 14  8]


In [18]:
# one hot encode the integer-coded inputs and targets
import tensorflow as tf

# Ask TensorFlow to grow GPU memory on demand rather than reserving it all.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

with tf.device('/device:GPU:1'):
    # tf.one_hot appends an axis of size vocab_size to each tensor.
    X = tf.one_hot(X, vocab_size)
    y = tf.one_hot(y, vocab_size)

# Evaluate both tensors once to inspect them. The context manager closes the
# session even when evaluation fails.
with tf.Session(config=config) as sess:
    print (sess.run(X))
    print (sess.run(y))

[[[0. 0. 0. ... 0. 0. 1.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Define the language model
The model is defined with an input layer that takes sequences that have 3 time steps and 72
features (the vocabulary size) for the one hot encoded input sequences. The model has a single LSTM hidden layer with 75 memory cells, chosen with a little trial and error. The model has a fully connected output layer that outputs one vector with a probability distribution across all characters in the vocabulary. A softmax activation function is used on
the output layer to ensure the output has the properties of a probability distribution.


In [None]:
import tensorflow as tf

# Small functional-API example, separate from the LSTM language model:
# a 3-feature input feeds a 4-unit ReLU layer and a 5-way softmax output.
inputs = tf.keras.Input(shape=(3,))
x = tf.keras.layers.Dense(4, activation=tf.nn.relu)(inputs)
outputs = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(x)
# Wrap the layer graph into a model object.
model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [19]:
from pickle import dump
import tensorflow as tf

# Sanity check before building the model: second axis of X and vocabulary
# size, then second and third axes of X together with the shape of y.
print(X.shape[1],vocab_size)
print (X.shape[1], X.shape[2],y.shape)
def define_model(X):
    """Build and compile the character-level LSTM language model.

    The network is one LSTM layer with 75 units followed by a softmax
    ``Dense`` layer with ``vocab_size`` outputs (``vocab_size`` is read from
    module scope). As side effects the model summary is printed and a diagram
    is written to ``model.png``.

    Args:
        X: One-hot encoded inputs; only its shape is read. Axis 1 is taken as
            the number of time steps and axis 2 as the number of features.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Derive the input shape from the data instead of hard-coding (3, 72), so
    # a different sequence length or vocabulary needs no change here. int()
    # also accepts the Dimension objects TensorFlow 1.x reports for tensors.
    time_steps = int(X.shape[1])
    n_features = int(X.shape[2])
    model = Sequential()
    model.add(LSTM(75, input_shape=(time_steps, n_features)))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)
    return model

# Ask TensorFlow to grow GPU memory on demand rather than reserving it all.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.device('/device:GPU:1'):
    model = define_model(X)
    model.fit(X, y, epochs=100, steps_per_epoch=20, verbose=1)
    model.save('model.h5')
    # save the mapping; the context manager closes the file even if
    # pickling fails
    with open('mapping.pkl', 'wb') as mapping_file:
        dump(mapping, mapping_file)
# NOTE(review): this session is created after training and closed right
# away, so `config` does not appear to be applied to the fit above --
# confirm whether it was meant to be installed as the Keras backend session.
sess = tf.Session(config=config)
sess.close()

3 72
3 72 (194958, 72)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 75)                44400     
_________________________________________________________________
dense_1 (Dense)              (None, 72)                5472      
Total params: 49,872
Trainable params: 49,872
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100


In [46]:
# `load` is imported here so this cell does not depend on a later cell
# having been run first.
from pickle import load

from keras.models import load_model

# Restore the trained network saved by the training step.
model = load_model('model.h5')
# Restore the character -> integer mapping. mapping.pkl is written by this
# notebook; only unpickle files from a trusted source.
with open('mapping.pkl', 'rb') as mapping_file:
    mapping = load(mapping_file)


In [22]:
import numpy as np
from pickle import load
#from numpy import array
#from tensorflow.keras.models import load_model
from keras.models import load_model
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    """Extend ``seed_text`` with ``n_chars`` characters predicted by ``model``.

    At every step the last ``seq_length`` characters of the text produced so
    far are one-hot encoded and passed to ``model.predict``; the class with
    the highest score is mapped back to a character and appended.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            class scores per sample for a batch shaped
            ``(1, seq_length, len(mapping))``.
        mapping: Dict from character to integer class index.
        seq_length: Number of characters the model reads per prediction.
        seed_text: Text to start from; it is returned as the prefix of the
            result.
        n_chars: Number of prediction steps to run.

    Returns:
        ``seed_text`` followed by the generated characters. A predicted index
        with no entry in ``mapping`` contributes no character.

    Raises:
        KeyError: If a character inside the current input window is missing
            from ``mapping``.
    """
    vocab_size = len(mapping)
    # Build the reverse lookup once instead of scanning the mapping per step.
    index_to_char = {index: char for char, index in mapping.items()}
    positions = np.arange(seq_length)
    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_chars):
        # Only the last seq_length characters influence the prediction, so
        # only those are encoded.
        window = [mapping[char] for char in in_text[-seq_length:]]
        # Left-pad short inputs with index 0, as pad_sequences does by default.
        padded = [0] * (seq_length - len(window)) + window
        # one hot encode into a single-sample batch
        encoded = np.zeros((1, seq_length, vocab_size), dtype='float32')
        encoded[0, positions, padded] = 1.0
        # predict character: argmax over the class scores replaces
        # predict_classes, which newer Keras releases no longer provide
        scores = model.predict(encoded, verbose=0)
        yhat = int(np.argmax(scores, axis=-1)[0])
        # reverse map integer to character and append to input
        in_text += index_to_char.get(yhat, '')
    return in_text
# load the model
model = load_model('model.h5')
print (model)
# load the mapping; the context manager closes the file even if unpickling
# fails. mapping.pkl is written by this notebook -- only unpickle files from
# a trusted source.
with open('mapping.pkl', 'rb') as mapping_file:
    mapping = load(mapping_file)
print(mapping)
# generate 25 characters from a seed, feeding the model 3 characters at a time
print(generate_seq(model, mapping, 3, 'El viene cantando ', 25))

<keras.engine.sequential.Sequential object at 0x7f0d27ba8240>
{'\n': 0, ' ': 1, '!': 2, '"': 3, '(': 4, ')': 5, ',': 6, '-': 7, '.': 8, '/': 9, '0': 10, '1': 11, '2': 12, '3': 13, '4': 14, '5': 15, '6': 16, '7': 17, '8': 18, '9': 19, ':': 20, ';': 21, '?': 22, 'A': 23, 'B': 24, 'C': 25, 'D': 26, 'E': 27, 'F': 28, 'G': 29, 'H': 30, 'I': 31, 'J': 32, 'L': 33, 'M': 34, 'N': 35, 'O': 36, 'P': 37, 'Q': 38, 'R': 39, 'S': 40, 'T': 41, 'U': 42, 'V': 43, 'X': 44, 'Y': 45, 'Z': 46, 'a': 47, 'b': 48, 'c': 49, 'd': 50, 'e': 51, 'f': 52, 'g': 53, 'h': 54, 'i': 55, 'j': 56, 'l': 57, 'm': 58, 'n': 59, 'o': 60, 'p': 61, 'q': 62, 'r': 63, 's': 64, 't': 65, 'u': 66, 'v': 67, 'x': 68, 'y': 69, 'z': 70, '\ufeff': 71}
El viene cantando en el cuando en el cuando
