# Load Dataset

In [1]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text


In [2]:
raw_text = load_doc('dataset/ed_sheraan.txt')

In [3]:
print(raw_text)




# Dataset Preparation

### Remove Line Breaks

In [4]:
tokens = raw_text.split()

### Lowercase Tokens

In [5]:
tokens_lowercased = [aToken.lower() for aToken in tokens]

In [6]:
preprocessed_dataset = ' '.join(tokens_lowercased)

In [7]:
preprocessed_dataset

''

# Sequence Creation

In [8]:
sequence_length = 10

### Sequence format

Every sequences in the list for training contains __(sequence_length+1)__ characters, where the first
__sequence_length__ characters are input character sequence and the __(sequence_length+1)__ th character is
the output.


In [9]:
sequences = list()
for i in range(sequence_length, len(preprocessed_dataset)):
    seq = preprocessed_dataset[i-sequence_length: i+1]
    #print(seq)
    sequences.append(seq)

In [10]:
sequences[0:10]

[]

In [11]:
print("Total Sequences : ", len(sequences))

Total Sequences :  0


# Save the Sequenced Dataset to File
The saved processed dataset can be used later as starting point.

In [12]:
data = '\n'.join(sequences)

In [13]:
out_filePath = 'dataset/char_sequences.txt'
with open(out_filePath, 'w') as file:
    file.write(data)

In [14]:
!ls dataset

char_sequences.txt  ed_sheraan.txt  rhyme.txt


# Encode Sequences

### The Set of Characters in our sequence data

In [15]:
set(data)

set()

In [16]:
character_set = sorted(list(set(data)))

In [17]:
character_set

[]

### Integer Mapping
We represent each of the character in the sequences by a corresponding integer for fedding into
our ML model

In [18]:
mapping = dict((c, i) for i, c in enumerate(character_set))

In [19]:
mapping

{}

In [20]:
print("Vocabulary Size:: {}".format(len(mapping)))

Vocabulary Size:: 0


### Save the Mapping for Later Use (Character Generation)

In [21]:
from pickle import dump

In [22]:
dump(mapping, open('others/mapping.pkl', 'wb'))

### Sequence Encoding
We replace each of the character in sequences with their corresponding mapping, as obtained
above.

In [23]:
lines = data.split('\n')

In [24]:
int_encoded_sequences = list()

for line in lines:
    encoded_seq = [mapping[char] for char in line]
    int_encoded_sequences.append(encoded_seq)

In [25]:
int_encoded_sequences[0:10]

[[]]

# Split Sequences for Input and Output
For any given sequence, the first (such as, 10) characters are treated are input feature **X** and the last character is treated as the ouput i.e., __y__

In [26]:
from numpy import array
sequences = array(int_encoded_sequences)

In [27]:
sequences

array([], shape=(1, 0), dtype=float64)

In [28]:
X, y = sequences[:,:-1], sequences[:,-1]
X.shape

IndexError: index -1 is out of bounds for axis 1 with size 0

In [None]:
y.shape

In [None]:
X

In [None]:
y

# One-hot Encode
We one-hot encode each of the character in **X** and **y**. We use **to_categorical()** method of keras for the purpose.

In [None]:
from keras.utils import to_categorical
vocab_size = len(mapping)
sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
X = array(sequences)
y = to_categorical(y, num_classes=vocab_size)

In [None]:
X

In [None]:
X.shape

In [None]:
X[0][0]

In [None]:
len(X[0])

In [None]:
len(X[0][0])

In [None]:
X[0][1]

# Fit Model

### Define the Model
As input the model takes **sequence_length** of time steps each containing **vocab_size** of one hot encoded features. Then use single LSTM layer with 75 memory units (i.e., can be changed with trail and error). Finally, the output layer is one vector with size of **vocab_size**. 

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

# define model
model = Sequential()
model.add(LSTM(75, input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

In [None]:
X.shape[1]

In [None]:
X.shape[2]

### Fit model
The model is trained 100 training epochs (i.e., can be changed with trail and error). The model uses **categorical_crossentropy** as loss function for its a multi-class classification problem. Using efficient **adam** for gradient descent. The model reports **accuracy** metric at the end of training of each batch.

In [None]:
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, epochs=100, verbose=2)

### Save the Trained Model

In [None]:
model.save('models/model.h5')

# Generate Text 
We now use the trained model to generate character sequences.

As input the model takes **sequence_length** number of characters and generate or predict the next character that is likely to appear next. We then use the newly generated character in the sequence as the last character, while removing/truncating the very first character to generate another new character. The process is continued for the total number of characters expected.


In [29]:
from pickle import load
from keras.models import load_model
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
	in_text = seed_text
	# generate a fixed number of characters
	for _ in range(n_chars):
		# encode the characters as integers
		encoded = [mapping[char] for char in in_text]
		# truncate sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
		# one hot encode
		encoded = to_categorical(encoded, num_classes=len(mapping))
		#encoded = encoded.reshape(1, encoded.shape[0], encoded.shape[1])
		# predict character
		yhat = model.predict_classes(encoded, verbose=0)
		# reverse map integer to character
		out_char = ''
		for char, index in mapping.items():
			if index == yhat:
				out_char = char
				break
		# append to input
		in_text += char
	return in_text

# load the model
model = load_model('models/model.h5')
# load the mapping
mapping = load(open('others/mapping.pkl', 'rb'))

# test start of rhyme
print(generate_seq(model, mapping, 10, 'so the bar', 50))


Using TensorFlow backend.


KeyError: 's'