#### Assignment 11
Using section 8.1 in Deep Learning with Python as a guide, implement an LSTM text generator. Train the model on the Enron corpus or a text source of your choice. Save the model and generate 20 examples to the results directory of dsc650/assignments/assignment11/

In [2]:
import os
import json
from pathlib import Path

In [3]:
current_dir = Path(os.getcwd()).absolute()
results_dir = current_dir.joinpath('results')
results_dir.mkdir(parents=True, exist_ok=True)
data_dir = current_dir.joinpath('data')
data_dir.mkdir(parents=True, exist_ok=True)
corpus_data_dir = data_dir.joinpath('corpus')

print(current_dir)
print(results_dir)
print(corpus_data_dir)

c:\Users\saman\git_repos\dsc650\dsc650\assignments\assignments11
c:\Users\saman\git_repos\dsc650\dsc650\assignments\assignments11\results
c:\Users\saman\git_repos\dsc650\dsc650\assignments\assignments11\data\corpus


In [4]:
# Downloading and parsing the initial text
import keras
import numpy as np

path = keras.utils.get_file(
	'3090-0.txt',
	origin='https://www.gutenberg.org/files/3090/3090-0.txt')
print('Downloaded into: ', path)
text = open(path,encoding = 'utf-8').read().lower()
print('Corpus length:', len(text))

Using TensorFlow backend.


Downloaded into:  C:\Users\saman\.keras\datasets\3090-0.txt
Corpus length: 2730110


In [5]:
# Vectorizing sequences of characters

maxlen = 60 # extract sequences of 60 characters
step = 3 # sample a new sequence every 3 characters
sentences = [] # holds the extracted sequences
next_chars = [] # holds the targets (in this case the next character)
for i in range(0, len(text) - maxlen, step):
	sentences.append(text[i: i + maxlen])
	next_chars.append(text[i + maxlen])
print('Number of sequences:', len(sentences) )

Number of sequences: 910017


In [6]:
sentences[:10]

['\ufeffthe project gutenberg ebook of original short stories, comp',
 'e project gutenberg ebook of original short stories, complet',
 'roject gutenberg ebook of original short stories, complete, ',
 'ect gutenberg ebook of original short stories, complete, by ',
 ' gutenberg ebook of original short stories, complete, by guy',
 'tenberg ebook of original short stories, complete, by guy de',
 'berg ebook of original short stories, complete, by guy de ma',
 'g ebook of original short stories, complete, by guy de maupa',
 'book of original short stories, complete, by guy de maupassa',
 'k of original short stories, complete, by guy de maupassant\n']

In [7]:
next_chars[:10]

['l', 'e', 'b', 'g', ' ', ' ', 'u', 's', 'n', '\n']

In [17]:
# list of unique characters in the corpus
chars = sorted(list(set(text)))
print('Unique characters:', len(chars))
# Dictionary that maps unique characters to their index in the list "chars"
char_indices = dict((char, chars.index(char)) for char in chars)

Unique characters: 60


In [18]:
'  '.join(chars)

'\n     !  "  #  $  %  \'  (  )  *  ,  -  .  /  0  1  2  3  4  5  6  7  8  9  :  ;  ?  [  ]  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o  p  q  r  s  t  u  v  w  x  y  z  —  “  ”  \ufeff'

In [21]:
char_indices

{'\n': 0,
 ' ': 1,
 '!': 2,
 '"': 3,
 '#': 4,
 '$': 5,
 '%': 6,
 "'": 7,
 '(': 8,
 ')': 9,
 '*': 10,
 ',': 11,
 '-': 12,
 '.': 13,
 '/': 14,
 '0': 15,
 '1': 16,
 '2': 17,
 '3': 18,
 '4': 19,
 '5': 20,
 '6': 21,
 '7': 22,
 '8': 23,
 '9': 24,
 ':': 25,
 ';': 26,
 '?': 27,
 '[': 28,
 ']': 29,
 'a': 30,
 'b': 31,
 'c': 32,
 'd': 33,
 'e': 34,
 'f': 35,
 'g': 36,
 'h': 37,
 'i': 38,
 'j': 39,
 'k': 40,
 'l': 41,
 'm': 42,
 'n': 43,
 'o': 44,
 'p': 45,
 'q': 46,
 'r': 47,
 's': 48,
 't': 49,
 'u': 50,
 'v': 51,
 'w': 52,
 'x': 53,
 'y': 54,
 'z': 55,
 '—': 56,
 '“': 57,
 '”': 58,
 '\ufeff': 59}

In [30]:
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
	for t, char in enumerate(sentence):
		x[i, t, char_indices[char]] = 1
	y[i, char_indices[next_chars[i]]] = 1


In [31]:
# Single layer LSTM model for next-character prediction
# Tis network is a single LSTM layer followed by a Dense classifier and softmax over all possible characters
from keras import layers

model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(maxlen, len(chars))))
model.add(layers.Dense(len(chars), activation = 'softmax'))

In [32]:
# Model compilation configuration
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)