<a href="https://colab.research.google.com/github/rinazbelhaj/EIP/blob/master/Assignment%206/Text_Generation_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive; the training text is read from a path under this mount.
from google.colab import drive

drive.mount(
    '/content/gdrive',
    force_remount=True,
)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


# Model as per the Blog

In [0]:
# Load Larger LSTM network and generate text
import sys
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

# load ascii text and convert to lowercase
filename = "gdrive/My Drive/Data/EIP/wonderland.txt"
# a context manager closes the file handle as soon as the text has been read
with open(filename) as text_file:
    raw_text = text_file.read().lower()

# build the sorted character vocabulary plus lookups in both directions
chars = sorted(set(raw_text))
char_to_int = {ch: idx for idx, ch in enumerate(chars)}
int_to_char = {idx: ch for idx, ch in enumerate(chars)}

# report corpus size and vocabulary size
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

# encode every seq_length-character window (input) and the single character
# that follows it (target) as integers
seq_length = 100
window_starts = range(n_chars - seq_length)
dataX = [
    [char_to_int[ch] for ch in raw_text[begin:begin + seq_length]]
    for begin in window_starts
]
dataY = [char_to_int[raw_text[begin + seq_length]] for begin in window_starts]
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize the integer codes (0 .. n_vocab - 1) into [0, 1)
X = X / float(n_vocab)
# one hot encode the output variable; pin the width to the vocabulary size so
# there is one column per character even when the highest-indexed character
# never occurs as a target (otherwise the width is inferred as max(dataY) + 1)
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

# define the LSTM model: a 256-unit LSTM, dropout, then a softmax layer with
# one output per one-hot column of y
input_shape = (X.shape[1], X.shape[2])
model = Sequential([
    LSTM(256, input_shape=input_shape),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

# checkpoint: save to `filepath` whenever the training loss reaches a new minimum
filepath = "best-model.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

# train the network
model.fit(
    X,
    y,
    batch_size=128,
    epochs=20,
    callbacks=callbacks_list,
)

# restore the weights from the checkpoint file written during training,
# then compile the model again with the same loss and optimizer
filename = "best-model.hdf5"
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

# pick a random seed pattern; randint's upper bound is exclusive, so pass
# len(dataX) to make every pattern (including the last one) selectable
start = numpy.random.randint(0, len(dataX))
# copy the pattern so the sliding window below never mutates dataX itself
pattern = list(dataX[start])
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

print("Generated Text : ")
print("\n")
# generate characters
for _ in range(1000):
    # shape the current window as [samples, time steps, features] and scale it
    # by the vocabulary size, as was done for the training inputs
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # greedy decoding: always emit the highest-scoring next character
    index = numpy.argmax(prediction)
    result = int_to_char[index]
    sys.stdout.write(result)
    # slide the window: append the new character and drop the oldest one
    pattern.append(index)
    pattern = pattern[1:]
print("\nDone.")

Total Characters:  144442
Total Vocab:  46
Total Patterns:  144342
Epoch 1/20

Epoch 00001: loss improved from inf to 2.95682, saving model to best-model.hdf5
Epoch 2/20

Epoch 00002: loss improved from 2.95682 to 2.75359, saving model to best-model.hdf5
Epoch 3/20

Epoch 00003: loss improved from 2.75359 to 2.65016, saving model to best-model.hdf5
Epoch 4/20

Epoch 00004: loss improved from 2.65016 to 2.57686, saving model to best-model.hdf5
Epoch 5/20

Epoch 00005: loss improved from 2.57686 to 2.51626, saving model to best-model.hdf5
Epoch 6/20

Epoch 00006: loss improved from 2.51626 to 2.46120, saving model to best-model.hdf5
Epoch 7/20

Epoch 00007: loss improved from 2.46120 to 2.40950, saving model to best-model.hdf5
Epoch 8/20

Epoch 00008: loss improved from 2.40950 to 2.36051, saving model to best-model.hdf5
Epoch 9/20

Epoch 00009: loss improved from 2.36051 to 2.31563, saving model to best-model.hdf5
Epoch 10/20

Epoch 00010: loss improved from 2.31563 to 2.26936, saving m

# Revised Model with the following changes
1. Predict 500 characters only
2. Remove all the punctuation from the source text
3. Train the model on padded, sentence-level sequences rather than fixed-length windows that start at arbitrary characters.
4. Train the model for 100 epochs
5. Add dropout to the input layer, remove it from the layer before dense layer. Use Dropout value of 0.1 everywhere.

In [0]:
# Load Larger LSTM network and generate text
import sys
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

# load ascii text and convert to lowercase
filename = 'gdrive/My Drive/Data/EIP/wonderland.txt'
# a context manager closes the file handle as soon as the text has been read
with open(filename) as text_file:
    raw_text = text_file.read().lower()

Using TensorFlow backend.


In [0]:
# turn newlines into spaces, then delete all punctuation except full stops
# (they are kept as sentence delimiters) along with the '\ufeff' character
import string

punctuation = string.punctuation.replace(".", "") + '\n' + '\ufeff'
strip_table = str.maketrans('', '', punctuation)
new_text = raw_text.replace("\n", " ").translate(strip_table)

# map each remaining character to an integer code (sorted order)
chars = sorted(set(new_text))
char_to_int = {ch: idx for idx, ch in enumerate(chars)}
char_to_int

{' ': 0,
 '.': 1,
 '0': 2,
 '3': 3,
 'a': 4,
 'b': 5,
 'c': 6,
 'd': 7,
 'e': 8,
 'f': 9,
 'g': 10,
 'h': 11,
 'i': 12,
 'j': 13,
 'k': 14,
 'l': 15,
 'm': 16,
 'n': 17,
 'o': 18,
 'p': 19,
 'q': 20,
 'r': 21,
 's': 22,
 't': 23,
 'u': 24,
 'v': 25,
 'w': 26,
 'x': 27,
 'y': 28,
 'z': 29}

In [0]:
# report the size of the cleaned text and of its character vocabulary
n_chars, n_vocab = len(new_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  137111
Total Vocab:  30


In [0]:
from keras.preprocessing.sequence import pad_sequences

# split the text into sentences at full stops
new_list = new_text.split(".")

# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
dataX = []
dataY = []
for sentence in new_list:
    # work on `sentence` directly: rebinding raw_text here would overwrite the
    # corpus loaded earlier and break re-running the cleaning cell
    sentence_len = len(sentence)
    for i in range(sentence_len):
        # the window is at most seq_length long and is cut short near the end
        # of the sentence so the target character stays inside the sentence
        window = min(sentence_len - i - 1, seq_length)
        seq_in = sentence[i:i + window]
        seq_out = sentence[i + window]
        dataX.append([char_to_int[char] for char in seq_in])
        dataY.append(char_to_int[seq_out])
# count once, after all sentences are processed (also defined for empty input)
n_patterns = len(dataX)
# pad every pattern to exactly seq_length time steps so the later reshape to
# (n_patterns, seq_length, 1) holds even if no sentence exceeds seq_length
# NOTE(review): the default pad value 0 is also the code of the lowest-sorted
# character (' ' in the mapping printed earlier), so padding cannot be told
# apart from spaces; consider reserving index 0 for padding
dataX = pad_sequences(dataX, maxlen=seq_length)
print("Total Patterns: ", n_patterns)

Total Patterns:  136121


In [0]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize the integer codes (0 .. n_vocab - 1) into [0, 1)
X = X / float(n_vocab)
# one hot encode the output variable; pin the width to the vocabulary size so
# there is one column per character even when the highest-indexed character
# never occurs as a target (otherwise the width is inferred as max(dataY) + 1)
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

In [0]:
# define the LSTM model: dropout on the input, two stacked 256-unit LSTMs with
# dropout between them, and a softmax layer with one output per column of y
input_shape = (X.shape[1], X.shape[2])
model = Sequential([
    Dropout(0.1, input_shape=input_shape),
    LSTM(256, return_sequences=True),
    Dropout(0.1),
    LSTM(256),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()

W0727 15:25:20.041573 140073765947264 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0727 15:25:20.068116 140073765947264 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3295: The name tf.log is deprecated. Please use tf.math.log instead.



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_3 (Dropout)          (None, 100, 1)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100, 256)          264192    
_________________________________________________________________
dropout_4 (Dropout)          (None, 100, 256)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense_1 (Dense)              (None, 30)                7710      
Total params: 797,214
Trainable params: 797,214
Non-trainable params: 0
_________________________________________________________________


In [0]:
# checkpoint: save to `filepath` whenever the training loss reaches a new minimum
filepath = "best-model.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [15]:
# train for up to 100 epochs in batches of 128, running the callbacks in
# callbacks_list during training
model.fit(
    X,
    y,
    batch_size=128,
    epochs=100,
    callbacks=callbacks_list,
)

Epoch 1/100

Epoch 00001: loss improved from inf to 2.80634, saving model to best-model.hdf5
Epoch 2/100

Epoch 00002: loss improved from 2.80634 to 2.45867, saving model to best-model.hdf5
Epoch 3/100

Epoch 00003: loss improved from 2.45867 to 1.89249, saving model to best-model.hdf5
Epoch 4/100

Epoch 00004: loss improved from 1.89249 to 1.59018, saving model to best-model.hdf5
Epoch 5/100

Epoch 00005: loss improved from 1.59018 to 1.45159, saving model to best-model.hdf5
Epoch 6/100

Epoch 00006: loss improved from 1.45159 to 1.36522, saving model to best-model.hdf5
Epoch 7/100

Epoch 00007: loss improved from 1.36522 to 1.30730, saving model to best-model.hdf5
Epoch 8/100

Epoch 00008: loss improved from 1.30730 to 1.26091, saving model to best-model.hdf5
Epoch 9/100

Epoch 00009: loss improved from 1.26091 to 1.22729, saving model to best-model.hdf5
Epoch 10/100

Epoch 00010: loss improved from 1.22729 to 1.20572, saving model to best-model.hdf5
Epoch 11/100

Epoch 00011: loss i

KeyboardInterrupt: ignored

In [0]:
# restore the weights from the checkpoint file written during training,
# then compile the model again with the same loss and optimizer
filename = "best-model.hdf5"
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [32]:
# reverse lookup: integer code -> character
int_to_char = {i: c for i, c in enumerate(chars)}

# pick a random seed pattern; randint's upper bound is exclusive, so pass
# len(dataX) to make every pattern (including the last one) selectable
start = numpy.random.randint(0, len(dataX))
# copy the row into a plain list so it can be extended and sliced below
pattern = list(dataX[start])
print("Seed:")
# NOTE(review): any padding zeros in the seed are decoded through int_to_char[0]
# here, so they print as that character rather than being hidden
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
print("\n")
print("\nGenerated Text : ")
# generate characters
for _ in range(500):
    # shape the current window as [samples, time steps, features] and scale it
    # by the vocabulary size, as was done for the training inputs
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # greedy decoding: always emit the highest-scoring next character
    index = numpy.argmax(prediction)
    result = int_to_char[index]
    sys.stdout.write(result)
    # slide the window: append the new character and drop the oldest one
    pattern.append(index)
    pattern = pattern[1:]
print("\nDone.")


Seed:
"  strange adventures of hers that you have just been reading about and when she had finished her sist "



Generated Text : 
en the wai  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and thatie  and that
Done.
