# Exploring Data (lesson 1)

In [None]:
#importing packages
from __future__ import print_function
import os
import re
import string
from os import listdir
from os.path import isfile, join
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from numpy import array
from pickle import dump
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding


We start by downloading the data and looking at it to see what needs to be cleaned. We observe the following:
- Illustration texts
- The text file contains multiple stories; these should be split into multiple files so that the model knows there are groupings (Extension: will be reviewed in lesson 5)
- The file contains a number of punctuation marks (,;.* etc.), which are irrelevant for the model
- Capital letters as headers

The input sequences are initially 50 words long and run across sentences, chapters and stories. 

In [None]:
# creating function to load documents into memory
def load_doc(filename):
    """Read a UTF-8 text file and return its entire content as one string.

    Args:
        filename: path of the file to read.

    Returns:
        The full text of the file.
    """
    # the with-statement closes the file even when read() raises
    with open(filename, 'r', encoding="utf8") as file:
        return file.read()

In [None]:
# NOTE(review): machine-specific data directory -- point this at your own copy of the corpus.
path = "C:\\Users\\au591024\\Desktop\\Data"
# Later cells open their files by bare name, so the working directory must be the data directory.
os.chdir(path)

combined_name = 'adventures.txt'
# Files this notebook itself writes into the data directory. They must not be
# fed back into the corpus when the notebook is run a second time.
generated_names = {
    combined_name,
    'fairytales_sequences.txt',
    'first_output.txt',
    'model.h5',
    'tokenizer.pkl',
}

#concatenate text files
# sorted() makes the story order reproducible; listdir() returns entries in arbitrary order
filenames = sorted(
    f for f in listdir(path)
    if isfile(join(path, f)) and f not in generated_names
)

with open(join(path, combined_name), 'w', encoding="utf8") as outfile:
    for fname in filenames:
        # open relative to `path`, not to whatever the current directory happens to be
        with open(join(path, fname), encoding="utf8") as infile:
            outfile.write(infile.read())

In [None]:
# load document
in_filename = 'adventures.txt'  # name of the combined corpus file
doc = load_doc(in_filename)  # read the whole corpus into one string
# show only the first 10000 characters as a preview, not the full text
preview = doc[:10000]
print(preview)

# Cleaning data (lesson 2)

In lesson 1, we observed several potential issues in the data, which we need to remove from the data. This is done below:

In [None]:
#removing illustration descriptions (text enclosed in square brackets)
doc = re.sub(r'\[[^\]]*\]', '', doc) #using re.sub function and regular expressions
# pattern breakdown:
#   \[      - a literal opening bracket
#   [^\]]*  - zero or more characters other than a closing bracket, so the
#             match stops at the FIRST "]" and cannot swallow the story text
#             between two separate bracketed descriptions
#   \]      - a literal closing bracket


In [None]:
#removing headers
# Every run of two or more consecutive capital letters is deleted; single
# capitals (e.g. at the start of a sentence) are kept.
header_pattern = re.compile(r'[A-Z]{2,}')
doc = header_pattern.sub('', doc)


In [None]:
#removing apostrophes, since one of the delimiters below (LITTLE IDA'S FLOWERS) contains one
#doc = doc.replace("'", '')

#deliminators 
#deliminators = c('THE FIR TREE', 'LITTLE TUK', 'THE UGLY DUCKLING', 'LITTLE IDA'S FLOWERS', 'THE STEADFAST TIN SOLDIER, LITTLE THUMBELINA, SUNSHINE STORIES', 'THE DARNING-NEEDLE', 'THE LITTLE MATCH GIRL', 'THE LOVING PAIR', 'THE LEAPING MATCH', 'THE HAPPY FAMILY, THE GREENIES, 'OLE-LUK-OIE', 'THE DREAM GOD', 'THE MONEY BOX', 'ELDER-TREE MOTHER', 'THE SNOW QUEEN', 'THE ROSES AND THE SPARROWS', 'THE OLD HOUSE', 'THE CONCEITED APPLE BRANCH' 

#splitting the file into strings consisting of one adventure
#doc = doc.split ()

In [None]:
# turn a doc into clean tokens
def clean_doc(doc):
    """Split ``doc`` into lower-case, purely alphabetic word tokens.

    The text is split on whitespace, ASCII punctuation is stripped from each
    piece, pieces that are not entirely alphabetic afterwards (numbers, empty
    strings, ...) are dropped, and the rest is lower-cased.

    Args:
        doc: the raw text.

    Returns:
        A list of cleaned tokens in their original order.
    """
    # translation table whose third argument lists the characters to delete
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stripped = (piece.translate(strip_punctuation) for piece in doc.split())
    return [piece.lower() for piece in stripped if piece.isalpha()]

In [None]:
# run the cleaning function over the whole corpus
tokens = clean_doc(doc)
print(tokens[:200])  # preview of the first 200 tokens
token_total = len(tokens)
unique_total = len(set(tokens))  # a set keeps each distinct token once
print('Total Tokens: %d' % token_total)
print('Unique Tokens: %d' % unique_total)

In [None]:
# organize into sequences of tokens
length = 50 + 1  # 50 input words plus the 1 word to predict
# Slide a window of `length` tokens over the corpus one token at a time and
# store each window as a space-separated line. The stop value is
# len(tokens) + 1 so that the final window -- the one ending on the very last
# token -- is included as well.
sequences = [' '.join(tokens[i - length:i]) for i in range(length, len(tokens) + 1)]
print('Total Sequences: %d' % len(sequences))

In [None]:
# save lines of text to file, one entry per line
def save_doc(lines, filename):
    """Write ``lines`` to ``filename`` as UTF-8, separated by newlines.

    Args:
        lines: iterable of strings; each element becomes one line of the file.
            Passing a bare string would write one character per line.
        filename: path of the file to create or overwrite.
    """
    data = '\n'.join(lines)
    # the with-statement closes (and flushes) the file even when write() raises
    with open(filename, 'w', encoding="utf8") as file:
        file.write(data)

In [None]:
# persist the training sequences to disk, one sequence per line
out_filename = 'fairytales_sequences.txt'
save_doc(lines=sequences, filename=out_filename)

# Training the model (Lesson 3)

In [None]:
# read the prepared sequences back from disk
in_filename = 'fairytales_sequences.txt'
doc = load_doc(filename=in_filename)

# one sequence was written per line, so splitting on the newline
# character yields one string per sequence
lines = doc.split('\n')

In [None]:
# integer encode sequences of words
tokenizer = Tokenizer() #create the Keras tokenizer object
tokenizer.fit_on_texts(lines) #builds the vocabulary: every unique word in the data is assigned a unique integer
sequences = tokenizer.texts_to_sequences(lines) #translating the input lines into lists of integers
tokenizer.word_index #displaying the word -> integer dictionary (NOTE(review): this shows the whole vocabulary as cell output)

In [None]:
#calculating vocabulary size, which sets the input dimension of the embedding layer
# The Keras Tokenizer numbers words starting at 1 (0 is reserved), so the
# largest index equals len(word_index) and the layer needs one extra slot.
vocab_size = 1 + len(tokenizer.word_index)
vocab_size

In [None]:
# separate into input and output
sequences = array(sequences)  # list of integer lists -> 2-D array
X = sequences[:, :-1]  # every column but the last: the input words
# the last column is the word to predict, one-hot encoded over the vocabulary
y = to_categorical(sequences[:, -1], num_classes=vocab_size)
seq_length = X.shape[1]  # number of input words per sequence
seq_length

We will use two LSTM hidden layers with 100 memory cells each. More memory cells and a deeper network may achieve better results.

In [None]:
# define model
model = Sequential() #a linear stack of layers
model.add(Embedding(vocab_size, 50, input_length=seq_length)) #embedding layer: maps each word index to a 50-dimensional vector
model.add(LSTM(100, return_sequences=True)) #first LSTM layer; returns the full sequence so the next LSTM can consume it
model.add(LSTM(100))  #second LSTM layer; returns only its final output
model.add(Dense(100, activation='relu')) #fully connected hidden layer with rectified linear unit (relu) activation
model.add(Dense(vocab_size, activation='softmax')) #softmax turns the outputs into one probability per vocabulary word
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

In [None]:
# configure the model for training: categorical cross entropy loss,
# the adam optimizer, and accuracy reported as a metric
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# train the model on the prepared sequences
model.fit(
    X,
    y,
    batch_size=128,
    epochs=100,
)

In [None]:
# save the model to file
model.save('model.h5')
# save the tokenizer; the with-statement makes sure the pickle file is
# flushed and closed once it has been written
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

# Lesson 4

In [None]:
# read the cleaned text sequences back in, one sequence per line
doc = load_doc(filename='fairytales_sequences.txt')
lines = doc.split('\n')

In [None]:
#Specifying input sequences length to prompt the model
# Count the words of a stored sequence and subtract the one word that is the
# prediction target. The first line is used, so this also works when the file
# holds fewer than four sequences.
seq_length = len(lines[0].split()) - 1
seq_length

In [None]:
#loading model
model = load_model('model.h5')
# NOTE: unpickling can execute arbitrary code -- only load a tokenizer.pkl
# that was produced by this notebook or comes from a trusted source.
with open('tokenizer.pkl', 'rb') as tokenizer_file:  # 'rb' = read, binary mode
    tokenizer = load(tokenizer_file)

In [None]:
#selecting a seed text
# randint() includes BOTH end points, so the upper bound must be the last
# valid index (len(lines) - 1); len(lines) itself would raise an IndexError.
seed_text = lines[randint(0, len(lines) - 1)]
print(seed_text + '\n') #prints the selected text
len(seed_text.split())

In [None]:
#translating the input text to integers
seed_ids = tokenizer.texts_to_sequences([seed_text])[0]
# pad / truncate (from the front) to exactly seq_length items; the result is a
# 2-D array holding a single row
encoded = pad_sequences([seed_ids], maxlen=seq_length, truncating='pre')

The model can predict the next word directly by calling model.predict_classes() that will return the index of the word with the highest probability

In [None]:
# predict the index of the most likely next word (a class index, not the probabilities themselves)
# NOTE(review): predict_classes() is not available in recent TensorFlow/Keras releases -- confirm the installed version;
# model.predict(encoded).argmax(axis=-1) is the usual replacement for a softmax output.
yhat = model.predict_classes(encoded, verbose=1)

Looking up the index in the Tokenizer's mapping to get the associated word

In [None]:
#translating the predicted integer back to its word
# take the first word whose index equals the prediction; fall back to an
# empty string when no word carries that index
out_word = next(
    (candidate for candidate, position in tokenizer.word_index.items() if position == yhat),
    '',
)

out_word

As predicted words are appended, the input sequence will get too long. To keep it at 50 items we use the following function, which pads or truncates sequences to the same length

In [None]:
#Setting the max length to be 50 items by removing items from the beginning of the sequence
# `encoded` is already a 2-D array (one row per sequence) at this point, so it
# is passed as-is; wrapping it in another list would add a third dimension.
encoded = pad_sequences(encoded, maxlen=seq_length, truncating='pre')

In [None]:
#creating a function, which generates the predicted output
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Generate ``n_words`` words that continue ``seed_text``.

    Each predicted word is appended to the running text, which is then
    re-encoded and truncated to the last ``seq_length`` items before the next
    prediction.

    Args:
        model: trained model; assumed to output one score per vocabulary
            index (e.g. a softmax layer) -- confirm for other models.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        seq_length: number of items the model expects as input.
        seed_text: text used to prompt the model.
        n_words: number of words to generate.

    Returns:
        The generated words joined by single spaces (the seed is not included).
    """
    # Build the index -> word lookup once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    result = []
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate (from the front) / pad to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # The highest-scoring index is the predicted word. For a multi-class
        # output this equals Sequential.predict_classes(), but it also works
        # on Keras versions where that method no longer exists.
        yhat = model.predict(encoded, verbose=0).argmax(axis=-1)
        # an index without a word (e.g. the padding index 0) maps to ''
        out_word = index_to_word.get(int(yhat[0]), '')
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [None]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 100)
print(generated)

#saving generated text
out_filename = 'first_output.txt'
# save_doc() joins the items of its first argument with newlines; a bare
# string would be written one character per line, so it is wrapped in a list
save_doc([generated], out_filename)


Things to consider to improve the model: 

- We could process the data so that the model only ever deals with self-contained sentences and pad or truncate the text to meet this requirement for each input sequence. You could explore this as an extension to this tutorial.
- MORE DATA!

