# Text Generation using LSTM
>PARTH SAGAR
>210431117

## README
* Note: these execution steps are written for Google Colab.
### Steps to replicate:
1. Upload the `file.txt` file to your runtime.
2. Run the first cell to clean the data and generate the `sequences.txt` file used for training and testing.
3. Run the second cell to train the model. It is recommended to download the files generated in the runtime (including the saved model), so that they can still be used for testing and future use if the runtime disconnects.
4. For testing, if training and testing are not done in the same session, upload the saved model (`model.h5`) and tokenizer (`tokenizer.pkl`), along with the `sequences.txt` file provided in the folder.


In [None]:
#to process the data files
import string 
 
# loading the document in the memory
def load_doc(filename, encoding=None):
    """Read a whole text file and return its contents as one string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default, matching the previous behaviour.

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r', encoding=encoding) as document:
        return document.read()
 
# turning the document into tokens after cleaning
def clean_doc(file):
    """Turn raw document text into a list of cleaned, lower-case word tokens.

    Double hyphens are treated as word separators, the text is split on
    whitespace, ASCII punctuation is stripped from every token, and any token
    that is not purely alphabetic after stripping is discarded.

    Args:
        file: The document text (a string, despite the parameter name).

    Returns:
        List of lower-cased alphabetic tokens in document order.
    """
    # Translation table that deletes every character in string.punctuation.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = []
    for raw_word in file.replace('--', ' ').split():
        word = raw_word.translate(strip_punctuation)
        # Drops numbers, mixed alphanumerics and tokens that became empty.
        if word.isalpha():
            cleaned.append(word.lower())
    return cleaned
 
# saving all the tokens in a file with one dialog being fed in one line
def save_doc(lines, filename, encoding=None):
    """Write the given lines to a file, one per line, with no trailing newline.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the file to create or overwrite.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default, matching the previous behaviour.
    """
    data = '\n'.join(lines)
    # The context manager flushes and closes the file even if write() raises.
    with open(filename, 'w', encoding=encoding) as out_file:
        out_file.write(data)
 
# Load the raw corpus (path of a file uploaded to the Colab runtime).
in_filename = "/content/file.txt"
doc = load_doc(in_filename)
# Preview the first 200 characters of the raw text.
print(doc[:200])

# Clean the loaded document into lower-case word tokens.
tokens = clean_doc(doc)
# Preview the first 200 tokens and report corpus statistics.
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

# Organize the tokens into overlapping sequences of 50 input words plus
# 1 target word, each stored as a single space-separated line.
length = 50 + 1
# The upper bound is len(tokens) + 1 so that the final window, which ends on
# the last token, is included; stopping at len(tokens) would silently drop it.
sequences = [
    ' '.join(tokens[i - length:i])
    for i in range(length, len(tokens) + 1)
]
print('Total Sequences: %d' % len(sequences))

# Save the sequences to a file, one sequence per line.
out_filename = 'sequences.txt'
save_doc(sequences, out_filename)

The Project Gutenberg EBook of Poirot Investigates, by Agatha Christie

This eBook is for the use of anyone anywhere in the United States and most
other parts of the world at no cost and with almost n
['the', 'project', 'gutenberg', 'ebook', 'of', 'poirot', 'investigates', 'by', 'agatha', 'christie', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'united', 'states', 'and', 'most', 'other', 'parts', 'of', 'the', 'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 'reuse', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'wwwgutenbergorg', 'if', 'you', 'are', 'not', 'located', 'in', 'the', 'united', 'states', 'youll', 'have', 'to', 'check', 'the', 'laws', 'of', 'the', 'country', 'where', 'you', 'are', 'located', 'before', 'using', 'this', 'ebook', 'title', 'poirot', 'inv

In [None]:
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
 
# load doc into memory
def load_doc(filename, encoding=None):
    """Read a whole text file and return its contents as one string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default, matching the previous behaviour.

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()
 
# Load the training sequences written by the data-preparation cell.
in_filename = 'sequences.txt'
doc = load_doc(in_filename)
# One sequence per line. Blank lines (e.g. from a trailing newline) are
# dropped so every encoded row has the same length and forms a 2-D array.
lines = [line for line in doc.split('\n') if line.strip()]

# Map every unique word to an integer index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
# Encode each line as a list of word indices.
sequences = tokenizer.texts_to_sequences(lines)
# Vocabulary size is the number of unique words plus one, because the
# tokenizer's word indices start at 1 and index 0 needs its own slot.
vocab_size = len(tokenizer.word_index) + 1

# Separate into input and output.
sequences = array(sequences)
# All words but the last are the input; the last word is the target.
X, y = sequences[:, :-1], sequences[:, -1]
# One-hot encode the target word over the whole vocabulary.
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

# Define the model.
model = Sequential()
# Embedding layer: maps each of the vocab_size word indices to a
# 50-dimensional vector, for inputs of seq_length words.
model.add(Embedding(vocab_size, 50, input_length=seq_length))
# First LSTM layer; return_sequences=True is required to stack another LSTM.
model.add(LSTM(250, return_sequences=True))
# Second LSTM layer.
model.add(LSTM(100))
# Fully connected hidden layer.
model.add(Dense(100, activation='relu'))
# Output layer: a probability for every word in the vocabulary.
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself; wrapping it in print() only added a
# stray "None" line to the output.
model.summary()
# Compile with categorical cross-entropy loss and the Adam optimiser.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model.
model.fit(X, y, batch_size=128, epochs=250)

# Save the model as an h5 file.
model.save('model.h5')
# Save the tokenizer as a pickle file; the context manager makes sure the
# file is flushed and closed.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 50)            295850    
                                                                 
 lstm (LSTM)                 (None, 50, 250)           301000    
                                                                 
 lstm_1 (LSTM)               (None, 100)               140400    
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 5917)              597617    
                                                                 
Total params: 1,344,967
Trainable params: 1,344,967
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch

In [None]:
# to generate random integers
from random import randint
# to load the pickle file
from pickle import load
# loading the model from h5
from keras.models import load_model
# to provide padding for short sentences
from keras.preprocessing.sequence import pad_sequences
# to process the array
import numpy as np

# opening the file and returning it as text
def load_doc(filename, encoding=None):
    """Read a whole text file and return its contents as one string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default, matching the previous behaviour.

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

# function to generate sentences
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Generate ``n_words`` words by repeatedly predicting the next word.

    Each step encodes the text so far with ``tokenizer``, pads or truncates it
    to ``seq_length`` indices (keeping the most recent words), asks ``model``
    for a prediction and appends the highest-scoring word to the text.

    Args:
        model: Object whose ``predict`` returns per-word scores for an
            encoded batch.
        tokenizer: Object providing ``texts_to_sequences`` and a
            ``word_index`` mapping of word -> integer index.
        seq_length: Number of word indices fed to the model per step.
        seed_text: Text used as the starting context.
        n_words: Number of words to generate.

    Returns:
        The generated words joined by single spaces (the seed is not included).
    """
    # Build the index -> word lookup once instead of scanning the whole
    # vocabulary for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    result = []
    in_text = seed_text
    for _ in range(n_words):
        # Encode the running text as word indices.
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # Pad short inputs and drop the oldest words of long ones.
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # Index of the highest-scoring word, as a plain Python int rather
        # than a one-element array.
        yhat = np.argmax(model.predict(encoded), axis=-1).item()
        # An index with no vocabulary entry yields an empty word, as before.
        out_word = index_to_word.get(yhat, '')
        # Feed the generated word back in as context for the next step.
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

# The training sequences are only read here to recover the sequence length.
in_filename = 'sequences.txt'
# loading the document
doc = load_doc(in_filename)
# splitting it into lines
lines = doc.split('\n')
# Each line holds the input words plus one target word, so the model's
# input length is one less than the number of words on a line.
seq_length = len(lines[0].split()) - 1

# loading the predictive model
model = load_model('model.h5')

# Load the tokenizer. NOTE: unpickling can execute arbitrary code, so only
# load a tokenizer.pkl that you produced yourself or otherwise trust.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

# Select a seed text.
# To seed from a random training line instead, uncomment the next line and
# comment out the hard-coded seed below. randint includes both end points,
# hence the "- 1".
#seed_text = lines[randint(0, len(lines) - 1)]
# To provide your own input, replace the text inside the '' below.
seed_text = 'I do not know whether Papa guessed my feelings on the subject, probably not, and in any case he would not have been interested. The opinion of other people never interested him in the slightest degree. I think it was really a sign of his greatness. In the same way, he lived quite detached from the necessities of daily life. He ate what was put before him in an exemplary fashion, but seemed mildly pained when the question of paying'
print(seed_text + '\n')

# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print("generated text:",generated)

I do not know whether Papa guessed my feelings on the subject, probably not, and in any case he would not have been interested. The opinion of other people never interested him in the slightest degree. I think it was really a sign of his greatness. In the same way, he lived quite detached from the necessities of daily life. He ate what was put before him in an exemplary fashion, but seemed mildly pained when the question of paying

generated text: there poirot only a yacht he is not there is it of my friend returned poirot wheeled to proceed to her with us asked he was half about within the day he handed it with me a little troubled a tall visitors were stolen and a bit different which he
