<a href="https://colab.research.google.com/github/siddeshnaik/My_Captain_ML/blob/master/Text_generation_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# importing dependencies
import numpy as np
import pandas as pd
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
import nltk
nltk.download("stopwords")

In [None]:
# load data
# Read the whole input text file into memory. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open.
with open(r"/content/frankenstein.txt") as text_file:
    file = text_file.read()

In [None]:
# tokenization
# standardization
def tokenizer_words(input):
    """Lowercase *input*, split it into word tokens and drop English stop words.

    Args:
        input: The raw text to clean. (The name shadows the builtin but is
            kept so existing keyword callers keep working.)

    Returns:
        A single string containing the remaining tokens joined by one space.
    """
    lowered = input.lower()
    # r'\w+' keeps runs of word characters, so punctuation and whitespace
    # are discarded by the tokenizer.
    tokens = RegexpTokenizer(r'\w+').tokenize(lowered)
    # Build the stop-word set once: the lookup list is fetched a single time
    # and membership tests are O(1) instead of a list scan per token.
    stop_words = set(stopwords.words("english"))
    return " ".join(token for token in tokens if token not in stop_words)

# Run the loaded text through tokenizer_words to get the cleaned string
# that every later cell works on.
processed_inputs = tokenizer_words(file)

In [None]:
# Give every distinct character of the cleaned text an integer id,
# numbered in sorted character order.
chars = sorted(set(processed_inputs))
char_to_num = {ch: idx for idx, ch in enumerate(chars)}

In [None]:
# Sanity check of the encoding step: report how many characters the cleaned
# text has and how many distinct characters were found.
vocab_len = len(chars)
input_len = len(processed_inputs)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

In [None]:
# Number of characters in each input window
seq_length = 100
# Accumulators for the encoded input windows and their next-character targets
x_data, y_data = [], []

In [None]:
# Slide a window of seq_length characters over the text: each window becomes
# one input pattern and the character right after it becomes its target.
encoded_text = [char_to_num[ch] for ch in processed_inputs]
for window_start in range(input_len - seq_length):
    window_end = window_start + seq_length
    # Slicing yields a fresh list per pattern
    x_data.append(encoded_text[window_start:window_end])
    y_data.append(encoded_text[window_end])

n_patterns = len(x_data)
print("total patterns:", n_patterns)

In [None]:
# Turn the input patterns into an array of shape (n_patterns, seq_length, 1)
# and scale the character ids by the vocabulary size.
X = np.asarray(x_data).reshape((n_patterns, seq_length, 1)) / float(vocab_len)

In [None]:
# one hot encoding
# Fix the number of classes to the vocabulary size. Without num_classes the
# width is inferred from the largest target id present in y_data, which can
# be smaller than the vocabulary if the highest-numbered character never
# occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [None]:
# Build the network: three stacked LSTM layers, each followed by 20% dropout,
# and a softmax layer with one unit per one-hot class in y.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    # The last LSTM returns only its final output, which feeds the Dense layer
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation="softmax"),
])

In [None]:
# Configure training: Adam optimizer with categorical cross-entropy loss,
# matching the one-hot encoded targets.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
)

In [None]:
# saving weights
# Write the model to `filepath` whenever the training loss reaches a new minimum.
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor="loss", verbose=1, save_best_only=True, mode="min")
# Keras expects a list of callbacks. `(checkpoint)` is only a parenthesised
# expression, not a tuple, so the bare callback object was being passed.
desired_callbacks = [checkpoint]

In [None]:
# Train for 100 epochs in batches of 1000 samples, with the configured callbacks
model.fit(
    X,
    y,
    batch_size=1000,
    epochs=100,
    callbacks=desired_callbacks,
)

In [None]:
# Load the saved weights back into the model, then compile it again with the
# same loss and optimizer.
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
)

In [None]:
# Reverse lookup (integer id -> character) for decoding the model's output
num_to_char = dict(enumerate(chars))

In [None]:
# random seed to help generate
# np.random.randint excludes its upper bound, so len(x_data) makes every
# pattern (including the last one) selectable and also works when there is
# exactly one pattern.
start = np.random.randint(0, len(x_data))
# Copy the pattern so later changes to `pattern` cannot alter x_data
pattern = list(x_data[start])
print("Random Seed: ")
print("\"",''.join([num_to_char[value] for value in pattern]), "\"")


In [None]:
# generate the text
# Repeatedly predict the next character from the current window, print it,
# then slide the window forward by one character.
for i in range(1000):
    # Same shape and scaling as the training inputs: (1, window length, 1)
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: take the most probable class as the next character id
    index = int(np.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build a new window instead of appending in place, so the list that
    # `pattern` started from is never mutated.
    pattern = list(pattern[1:]) + [index]