# Imports

In [None]:
from nltk.util import ngrams
from nltk import word_tokenize, sent_tokenize
import re, os, math, random, datetime
from keras.callbacks import ModelCheckpoint
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import Dropout
from keras.layers import LSTM
from keras.optimizers import adam
from keras.losses import mean_absolute_error
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec

# Variables

In [None]:
# Number of consecutive words that make up one input window.
sequence_length = 20

# Dimensionality of each word vector.
embedding_vector_len = 1000

# Directory that is scanned for the corpus text files.
dataset_dir = 'dataset'

# Read files

In [None]:
# Collect the paths of every .txt file directly inside the dataset directory.
files = [
    os.path.join(dataset_dir, name)
    for name in os.listdir(dataset_dir)
    if name.endswith('.txt')
]

# Fail early with a clear message: an empty corpus would otherwise only
# surface later as a division by zero when the data is split into sections.
if not files:
    raise FileNotFoundError("No .txt files found in '{}'.".format(dataset_dir))

# Read each file inside a context manager so its handle is closed promptly.
texts = []
for path in files:
    with open(path) as handle:
        texts.append(handle.read())
raw_text = " ".join(texts)

# Count only the text files that were actually read, not every directory
# entry, so the report (and anything derived from this count) is accurate.
no_of_files = len(files)
print("Read {} textfiles." .format(no_of_files))

# Preparing words and sentences

In [None]:
# Split the corpus into sentences and tokenise each sentence into words.
sentences = [
    word_tokenize(sentence_text)
    for sentence_text in sent_tokenize(raw_text)
]
print("Total {} number of sentences." .format(len(sentences)))

# Flatten the per-sentence token lists into one running list of words.
all_words = [token for tokens in sentences for token in tokens]

# Word embedding

In [None]:
# Train the embedding on the tokenised sentences. min_count=1 keeps even
# words that occur only once in the vocabulary.
word2vec = Word2Vec(
    sentences,
    size=embedding_vector_len,
    min_count=1,
)

new_vocab = word2vec.wv.vocab
vocab_len = len(new_vocab)
print("Vocab size {}." .format(vocab_len))

# Preparing Inputs and Outputs

In [None]:
# Build one input window for every position that has a following word to
# predict. The last word is excluded from the windows: a window ending on it
# would have no next-word target, which previously made the reported total one
# larger than the number of usable (window, target) pairs.
all_input_seqs = list(ngrams(all_words[:-1], sequence_length))
print("Total number of sequences: {} with each sequence lengthing: {}." .format(len(all_input_seqs), sequence_length))

In [None]:
# Next-word targets: every word from index `sequence_length` onwards, so the
# target at index i is the word that follows the window starting at index i.
all_output_words = all_words[sequence_length:len(all_words)]

### Shuffle inputs

In [None]:
# Pair every input window with its target word first, so that shuffling
# reorders the pairs without breaking the window/target correspondence.
all_merged = list(zip(all_input_seqs, all_output_words))
random.shuffle(all_merged)

# The unpaired lists are no longer needed once the pairs exist.
del all_input_seqs, all_output_words

### Break inputs into batches

In [None]:
# One training pass (section) per counted file.
iteration_pass = no_of_files

# Equal section size; pairs left over after the integer division are dropped.
seq_len_per_pass = len(all_merged) // iteration_pass

# Slice the shuffled pairs into `iteration_pass` consecutive sections.
merged_sections = [
    all_merged[start:start + seq_len_per_pass]
    for start in range(0, seq_len_per_pass * iteration_pass, seq_len_per_pass)
]

# Model Tuners

In [None]:
# LSTM layer widths, derived from the embedding size.
num_memory_units_1 = embedding_vector_len // 2
num_memory_units_2 = embedding_vector_len // 4

# Optimisation settings.
# NOTE(review): 0.1 is far above Adam's documented default step size of
# 0.001 - confirm this value is intentional.
learning_rate = 0.1
loss_function = mean_absolute_error
optimizer = adam(lr=learning_rate)

# Training schedule: mini-batch size and epochs run on each section.
batch_size = 128
num_iterations = 30

# Not referenced in the visible code; presumably consumed by a later
# text-generation step - TODO confirm.
words_to_generate = 300

# Model Architecture

In [None]:
# Next-word model: two stacked LSTMs read a window of `sequence_length` word
# vectors, and a dense head outputs a vector of `embedding_vector_len` values.
model = Sequential(name="NSP")

# The first LSTM returns the full sequence so the second LSTM can consume it;
# the second LSTM returns only its final state.
model.add(LSTM(num_memory_units_1, name='1st_LSTM_layer', return_sequences=True,
               input_shape=(sequence_length, embedding_vector_len)))
model.add(LSTM(num_memory_units_2, name='2nd_LSTM_layer'))

model.add(Dropout(0.2, name='1st_Dropout_layer'))
model.add(Dense(embedding_vector_len, name='1st_Dense_layer'))
model.add(Dropout(0.2, name='2nd_Dropout_layer'))
model.add(Dense(embedding_vector_len, name='2nd_Dense_layer'))

# The training targets are raw Word2Vec vectors (real-valued, with negative
# components) and the loss is a regression loss. A softmax output is strictly
# positive and sums to 1, so it could never reproduce such targets; the output
# is therefore linear. The Activation layer is kept (as the identity) so the
# layer names and layer count of the model stay the same.
model.add(Activation('linear', name='Activation_layer'))

model.compile(loss=loss_function, optimizer=optimizer)
model.summary()

## Checkpoints

In [None]:
# Timestamped checkpoint path, e.g. "saved_models/<day>-<month>-<year> <H:M:S>.hdf5".
started_at = datetime.datetime.now()
filepath = started_at.strftime('saved_models/%d-%m-%y %H:%M:%S') + ".hdf5"

# Checkpointing is currently disabled.
# NOTE(review): verify the `save_frequency` keyword against the installed
# Keras ModelCheckpoint signature before re-enabling these lines.
# checkpoint = ModelCheckpoint(filepath, verbose=1, save_frequency=5)
# callbacks_list = [checkpoint]

# Training

In [None]:
# Train the same model on each shuffled section in turn. Every section is
# embedded and fitted separately, and 10% of it is held out for validation.
banner = "*" * 20
for section, merged_section in enumerate(merged_sections, start=1):

    # Unzip the (input window, target word) pairs of this section.
    input_seqs, output_words = zip(*merged_section)

    # Look the vectors up through `.wv` (the keyed-vectors object the file
    # already uses for the vocabulary) instead of indexing the model directly,
    # which is deprecated in gensim. Each window becomes a 2-D array of word
    # vectors; stacking adds the leading sample axis.
    x = np.stack([word2vec.wv[seq] for seq in input_seqs], axis=0)
    y = word2vec.wv[output_words]

    # Fixed random_state keeps the train/validation split reproducible.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=22)

    print("\n{0}\nRunning Section: {1}\n{0}".format(banner, section))

    # Fit on this section; `history` holds the metrics of the latest section only.
    history = model.fit(x_train, y_train, batch_size=batch_size, epochs=num_iterations, validation_data=(x_test, y_test))

    print("\n{0}\nFinished Section: {1}\n{0}".format(banner, section))

In [None]:
# Create the output directory up front so that saving the final model cannot
# fail merely because the folder does not exist yet.
os.makedirs('saved_models', exist_ok=True)
model.save('saved_models/final.hdf5')