# Imports

In [1]:
from nltk.util import ngrams
from nltk import word_tokenize, sent_tokenize
import re, os, math, random, datetime
from keras.callbacks import ModelCheckpoint
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import Dropout
from keras.layers import LSTM
from keras.optimizers import adam
from keras.losses import mean_absolute_error
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Variables

In [2]:
sequence_length = 20
embedding_vector_len=1000
dataset_dir='dataset'

# Read files

In [3]:
files = [os.path.join(dataset_dir, file) for file in os.listdir(dataset_dir) if file.endswith('.txt')]
raw_text = " ".join([open(file).read() for file in files])
no_of_files = len(os.listdir(dataset_dir))
print("Read {} textfiles." .format(no_of_files))

Read 23 textfiles.


# Preparing words and sentences

In [4]:
sentences = [word_tokenize(sent) for sent in sent_tokenize(raw_text)]
print("Total {} number of sentences." .format(len(sentences)))
all_words = []
for sent in sentences:
    all_words += sent

Total 7114 number of sentences.


# Word embedding

In [5]:
word2vec = Word2Vec(sentences, min_count=1, size=embedding_vector_len)
new_vocab = word2vec.wv.vocab
print("Vocab size {}." .format(len(new_vocab)))
vocab_len=len(new_vocab)

Vocab size 12726.


# Preparing Inputs and Outputs

In [6]:
all_input_seqs = list(ngrams(all_words, sequence_length))
print("Total number of sequences: {} with each sequence lengthing: {}." .format(len(all_input_seqs), sequence_length))

Total number of sequences: 240183 with each sequence lengthing: 20.


In [7]:
all_output_words = all_words[sequence_length:]

### Shuffle inputs

In [8]:
#zipping inputs and targets for consistent shuffling
all_merged = list(zip(all_input_seqs, all_output_words))
random.shuffle(all_merged)
del(all_input_seqs)
del(all_output_words)

### Break inputs into batches

In [9]:
iteration_pass = no_of_files
seq_len_per_pass = len(all_merged) // iteration_pass
merged_sections = [ all_merged[i:i + seq_len_per_pass] for i in range(0, seq_len_per_pass * iteration_pass, seq_len_per_pass) ] 

# Model Tuners

In [10]:
learning_rate = 0.1
optimizer = adam(lr=learning_rate)
loss_function = mean_absolute_error

num_memory_units_1 = embedding_vector_len // 2
num_memory_units_2 = embedding_vector_len // 4

num_iterations = 30
batch_size = 128
words_to_generate = 300

# Model Architecture

In [11]:
model = Sequential(name="NSP")

model.add(LSTM(num_memory_units_1, name='1st_LSTM_layer', return_sequences=True, input_shape=(sequence_length, embedding_vector_len)))
model.add(LSTM(num_memory_units_2, name='2nd_LSTM_layer'))

model.add(Dropout(0.2, name='1st_Dropout_layer'))
model.add(Dense(embedding_vector_len, name='1st_Dense_layer'))
model.add(Dropout(0.2, name='2nd_Dropout_layer'))
model.add(Dense(embedding_vector_len, name='2nd_Dense_layer'))
model.add(Activation('softmax', name='Activation_layer'))

model.compile(loss=loss_function, optimizer=optimizer)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1st_LSTM_layer (LSTM)        (None, 20, 500)           3002000   
_________________________________________________________________
2nd_LSTM_layer (LSTM)        (None, 250)               751000    
_________________________________________________________________
1st_Dropout_layer (Dropout)  (None, 250)               0         
_________________________________________________________________
1st_Dense_layer (Dense)      (None, 1000)              251000    
_________________________________________________________________
2nd_Dropout_layer (Dropout)  (None, 1000)              0         
_________________________________________________________________
2nd_Dense_layer (Dense)      (None, 1000)              1001000   
_________________________________________________________________
Activation_layer (Activation (None, 1000)             

## Checkpoints

In [12]:
filepath=datetime.datetime.now().strftime('saved_models/%d-%m-%y %H:%M:%S')+".hdf5"
# checkpoint = ModelCheckpoint(filepath, verbose=1, save_frequency=5)
# callbacks_list = [checkpoint]

# Training

In [13]:
section = 1
for merged_section in merged_sections:
    
    # unzip input and target
    input_seqs, output_words = zip(*merged_section)
    
    x = np.stack([word2vec[seq] for seq in input_seqs], axis=0)
    y = word2vec[output_words]

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=22)

    print(str("\n"+"*"*20+"\nRunning Secion: {}\n"+"*"*20) .format(section))
    
    # train the model, output generated text after each iteration
    history = model.fit(x_train, y_train, batch_size=batch_size, epochs=num_iterations, validation_data=(x_test, y_test))
          
    print(str("\n"+"*"*20+"\nRunning Secion: {}\n"+"*"*20) .format(section))
    
    section = section + 1

  import sys
  



********************
Running Secion: 1
********************

Train on 9397 samples, validate on 1045 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30

KeyboardInterrupt: 

In [None]:
model.save('saved_models/final.hdf5')