In [1]:
import pandas as pd
import numpy as np
import re
import os
import tensorflow as tf
import nltk
import json

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.callbacks import TensorBoard, LambdaCallback

from modeling.data import read_csv

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the parallel text/headline data plus the two length limits used to size
# the one-hot tensors further down.
# NOTE(review): the arguments look like (csv path, input column, target column,
# length limit) -- confirm against modeling.data.read_csv, which is not visible here.
paragraphs, headlines, max_input_len, max_output_len = read_csv('./data/SubjectsQuestionsAllExtended.csv', 'Text', 'Subject', 75)

  if self.run_code(code, result):


Examples number: 124671


In [3]:
def unique_chars(p_list):
    """Return the distinct characters that occur in the strings of `p_list`.

    The result is a list in arbitrary (set iteration) order; callers that need
    a stable order have to sort it themselves. An empty `p_list` gives `[]`.
    """
    seen = set()
    for text in p_list:
        # Iterating a string yields its characters one by one.
        seen.update(text)
    return list(seen)

In [4]:
# Character inventories of the inputs and of the targets, merged into one
# sorted vocabulary shared by the encoder and the decoder.
p_chars = unique_chars(paragraphs)
h_chars = unique_chars(headlines)
vocabulary = sorted(set(p_chars) | set(h_chars))

In [5]:
# character -> integer id (the character's position in the sorted vocabulary)
token_index = {char: position for position, char in enumerate(vocabulary)}

In [6]:
# integer id -> character (inverse of `token_index`, used when decoding)
reverse_token_index = {position: char for char, position in token_index.items()}

In [7]:
## Parameters
batch_size = 72  # number of examples DataGenerator packs into one batch
vocab_size = len(vocabulary)  # width of every one-hot character vector
latent_dim = 256  # number of units in the encoder and decoder LSTMs
num_examples = len(paragraphs)  # dataset size, measured before the train/validation split

In [8]:
# Persist the vocabulary, both lookup tables and the tensor dimensions to JSON.
# The int() casts turn the values into plain Python ints for serialisation.
# Note that JSON object keys are always strings, so the integer keys of
# `reverse_token_index` are written out as strings.
data = {
    'vocabulary': vocabulary,
    'reverse_token_index': reverse_token_index,
    'token_index': token_index,
    'max_input_len': int(max_input_len),
    'max_output_len': int(max_output_len),
    'vocab_size': int(vocab_size),
}

with open('./data/hyper-test.json', 'w') as outfile:
    json.dump(data, outfile)

In [9]:
class DataGenerator:
    """Endless iterator producing one-hot encoded batches for the seq2seq model.

    Every call to `next()` returns
    `([encoder_input_data, decoder_input_data], decoder_target_data)`, three
    float32 arrays whose first dimension is `batch_size`.

    The class reads the notebook-level globals `batch_size`, `max_input_len`,
    `max_output_len`, `vocab_size` and `token_index` at call time.
    """

    def __init__(self, pars, heads):
        """Keep the parallel sequences of input texts and target texts.

        Args:
            pars: input texts, indexable by position and sliceable.
            heads: target texts, aligned with `pars`.
        """
        self.pars = pars
        self.heads = heads
        # Index of the last example; the cursor wraps once it moves past it.
        self.max = len(pars) - 1
        # Index of the first example of the next batch.
        self.n = 0

    def __iter__(self):
        """Return self so the object satisfies the full iterator protocol."""
        return self

    def __next__(self):
        """Build and return the next batch, then advance the cursor.

        Returns:
            `([encoder_input_data, decoder_input_data], decoder_target_data)`.
            When fewer than `batch_size` examples remain, the unused rows of
            all three arrays stay all-zero.

        Raises:
            KeyError: if a text contains a character missing from `token_index`.
            IndexError: if a text is longer than the corresponding length limit.
        """
        start = self.n
        end = start + batch_size

        encoder_input_data = np.zeros(
            (batch_size, max_input_len, vocab_size),
            dtype='float32')
        decoder_input_data = np.zeros(
            (batch_size, max_output_len, vocab_size),
            dtype='float32')
        decoder_target_data = np.zeros(
            (batch_size, max_output_len, vocab_size),
            dtype='float32')

        batch_paragraphs = self.pars[start:end]
        batch_headlines = self.heads[start:end]

        for i, (input_text, target_text) in enumerate(zip(batch_paragraphs, batch_headlines)):
            for t, char in enumerate(input_text):
                encoder_input_data[i, t, token_index[char]] = 1
            for t, char in enumerate(target_text):
                decoder_input_data[i, t, token_index[char]] = 1.
                if t > 0:
                    # The target is the decoder input shifted one timestep to
                    # the left, so it never contains the first character.
                    decoder_target_data[i, t - 1, token_index[char]] = 1

        # Advance by a whole batch. Moving the cursor by a single example made
        # consecutive batches overlap in all but one sample.
        self.n = end

        # Start over once every example has been served.
        if self.n > self.max:
            self.n = 0

        return ([encoder_input_data, decoder_input_data], decoder_target_data)

In [10]:
# Hold out the last 5% of the examples for validation; the first 95% remain
# the training set.
# NOTE: `paragraphs` and `headlines` are rebound to the training portion while
# `num_examples` keeps the original size, so this cell must run exactly once
# per kernel session.
print('total examples number:', num_examples)

split_index = int(num_examples * .95)

valid_pars, paragraphs = paragraphs[split_index:], paragraphs[:split_index]
valid_heads, headlines = headlines[split_index:], headlines[:split_index]

total examples number: 124671


In [11]:
# One endless batch generator per data split.
valid_generator = DataGenerator(pars=valid_pars, heads=valid_heads)
train_generator = DataGenerator(pars=paragraphs, heads=headlines)

In [12]:
# # embedding data
# docs = DataSet['Subject'].append(DataSet['Text'])


In [13]:
# t = Tokenizer()
# t.fit_on_texts(docs.values)
# vocab_size = len(t.word_index) + 1
# # integer encode the documents
# encoded_docs = t.texts_to_sequences(docs)

In [14]:
# # load the whole embedding into memory
# embeddings_index = dict()
# f = open('./data/glove.6B.100d.txt')
# for line in f:
#     values = line.split()
#     word = values[0]
#     coefs = asarray(values[1:], dtype='float32')
#     embeddings_index[word] = coefs
# f.close()

In [15]:
# embedding_matrix = zeros((vocab_size, 100))
# for word, i in t.word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector

In [16]:
# embedding_vocab_size = len(t.word_index) + 1
# embedding = Embedding(embedding_vocab_size, 100, weights=[embedding_matrix], input_length=4, trainable=False)

In [17]:
## Build the training model (encoder-decoder LSTM over one-hot characters)

# Define an input sequence and process it.
# The time dimension is None, so sequences of any length are accepted.
encoder_inputs = Input(shape=(None, vocab_size))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, vocab_size))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
# Softmax over the vocabulary at every decoder timestep.
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Weights file shared with the checkpoint callback (`save_models` writes to it).
fname = './data/main_model_test_weights.h5'

# Resume from previously saved weights when the file exists.
if os.path.isfile(fname):
    model.load_weights(fname)

# Configure the optimiser and compile the model; the training run itself is
# started in a later cell.

adam_optimiser = Adam(lr=0.0016, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

model.compile(optimizer=adam_optimiser, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 206)    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 206)    0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 256), (None, 474112      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 256),  474112      input_2[0][0]                    
                                                                 lstm_1[0][1]                     
          

In [18]:
# Define sampling (inference) models.
# Encoder: maps an input sequence to the encoder LSTM's final [h, c] states.
encoder_model = Model(encoder_inputs, encoder_states)

# The inference decoder receives its initial LSTM states as explicit inputs,
# so they can be fed back in after every decoding step.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# The same `decoder_lstm` and `decoder_dense` layer objects as in the training
# model are applied here.
# NOTE: this rebinds `decoder_outputs`, `state_h` and `state_c`, which were
# first assigned in the model-building cell.
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
# Inputs: [one-hot decoder input, h, c]; outputs: [token probabilities, h, c].
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [19]:
def decode_sequence(input_seq):
    """Greedily decode one encoded input sequence into a string.

    The encoder states seed the decoder, which is then run one character at a
    time; at every step the most probable character is chosen and fed back as
    the next decoder input. The very first decoder input is an all-zero vector
    (no start character is set). Assumes a batch of size 1.

    Decoding stops after a newline character has been produced or once the
    decoded string is longer than `max_output_len`.

    Args:
        input_seq: one-hot encoded input accepted by `encoder_model.predict`.

    Returns:
        The decoded string, including the terminating newline if one was
        produced.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Empty target sequence of length 1; no start character is populated.
    target_seq = np.zeros((1, 1, vocab_size))

    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Pick the most likely character of the last timestep.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_token_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: stop character found or maximum length exceeded.
        if sampled_char == '\n' or len(decoded_sentence) > max_output_len:
            break

        # Feed the sampled character and the new states into the next step.
        target_seq = np.zeros((1, 1, vocab_size))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]

    return decoded_sentence

In [20]:
def encode_string(text):
    """One-hot encode `text` as a single-example encoder input.

    The text is cut off after `max_input_len` characters; when it is shorter,
    the remaining timesteps are encoded as the space character.

    Args:
        text: the string to encode.

    Returns:
        A float32 array of shape `(1, max_input_len, vocab_size)`.

    Raises:
        KeyError: if a needed character is missing from `token_index`.
    """
    one_hot = np.zeros(
        (1, max_input_len, vocab_size),
        dtype='float32')

    # Number of timesteps that are filled from the text itself.
    filled = min(len(text), max_input_len)
    for position in range(filled):
        one_hot[0, position, token_index[text[position]]] = 1

    # Every remaining timestep is encoded as a space.
    if filled < max_input_len:
        one_hot[0, filled:, token_index[' ']] = 1

    return one_hot
def predict(text, truth):
    """Decode `text` with the inference models and print a short report.

    Args:
        text: input string; it is encoded with `encode_string`.
        truth: reference output, printed next to the prediction.

    Returns:
        The decoded sentence, so callers can use the prediction instead of
        only seeing it printed.
    """
    input_seq = encode_string(text)
    decoded_sentence = decode_sequence(input_seq)

    print('-')
    print('Input sentence:', text)
    print('Ground truth:', truth)
    print('Decoded sentence:', decoded_sentence)

    return decoded_sentence
    
# NOTE(review): `sample` is not referenced by any other visible cell.
sample = vocabulary[0]
# First headline; `save_models` passes it as the ground truth of its
# per-epoch progress prediction.
truth = headlines[0]

In [21]:
class TrainValTensorBoard(TensorBoard):
    """TensorBoard callback that writes training and validation metrics to
    sibling log directories, so both can be plotted on the same figure.

    Training metrics go to `<log_dir>/training` through the parent class;
    metrics whose name starts with `val_` go to `<log_dir>/validation` with
    the prefix stripped.
    """

    def __init__(self, log_dir='./logs', **kwargs):
        """Set up both log directories; `kwargs` are forwarded to `TensorBoard`."""
        # Make the original `TensorBoard` log to a subdirectory 'training'
        training_log_dir = os.path.join(log_dir, 'training')
        super(TrainValTensorBoard, self).__init__(training_log_dir, **kwargs)

        # Log the validation metrics to a separate subdirectory
        self.val_log_dir = os.path.join(log_dir, 'validation')

    def set_model(self, model):
        """Create the validation writer, then defer to the parent class."""
        # Setup writer for validation metrics
        self.val_writer = tf.summary.FileWriter(self.val_log_dir)
        super(TrainValTensorBoard, self).set_model(model)

    def on_epoch_end(self, epoch, logs=None):
        """Write `val_*` metrics with the validation writer and hand the
        remaining metrics to the parent class."""
        # Pop the validation logs and handle them separately with
        # `self.val_writer`. Also rename the keys so that they can
        # be plotted on the same figure with the training metrics
        logs = logs or {}
        val_logs = {k.replace('val_', ''): v for k, v in logs.items() if k.startswith('val_')}
        for name, value in val_logs.items():
            summary = tf.Summary()
            summary_value = summary.value.add()
            # float() accepts NumPy scalars as well as plain Python floats,
            # whereas `.item()` only exists on the former.
            summary_value.simple_value = float(value)
            summary_value.tag = name
            self.val_writer.add_summary(summary, epoch)
        self.val_writer.flush()

        # Pass the remaining logs to `TensorBoard.on_epoch_end`
        logs = {k: v for k, v in logs.items() if not k.startswith('val_')}
        super(TrainValTensorBoard, self).on_epoch_end(epoch, logs)

    def on_train_end(self, logs=None):
        """Let the parent class finish, then close the validation writer."""
        super(TrainValTensorBoard, self).on_train_end(logs)
        self.val_writer.close()

In [22]:
## Callbacks
def save_models():
    """Checkpoint the models and print a progress prediction.

    Saves the inference encoder and decoder as complete models, saves the
    weights of the training model to `fname`, and runs `predict` on the first
    training paragraph.
    """
    encoder_model.save('./data/encoder-model-test.h5')
    decoder_model.save('./data/decoder-model-test.h5')
    model.save_weights(fname)
    # `predict` prints its own report, so its return value is not printed again.
    predict(paragraphs[0], truth)
    
def _save_after_epoch(epoch, logs):
    """Epoch-end hook: checkpoint the models; `epoch` and `logs` are unused."""
    save_models()


checkpoint = LambdaCallback(on_epoch_end=_save_after_epoch)
tbCallBack = TrainValTensorBoard(
    log_dir='Graph-test',
    histogram_freq=0,
    write_graph=True,
    write_images=True,
)

In [None]:
# Train for 50 epochs of 30 batches each; after every epoch 15 validation
# batches are evaluated, the models are checkpointed and TensorBoard logs are
# written.
history = model.fit_generator(
    train_generator,
    epochs=50,
    steps_per_epoch=30,
    validation_data=valid_generator,
    validation_steps=15,
    callbacks=[checkpoint, tbCallBack],
)

Epoch 1/50

In [None]:
import matplotlib.pyplot as plt

# # Plot training & validation accuracy values
# plt.plot(history.history['acc'])
# plt.plot(history.history['val_acc'])
# plt.title('Model accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Test'], loc='upper left')
# plt.show()

# Plot the per-epoch training loss and validation loss on one set of axes.
for series in ('loss', 'val_loss'):
    plt.plot(history.history[series])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
# examples = train_generator.__next__()
# ([encoder_input_data, decoder_input_data], decoder_target_data) = examples
# for i in range(10):    
#     # Take one sequence (part of the training set)
#     # for trying out decoding.
    
#     input_seq = encoder_input_data[i: i + 1]
#     decoded_sentence = decode_sequence(input_seq)
#     print('-')
#     print('Input sentence:', paragraphs[train_generator.n + i])
#     print('Decoded sentence:', decoded_sentence)

In [None]:
# Spot-check the model on training examples 100..119.
for example_idx in range(100, 120):
    predict(text=paragraphs[example_idx], truth=headlines[example_idx])