In [None]:
import ruggero_detector as rd
import string
import numpy as np
import matplotlib.pyplot as plt

# Keras imports
from keras.layers import multiply, Input, Dense
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical
import keras.backend as K

# 2019-02-14 Ruggero detector again
Once we established that it's difficult to extract meaningful information from attention layers before or after LSTMs, one remaining question is how to extract any information from the network at all. LSTM layers are made for processing data one element at a time. So maybe we can use this fact to build a Ruggero detector that outputs the probability of having read "Ruggero" after each letter passed to the network.

In [None]:
# Sizes of the three splits and the length of every generated sentence
ntrain, nvalid, ntest = 10000, 2000, 2000
sentence_length = 80

# rd.prepare_data is unpacked into (data, targets) pairs for the
# training, validation and test splits, in that order
(train_data, train_targets,
 valid_data, valid_targets,
 test_data, test_targets) = rd.prepare_data(
    sentence_length, ntrain, nvalid, ntest, rd.alpha_to_n)

In [None]:
# convert to categorical (one-hot) encoding
# num_classes is pinned to rd.nletters so that every split gets the same last
# dimension as the model input, instead of letting to_categorical infer it
# from the largest index that happens to occur in each split
train_data_cat = to_categorical(train_data, num_classes=rd.nletters)
valid_data_cat = to_categorical(valid_data, num_classes=rd.nletters)
test_data_cat = to_categorical(test_data, num_classes=rd.nletters)

This time I will try to build a Ruggero-detector that has only an LSTM and an output neuron. Let's see what happens.

In [None]:
# Architecture: a single LSTM reading one-hot letters, followed by one sigmoid neuron.
# The time dimension of input_shape is None, so the sequence length is left open.
lstm_units = 32
model = Sequential([
    LSTM(lstm_units, return_sequences=False, input_shape=(None, rd.nletters)),
    Dense(1, activation='sigmoid'),
])

# binary target -> binary cross-entropy, tracked together with accuracy
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# checkpoint callback writing the model to disk during training
checkpointer = ModelCheckpoint(
    filepath='../data/ruggero-detector.hdf5',
    verbose=1,
    save_best_only=True,
)

# train, validating on the held-out validation split
model.fit(
    x=train_data_cat,
    y=train_targets,
    batch_size=32,
    epochs=2,
    validation_data=(valid_data_cat, valid_targets),
    callbacks=[checkpointer],
)

The accuracy of the model is still very high. A key difference here is that I built the network *without specifying how long the input sequences are*. Therefore, I can now feed a sequence of any length to the network, and it will output the probability.

To test what the network is now able to detect, I want to look at the prediction of the network as a function of where we are in the string.

In [None]:
sentence = 'nilniarnoianciasnggerosnmidnsiunesiukljnedlvkjnsldkjjertninenjiinruruldjfnblsnbitn'
sentence_encoded = rd.encode_sentence(sentence, alpha_to_n=rd.alpha_to_n)
test = to_categorical(sentence_encoded, num_classes=rd.nletters)

# iterate over growing prefixes of the sentence: the first 2 letters,
# the first 3 letters, ... up to all letters but the last one
ranges = range(2, len(sentence))
N = len(ranges)
predictions = np.zeros(N)
for i in ranges:
    # The prefix is wrapped into a batch of one sequence. The model ends in
    # Dense(1), so predict returns a (1, 1) array; take the scalar explicitly
    # because storing a size-1 2-D array into a scalar slot is deprecated in NumPy.
    predictions[i-2] = model.predict(np.expand_dims(test[:i, :], axis=0))[0, 0]

# response as a function of the prefix length, with the letters as tick labels
fig = plt.figure(figsize=(15,3))
plt.plot(ranges, predictions)
plt.xticks(np.arange(len(sentence)), sentence)
plt.ylabel("Response")
plt.show()

This is a fantastic result. I wonder now if these results can improve even further if I provide noisy data as training data.

In [None]:
# build our noisy data set: same sizes as before, plus the `noise` argument
ntrain = 10000
nvalid = 2000
ntest = 2000
sentence_length = 80
noise = 2.0
(train_noisy_data, train_noisy_targets,
 valid_noisy_data, valid_noisy_targets,
 test_noisy_data, test_noisy_targets) = rd.prepare_data(
    sentence_length, ntrain, nvalid, ntest, rd.alpha_to_n, noise=noise)

# convert to categorical (one-hot) encoding
# num_classes is pinned to rd.nletters so that every split gets the same last
# dimension as the model input, instead of letting to_categorical infer it
# from the largest index that happens to occur in each split
train_noisy_data_cat = to_categorical(train_noisy_data, num_classes=rd.nletters)
valid_noisy_data_cat = to_categorical(valid_noisy_data, num_classes=rd.nletters)
test_noisy_data_cat = to_categorical(test_noisy_data, num_classes=rd.nletters)

In [None]:
# Same architecture as the first detector (LSTM + one sigmoid neuron),
# this time trained on the noisy data set.
lstm_units = 32
model_noisy = Sequential([
    LSTM(lstm_units, return_sequences=False, input_shape=(None, rd.nletters)),
    Dense(1, activation='sigmoid'),
])

# binary target -> binary cross-entropy, tracked together with accuracy
model_noisy.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# checkpoint callback writing the model to disk during training
# NOTE(review): this is the same file path used for the first model, so that
# checkpoint gets overwritten here -- confirm this is intended.
checkpointer = ModelCheckpoint(
    filepath='../data/ruggero-detector.hdf5',
    verbose=1,
    save_best_only=True,
)

# train, validating on the noisy validation split
model_noisy.fit(
    x=train_noisy_data_cat,
    y=train_noisy_targets,
    batch_size=32,
    epochs=10,
    validation_data=(valid_noisy_data_cat, valid_noisy_targets),
    callbacks=[checkpointer],
)

In [None]:
def network_response(m, data) :
    """
    Evaluates the network on every proper prefix of the input sequences.

    For each prefix length k = 1 .. sentence_length-1, the first k steps of
    all sequences are passed to `m.predict` as one batch, so that all
    sentences are processed in parallel.

    Parameters
    ----------
    m : object exposing `predict(batch)`
        Called once per prefix length; its squeezed output must hold one
        value per sentence.
    data : array
        Indexed as data[sentence, step, ...].

    Returns
    -------
    numpy.ndarray of shape (nsentences, sentence_length-1)
        Column k-1 holds the response after the first k steps. The full
        sequence (all sentence_length steps) is not evaluated.
    """
    n_rows, n_steps = data.shape[0], data.shape[1]

    # one column per prefix length
    response = np.zeros((n_rows, n_steps - 1))
    for column, prefix_length in enumerate(range(1, n_steps)) :
        response[:, column] = m.predict(data[:, :prefix_length]).squeeze()

    return response

In [None]:
def encode_batch_sentences(sentences, alpha_to_n) :
    """
    One-hot encodes a batch of equally long sentences.

    Parameters
    ----------
    sentences : sequence of str
        Sentences to encode; all of them must have the same length.
    alpha_to_n : sized lookup (presumably a dict from letter to index)
        Passed on to `rd.encode_sentence`; its size sets the number of
        one-hot classes.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(sentences), sentence_length, len(alpha_to_n)).
        An empty batch yields an array of shape (0, 0, len(alpha_to_n)).

    Raises
    ------
    ValueError
        If the sentences do not all have the same length.
    """
    # prepare the output data structure
    nletters = len(alpha_to_n)
    nsentences = len(sentences)

    # nothing to encode: return an empty batch instead of failing on sentences[0]
    if nsentences == 0 :
        return np.zeros((0, 0, nletters))

    # all sentences must fit the same (sentence_length, nletters) slot; checking
    # up front also prevents a length-1 sentence from being silently broadcast
    sentence_length = len(sentences[0])
    if any(len(sentence) != sentence_length for sentence in sentences) :
        raise ValueError("all sentences must have the same length")

    sentences_encoded = np.zeros((nsentences, sentence_length, nletters))

    # encode them
    for i, sentence in enumerate(sentences) :
        # use the mapping given by the caller rather than the module-level one
        s = rd.encode_sentence(sentence, alpha_to_n=alpha_to_n)
        sentences_encoded[i,:,:] = to_categorical(s, num_classes=nletters)

    # and return
    return sentences_encoded

In [None]:
# Two probe sentences: one containing the target, one without it
sentence_with_target = 'nilniarnoianciasnmidruggeronsiunesiukljnedlvkjnsldkjjertninenjiinldjfnblsnbitn'
sentence_without_target = 'nilniarnoianciasnmidrqrotbnznsiunesukljnedlvkjnsldkjjertninenjiinldjfnblsnbitn'

# Encode both as one batch (row 0: without target, row 1: with target) and
# get the response of the noisy model along each sentence
sentences_encoded = encode_batch_sentences(
    [sentence_without_target, sentence_with_target], alpha_to_n=rd.alpha_to_n)
predictions = network_response(model_noisy, sentences_encoded)

# One panel per sentence, with the letters of the sentence as tick labels
fig, axes = plt.subplots(2, 1, figsize=(15,6))
panels = zip(axes, (sentence_without_target, sentence_with_target), predictions)
for ax, text, response in panels:
    ax.plot(range(1, len(text)), response)
    ax.set_xticks(np.arange(len(text)))
    ax.set_xticklabels(text)
    ax.set_ylabel("Response")

plt.show()

Okay, this finally works very well, showing that indeed the network responds clearly to the sequence that contains the target. It is important at this point to remind ourselves of the fact that the network never knew what the target was.

The next step in this journey is to try to identify the target by the response of the network.

In [None]:
# `predictions` holds the responses of the two probe sentences, which were
# encoded in the order [sentence_without_target, sentence_with_target]
prediction_without_target = predictions[0]
prediction_with_target = predictions[1]

# difference between consecutive responses along each sentence
plt.plot(np.ediff1d(prediction_with_target), color='b', label='With target')
plt.plot(np.ediff1d(prediction_without_target), color='r', label='Without target')
plt.legend(loc='upper right')
plt.xlabel('Position in sequence')
plt.ylabel('Response differential')
plt.show()

It's clear from this plot that we can try to figure out something about the target by looking at the sequence that's around the peak of the response differential. Let's try to explore this idea.

In [None]:
# keep only the test sentences whose target flag is 1
has_target = test_noisy_targets==1

# one-hot version, used as input for the network
test_with_target = test_noisy_data_cat[has_target]

# decoded version of the same sentences, used for labelling and printing
sentences_with_target = []
for encoded in test_noisy_data[has_target].squeeze():
    sentences_with_target.append(rd.decode_sentence(encoded, n_to_alpha=rd.n_to_alpha))

In [None]:
# response of the noisy model along every test sentence that contains the target
# (note: this rebinds `predictions`, previously the two-sentence example)
predictions = network_response(m=model_noisy, data=test_with_target)

In [None]:
# let's see whether this is working or not
nsentences, sentence_length, nletters = test_with_target.shape
i = 8  # index of the sentence to inspect

# one response value per prefix length 1 .. sentence_length-1
positions = np.arange(1, sentence_length)
# Position k is labelled with the k-th letter of the sentence, i.e. the last
# letter read. The labels are trimmed to the number of ticks because matplotlib
# rejects tick/label lists of different lengths.
labels = list(sentences_with_target[i])[:len(positions)]

fig = plt.figure(figsize=(15,3))
plt.plot(positions, predictions[i])
plt.xticks(positions, labels)
plt.ylabel("Response")
plt.show()

In [None]:
# position of the largest jump between consecutive responses, for every sentence
maxdiff = np.diff(predictions, axis=1).argmax(axis=1)

# half-width of the window extracted around the jump
l = 5

# show at most the first 100 sentences; the bound is capped so that the loop
# does not run past the end when fewer sentences are available
for i in range(min(100, len(sentences_with_target))) :
    # clip the window to the sentence boundaries
    start = max(maxdiff[i]-l, 0)
    stop = min(maxdiff[i]+l, sentence_length)
    s = sentences_with_target[i][start:stop]
    print(s)

These substrings show that, in most cases, the target is present in the window that we extracted around the peak of the response differential.

One other thing that I would like to investigate is whether with the same kind of strategy we can figure out something about multiple categories.