## Build and Evaluate Sequence-to-Sequence (Encoder-Decoder) RNN to Classify Responses as Chatbot vs. Human

In [13]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, GRU
import pickle
import numpy as np
from sklearn import metrics

In [14]:
# load training data
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only point these paths at trusted, locally generated files.
# Each file is opened in a `with` block so the handle is closed even when
# unpickling raises (the previous open()/close() pairs leaked it on error).
file_name = 'data/x_r_full_tur4_shuf.pkl'
with open(file_name, 'rb') as file_obj:
    x_vec = pickle.load(file_obj)        # later converted to encoder_input_data
file_name = 'data/y_in_tur4_shuf.pkl'
with open(file_name, 'rb') as file_obj:
    y_in_vec = pickle.load(file_obj)     # later converted to decoder_input_data
file_name = 'data/y_tar_tur4_shuf.pkl'
with open(file_name, 'rb') as file_obj:
    y_tar_vec = pickle.load(file_obj)    # later converted to decoder_target_data
file_name = 'data/wrd_vec4.pkl'
with open(file_name, 'rb') as file_obj:
    wrd_vec = pickle.load(file_obj)      # later aliased as input_token_index

In [15]:
# length of the GloVe word-encoding vectors; the encoder and decoder inputs
# use the same vector length
num_encoder_tokens = num_decoder_tokens = 50

In [16]:
# convert the loaded training structures to float NumPy arrays, in the order
# encoder input, decoder input, decoder target
encoder_input_data, decoder_input_data, decoder_target_data = (
    np.asarray(loaded, dtype=float)
    for loaded in (x_vec, y_in_vec, y_tar_vec)
)

In [17]:
# drop the names of the raw loaded structures; the converted arrays are used from here on
del x_vec, y_in_vec, y_tar_vec

In [18]:
# by design a dialog holds at most 60 words; decoder sequences carry one
# appended start/stop word (input/target sequences), hence 60 + 1
max_encoder_seq_length = 60
max_decoder_seq_length = max_encoder_seq_length + 1

In [19]:
# model parameters: mini-batch size, number of training epochs,
# and LSTM hidden-layer size
batch_size, epochs, latent_dim = 64, 3, 256

In [20]:
# echo the run configuration so it is recorded alongside the results.
# NOTE: the two "unique ... tokens" lines report the word-vector length
# (num_encoder_tokens / num_decoder_tokens), not a vocabulary size.
run_summary = (
    ('Number of samples:', len(encoder_input_data)),
    ('Number of unique input tokens:', num_encoder_tokens),
    ('Number of unique output tokens:', num_decoder_tokens),
    ('Max sequence length for inputs:', max_encoder_seq_length),
    ('Max sequence length for outputs:', max_decoder_seq_length),
    ('Batch size:', batch_size),
    ('Number of epochs:', epochs),
    ('Hidden layer size:', latent_dim),
)
for label, value in run_summary:
    print(label, value)

Number of samples: 40000
Number of unique input tokens: 50
Number of unique output tokens: 50
Max sequence length for inputs: 60
Max sequence length for outputs: 61
Batch size: 64
Number of epochs: 3
Hidden layer size: 256


In [21]:
# lookup tables: the loaded word-vector structure (presumably word -> vector;
# verify against the pickle) and the one-hot target for each class label
input_token_index = wrd_vec
target_token_index = dict(hum=[0, 1], bot=[1, 0])

In [22]:
# ---- Encoder ----
# Variable-length sequence of word vectors in; only the final LSTM states are kept.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(units=latent_dim, return_state=True)
# return_state=True makes the layer return output, hidden state and cell state
encoder_outputs, *encoder_states = encoder(encoder_inputs)
state_h, state_c = encoder_states  # encoder_outputs itself is not used downstream

# ---- Decoder ----
# Starts from the encoder's final states, so it is conditioned on the encoder sequence.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(
    units=latent_dim,
    return_sequences=False,  # one output vector per sample, not one per time step
    return_state=True,
)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# Two softmax units: one probability per class for the human/bot decision
decoder_dense = Dense(units=2, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [23]:
# tie the two input tensors to the softmax output to form the trainable model
model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)

In [24]:
# Adam optimiser with categorical cross-entropy loss on the 2-way softmax
# output; accuracy is reported during training and evaluation
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Trial runs with other epoch counts: the validation-loss profile suggests
# stopping early at 3 epochs to avoid overfitting.
# validation_split=0.2 holds out 20% of the samples for validation.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

Train on 32000 samples, validate on 8000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1c89481bd30>

In [25]:
# release the training arrays; the same names are reused for the test set
del encoder_input_data
del decoder_input_data
del decoder_target_data

In [26]:
# load test data
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only point these paths at trusted, locally generated files.
# Each file is opened in a `with` block so the handle is closed even when
# unpickling raises (the previous open()/close() pairs leaked it on error).
file_name = 'data/x_r_full_ts_tur4_shuf.pkl'
with open(file_name, 'rb') as file_obj:
    x_vec = pickle.load(file_obj)        # later converted to encoder_input_data
file_name = 'data/y_in_ts_tur4_shuf.pkl'
with open(file_name, 'rb') as file_obj:
    y_in_vec = pickle.load(file_obj)     # later converted to decoder_input_data
file_name = 'data/y_tar_ts_tur4_shuf.pkl'
with open(file_name, 'rb') as file_obj:
    y_tar_vec = pickle.load(file_obj)    # later converted to decoder_target_data

In [27]:
# convert the loaded test structures to float NumPy arrays, in the order
# encoder input, decoder input, decoder target
encoder_input_data, decoder_input_data, decoder_target_data = (
    np.asarray(loaded, dtype=float)
    for loaded in (x_vec, y_in_vec, y_tar_vec)
)

In [28]:
# drop the names of the raw loaded structures; the converted arrays are used from here on
del x_vec
del y_in_vec
del y_tar_vec

In [35]:
# score the test set: per-sample softmax outputs, then the aggregate
# [loss, accuracy] pair (accuracy being the metric passed to compile)
test_inputs = [encoder_input_data, decoder_input_data]
turing_out = model.predict(x=test_inputs)
turing_score = model.evaluate(x=test_inputs, y=decoder_target_data)



In [36]:
# turing_score is indexed as [loss, accuracy]
for template, position in (
    ('Turing loss performance: {:4.3f}', 0),
    ('Turing acc performance: {:6.3f}', 1),
):
    print(template.format(turing_score[position]))

Turing loss performance: 0.012
Turing acc performance:  0.998


In [39]:
# Reduce the 2-column targets and softmax outputs to a single binary label by
# reading column 1. With target_token_index = {'hum': [0, 1], 'bot': [1, 0]},
# label 1 presumably means "human" (confirm how the target pickles were encoded).
y_test_class = decoder_target_data[:, 1]
# Vectorised 0.5 threshold instead of a per-element Python loop with round().
# For probabilities in [0, 1] this yields the same labels: round() maps exactly
# 0.5 to 0 (round-half-to-even), and so does the strict `>` comparison.
y_pred_class = (turing_out[:, 1] > 0.5).astype(int)

In [40]:
# evaluate metrics: confusion matrix, recall, precision, F1 score
# (the score functions are called with their default settings)
confusion_turing = metrics.confusion_matrix(y_true=y_test_class, y_pred=y_pred_class)
recall_turing = metrics.recall_score(y_true=y_test_class, y_pred=y_pred_class)
precision_turing = metrics.precision_score(y_true=y_test_class, y_pred=y_pred_class)
F1_turing = metrics.f1_score(y_true=y_test_class, y_pred=y_pred_class)

In [41]:
# report the confusion matrix followed by the three summary scores
print('Confusion Matrix:')
print(confusion_turing)
print()
for template, score in (
    ('Turing recall: {:8.3f}', recall_turing),
    ('Turing precision: {:5.3f}', precision_turing),
    ('Turing F1 score: {:6.3f}', F1_turing),
):
    print(template.format(score))

Confusion Matrix:
[[20020     0]
 [   70 19910]]

Turing recall:    0.996
Turing precision: 1.000
Turing F1 score:  0.998


## The seq2seq chatbot detector performs very strongly on this data. The present bot data does not reflect expected reality, but as a proof of concept the results suggest that the method is viable.