In [1]:
from random import randint
from numpy import array
from numpy import argmax
from numpy import array_equal
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import RepeatVector
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model
from attention_decoder import AttentionDecoder
from nltk.stem import PorterStemmer
import string

import pandas as pd

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
ps = PorterStemmer()
df = pd.read_csv('test.csv')
df.head(10)

Unnamed: 0,question,answer
0,hello,HI
1,hey,Hello
2,hi,Hey
3,how are you?,"good, you?"
4,how is it going?,great
5,good,same here
6,great,that is good to hear
7,what color is the sky,blue
8,bye,bye
9,goodbye,goodbye


In [3]:
def return_sents(df_col1, df_col2):
    return [sent for sent in df.question] + [sent for sent in df.answer]

In [4]:
def return_unique_words(all_sents):
    table = str.maketrans({key: None for key in string.punctuation})
    all_words = [words.split() for words in all_sents] 
    word_list = [word.lower() for sublist in all_words for word in sublist]
    word_list = [word.translate(table) for word in word_list] 
#    word_list = [ps.stem(word) for word in word_list]
    return list(set(word_list))

In [5]:
def return_unique_words_single(sent):
    table = str.maketrans({key: None for key in string.punctuation})
    all_words = sent.split()
    word_list = [word.lower() for word in all_words]
    word_list = [word.translate(table) for word in word_list] 
#    word_list = [ps.stem(word) for word in word_list]
    return word_list

In [6]:
def df_to_df(df_col1, df_col2):
    all_sent = return_sents(df_col1, df_col2)
    word_list = return_unique_words(all_sent)
    word_list.insert(0, ' ')
    t_df = pd.DataFrame()
    t_df['word'] = word_list
    t_df['idx'] = t_df.index
    return t_df

In [7]:
t_df = df_to_df(df.question, df.answer)
t_df.head()

Unnamed: 0,word,idx
0,,0
1,question,1
2,married,2
3,lazy,3
4,enjoy,4


In [8]:
def word_to_array(sents, t_df):
    l = []
    l2 = []
    for sent in sents:
        b = []
        a = return_unique_words_single(sent)
        for w in a:
            try:
                b.append(t_df.loc[t_df.word == w, 'idx'].iloc[0])
            except:
                b.append(0)
        l.append(a)
        l2.append(b)
    return l, l2

In [9]:
q_list, q_as_array = word_to_array(df.question, t_df)
a_list, a_as_array = word_to_array(df.answer, t_df)

In [10]:
print('Question word list:\n', q_list[:10], '\n'*2,'Question array list:\n', q_as_array[:10], '\n')
print('Answer word list:\n', a_list[:10],'\n'*2, 'Answer array list:\n', a_as_array[:10],'\n')

Question word list:
 [['hello'], ['hey'], ['hi'], ['how', 'are', 'you'], ['how', 'is', 'it', 'going'], ['good'], ['great'], ['what', 'color', 'is', 'the', 'sky'], ['bye'], ['goodbye']] 

 Question array list:
 [[106], [61], [139], [27, 73, 104], [27, 91, 44, 59], [135], [34], [78, 134, 91, 28, 84], [63], [55]] 

Answer word list:
 [['hi'], ['hello'], ['hey'], ['good', 'you'], ['great'], ['same', 'here'], ['that', 'is', 'good', 'to', 'hear'], ['blue'], ['bye'], ['goodbye']] 

 Answer array list:
 [[139], [106], [61], [135, 104], [34], [120, 92], [12, 91, 135, 87, 146], [71], [63], [55]] 



In [11]:
vocab_size = len(t_df)
print('Vocab Size: ', vocab_size)

max_q_l = len(max(q_as_array,key=len))
max_a_l = len(max(a_as_array,key=len))
max_l = max(max_q_l, max_a_l)
print('Max Length of Question: ', max_q_l)
print('Max Length of Answer: ', max_a_l)

# pad documents to a max length + 1
max_length = max_l + 1
print('Max Padded Length: ', max_length)

Vocab Size:  147
Max Length of Question:  8
Max Length of Answer:  21
Max Padded Length:  22


In [12]:
padded_q_docs = pad_sequences(q_as_array, maxlen=max_length, padding='post')
print('Padded array:\n', padded_q_docs[:10])

Padded array:
 [[106   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [ 61   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [139   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [ 27  73 104   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [ 27  91  44  59   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [135   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [ 34   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [ 78 134  91  28  84   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [ 63   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [ 55   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]]


In [13]:
def array_to_string(ar):
    c = [t_df.loc[t_df.idx == i, 'word'].iloc[0] for i in ar]
    s = ' '.join(c)
    return s

In [14]:
padded_a_docs = pad_sequences(a_as_array, maxlen=max_length, padding='post')
print('Padded array:\n', padded_a_docs[:10])

Padded array:
 [[139   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [106   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [ 61   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [135 104   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [ 34   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [120  92   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [ 12  91 135  87 146   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [ 71   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [ 63   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [ 55   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]]


In [15]:
# one hot encode sequence
def one_hot_encode(sequence, n_unique):
    encoding = list()
    for value in sequence:
        vector = [0 for _ in range(n_unique)]
        vector[value] = 1
        encoding.append(vector)
    return array(encoding)

In [16]:
# decode a one hot encoded string
def one_hot_decode(encoded_seq):
    return [argmax(vector) for vector in encoded_seq]

In [17]:
# prepare data for the LSTM
def transform_xy(sequence_in, sequence_out, n_features):
    # one hot encode
    X = one_hot_encode(sequence_in, n_features)
    y = one_hot_encode(sequence_out, n_features)
    
    # reshape as 3D
    X = X.reshape((1, X.shape[0], X.shape[1]))
    y = y.reshape((1, y.shape[0], y.shape[1]))
    return X,y

In [18]:
n_features = vocab_size + 1

# define model
model = Sequential()
model.add(LSTM(150, input_shape=(max_length, n_features), return_sequences=True))
model.add(AttentionDecoder(150, n_features))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

In [19]:
for a in range(0, 99):
    for n in range(0, len(padded_q_docs)):
        # generate new random sequence
        X,y = transform_xy(padded_q_docs[n], padded_a_docs[n], n_features)
        
        # fit model for one epoch on this sequence
        model.fit(X, y, epochs=1, verbose=0)

In [20]:
for n in range(12, 20):
    X,y = transform_xy(padded_q_docs[n], padded_a_docs[n], n_features)
    yhat = model.predict(X, verbose=0)
    print('Question Array:', one_hot_decode(X[0]), '\nQuestion :', array_to_string(one_hot_decode(X[0])), '\n')
    print('Expected Res. Array:', one_hot_decode(y[0]), '\nExpected Res.:', array_to_string(one_hot_decode(y[0])), '\n')
    print('Predicted Res. Array:', one_hot_decode(yhat[0]), '\nPredicted Res.:', array_to_string(one_hot_decode(yhat[0])), '\n')
    print('\n')

Question Array: [78, 91, 100, 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 
Question : what is your name                                     

Expected Res. Array: [77, 66, 60, 104, 129, 104, 73, 19, 80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 
Expected Res.: i cannot tell you because you are a stranger                           

Predicted Res. Array: [77, 66, 60, 104, 129, 104, 73, 19, 80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 
Predicted Res.: i cannot tell you because you are a stranger                           



Question Array: [27, 32, 73, 104, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 
Question : how old are you                                     

Expected Res. Array: [12, 91, 130, 115, 104, 5, 19, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 
Expected Res.: that is not something you ask a lady                             

Predicted Res. Array: [12, 91, 130, 115, 104, 5, 19, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 
Predicted Res.: that is not

In [21]:
def sent_to_array(sent, t_df):
    a = []
    b = []
    a = return_unique_words_single(sent)
    for w in a:
        try:
            b.append(t_df.loc[t_df.word == w, 'idx'].iloc[0])
        except:
            b.append(0)
    return a, b

In [22]:
def transform_x(sequence_in, n_features):
    # one hot encode
    X = one_hot_encode(sequence_in, n_features)
    
    # reshape as 3D
    X = X.reshape((1, X.shape[0], X.shape[1]))
    return X

In [23]:
def get_response(sent, t_df):
    w, q = sent_to_array(sent, t_df)
    q_pad = pad_sequences([q], maxlen=max_length, padding='post')
    X2 = transform_x(q_pad[0], n_features)
    yhat2 = model.predict(X2, verbose=0)
    return array_to_string(one_hot_decode(yhat2[0]))

In [24]:
sent = "hello"
sent1 = "how goes it?"
sent2 = "What do you do all day?"
sent3 = "can you tell me your name?"
sent4 = "you're a mean robot"
sent5 = "bye"

In [25]:
print('User: ', sent, '\nRobot: ', get_response(sent, t_df))
print('User: ', sent1, '\nRobot: ', get_response(sent1, t_df))
print('User: ', sent2, '\nRobot: ', get_response(sent2, t_df))
print('User: ', sent3, '\nRobot: ', get_response(sent3, t_df))
print('User: ', sent4, '\nRobot: ', get_response(sent4, t_df))
print('User: ', sent5, '\nRobot: ', get_response(sent5, t_df))

User:  hello 
Robot:  hi                                          
User:  how goes it? 
Robot:  great                                          
User:  What do you do all day? 
Robot:  professional chatbox interesting                                      
User:  can you tell me your name? 
Robot:  i cannot tell you because you are a stranger                          
User:  you're a mean robot 
Robot:  well thats rude                                      
User:  bye 
Robot:  goodbye                                          
