**MODELLING**

In [1]:
import numpy as np 
import pandas as pd 
import os

In [2]:
import tensorflow as tf
import os
from tensorflow.python.keras.layers import Layer
from tensorflow.python.keras import backend as K



In [7]:
class AttentionLayer(Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)
    def build(self, input_shape):
        assert isinstance(input_shape, list)
        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((input_shape[0][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((input_shape[1][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((input_shape[0][2], 1)),
                                   initializer='uniform',
                                   trainable=True)
        super(AttentionLayer, self).build(input_shape)  # Be sure to call this at the end



In [20]:
    def call(self, inputs, verbose=False):
        assert type(inputs) == list
        encoder_out_seq, decoder_out_seq = inputs
        if verbose:
            print('encoder_out_seq>', encoder_out_seq.shape)
            print('decoder_out_seq>', decoder_out_seq.shape)
        def energy_step(inputs, states):
            assert_msg = "States must be an iterable. Got {} of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg
            en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]
            de_hidden = inputs.shape[-1]
            W_a_dot_s = K.dot(encoder_out_seq, self.W_a)
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)
            if verbose:
                print('Ws+Uh>', Ws_plus_Uh.shape)
            e_i = K.squeeze(K.dot(Ws_plus_Uh, self.V_a), axis=-1)
            # <= batch_size, en_seq_len
            e_i = K.softmax(e_i)
            if verbose:
                print('ei>', e_i.shape)

            return e_i, [e_i]
        def context_step(inputs, states):

            """ Step function for computing ci using ei """

            assert_msg = "States must be an iterable. Got {} of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg

            # <= batch_size, hidden_size
            c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
            if verbose:
                print('ci>', c_i.shape)
            return c_i, [c_i]

        fake_state_c = K.sum(encoder_out_seq, axis=1)
        fake_state_e = K.sum(encoder_out_seq, axis=2)  # <= (batch_size, enc_seq_len, latent_dim

        """ Computing energy outputs """
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        last_out, e_outputs, _ = K.rnn(
              energy_step, decoder_out_seq, [fake_state_e],
        )

        """ Computing context vectors """
        last_out, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )

        return c_outputs, e_outputs


In [21]:
def compute_output_shape(self, input_shape):
        """ Outputs produced by the layer """
        return [
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[1][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]

In [23]:
import re
lines = open('movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
convers = open('movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

In [24]:
len(lines)

304714

**DATA PRE-PROCESSING**

In [25]:
exchn = []
for conver in convers:
    exchn.append(conver.split(' +++$+++ ')[-1][1:-1].replace("'", " ").replace(",","").split())

In [26]:
diag = {}
for line in lines:
    diag[line.split(' +++$+++ ')[0]] = line.split(' +++$+++ ')[-1]

In [27]:
del(lines, convers, conver, line)

**CREATING LIST OF QUESTIONS AND ANSWERS**

In [28]:
questions = []
answers = []

In [29]:
for conver in exchn:
    for i in range(len(conver) - 1):
        questions.append(diag[conver[i]])
        answers.append(diag[conver[i+1]])

In [30]:
del(diag, exchn, conver, i)

In [31]:
sorted_ques = []
sorted_ans = []

In [32]:
for i in range(len(questions)):
    if len(questions[i]) < 13:
        sorted_ques.append(questions[i])
        sorted_ans.append(answers[i])

**CLEANING OF DATASET**

In [33]:
def clean_text(txt):
    txt = txt.lower()
    txt = re.sub(r"i'm", "i am", txt)
    txt = re.sub(r"he's", "he is", txt)
    txt = re.sub(r"she's", "she is", txt)
    txt = re.sub(r"that's", "that is", txt)
    txt = re.sub(r"what's", "what is", txt)
    txt = re.sub(r"where's", "where is", txt)
    txt = re.sub(r"\'ll", " will", txt)
    txt = re.sub(r"\'ve", " have", txt)
    txt = re.sub(r"\'re", " are", txt)
    txt = re.sub(r"\'d", " would", txt)
    txt = re.sub(r"won't", "will not", txt)
    txt = re.sub(r"can't", "can not", txt)
    txt = re.sub(r"[^\w\s]", "", txt)
    return txt

In [34]:
clean_ques = []
clean_ans = []

In [35]:
for line in sorted_ques:
    clean_ques.append(clean_text(line))
        
for line in sorted_ans:
    clean_ans.append(clean_text(line))

In [36]:
del(answers, questions, line)

In [37]:
for i in range(len(clean_ans)):
    clean_ans[i] = ' '.join(clean_ans[i].split()[:11])

In [38]:
del(sorted_ans, sorted_ques)

In [39]:
clean_ans=clean_ans[:30000]
clean_ques=clean_ques[:30000]

**CREATING VOCABULARY**

In [40]:
word2count = {}

In [41]:
for line in clean_ques:
    for word in line.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1

In [42]:
for line in clean_ans:
    for word in line.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1

In [43]:
del(word, line)

In [44]:
thresh = 5

In [45]:
vocab = {}
word_num = 0

In [46]:
for word, count in word2count.items():
    if count >= thresh:
        vocab[word] = word_num
        word_num += 1

In [47]:
del(word2count, word, count, thresh)       
del(word_num)        

**ADDING SOS AND EOS**

In [48]:
for i in range(len(clean_ans)):
    clean_ans[i] = '<SOS> ' + clean_ans[i] + ' <EOS>'

In [49]:
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
x = len(vocab)
for token in tokens:
    vocab[token] = x
    x += 1

In [50]:
vocab['cameron'] = vocab['<PAD>']
vocab['<PAD>'] = 0

In [51]:
del(token, tokens) 
del(x)

In [52]:
inv_vocab = {w:v for v, w in vocab.items()}

In [53]:
del(i)

**CREATING ENCODER AND DECODER INPUTS**

In [56]:
encoder_inp = []
for line in clean_ques:
    lst = []
    for word in line.split():
        if word not in vocab:
            lst.append(vocab['<OUT>'])
        else:
            lst.append(vocab[word])
        
    encoder_inp.append(lst)

In [57]:
decoder_inp = []
for line in clean_ans:
    lst = []
    for word in line.split():
        if word not in vocab:
            lst.append(vocab['<OUT>'])
        else:
            lst.append(vocab[word])        
    decoder_inp.append(lst)

In [58]:
del(clean_ans, clean_ques, line, lst, word)

In [59]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [60]:
encoder_inp = pad_sequences(encoder_inp, 13, padding='post', truncating='post')
decoder_inp = pad_sequences(decoder_inp, 13, padding='post', truncating='post')

In [61]:
decoder_final_output = []
for i in decoder_inp:
    decoder_final_output.append(i[1:]) 

In [62]:
decoder_final_output = pad_sequences(decoder_final_output, 13, padding='post', truncating='post')

In [63]:
del(i)

In [64]:
VOCAB_SIZE = len(vocab)
MAX_LEN = 13

In [65]:
print(decoder_final_output.shape, decoder_inp.shape, encoder_inp.shape, len(vocab), len(inv_vocab), inv_vocab[0])

(30000, 13) (30000, 13) (30000, 13) 3027 3027 <PAD>


In [66]:
inv_vocab[16]

'they'

In [73]:
decoder_final_input = []
print(len(decoder_final_input), MAX_LEN, VOCAB_SIZE)
decoder_output_data = np.zeros((len(decoder_final_input), MAX_LEN, VOCAB_SIZE), dtype="float32")
print(decoder_output_data.shape)

0 13 3027
(0, 13, 3027)


In [74]:
from tensorflow.keras.utils import to_categorical
decoder_final_output = to_categorical(decoder_final_output, len(vocab))

In [75]:
decoder_final_output.shape

(30000, 13, 3027)

**GLOVE EMBEDDING**

In [76]:
embeddings_index = {}

In [77]:
with open('glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

print("Glove Loded!")

Glove Loded!


In [78]:
embedding_dimention = 50
def embedding_matrix_creater(embedding_dimention, word_index):
    embedding_matrix = np.zeros((len(word_index)+1, embedding_dimention))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
          # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
embedding_matrix = embedding_matrix_creater(50, word_index=vocab)    

In [79]:
del(embeddings_index)

In [80]:
embedding_matrix.shape

(3028, 50)

In [81]:
embedding_matrix[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

**MODEL - LSTM**

In [82]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Input, Bidirectional, Concatenate, Dropout, Attention

In [83]:
embed = Embedding(VOCAB_SIZE+1, 
                  50, 
                  
                  input_length=13,
                  trainable=True)

embed.build((None,))
embed.set_weights([embedding_matrix])

In [135]:
enc_inp = Input(shape=(13, ))

In [85]:
enc_embed = embed(enc_inp)
enc_lstm = Bidirectional(LSTM(400, return_state=True, dropout=0.05, return_sequences = True))

In [86]:
encoder_outputs, forward_h, forward_c, backward_h, backward_c = enc_lstm(enc_embed)

In [87]:
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])

In [88]:
enc_states = [state_h, state_c]

In [134]:
dec_inp = Input(shape=(13, ))
dec_embed = embed(dec_inp)
dec_lstm = LSTM(400*2, return_state=True, return_sequences=True, dropout=0.05)
output, _, _ = dec_lstm(dec_embed, initial_state=enc_states)

In [161]:
attn_layer = AttentionLayer()
attn_op, attn_state = attn_layer([encoder_outputs, output])
decoder_concat_input = keras.layers.Concatenate(axis=-1)([output, attn_op])

In [91]:
dec_dense = Dense(VOCAB_SIZE, activation='softmax')
final_output = dec_dense(decoder_concat_input)

In [92]:
model = Model([enc_inp, dec_inp], final_output)

In [93]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 13)]         0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, 13)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 13, 50)       151400      ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 bidirectional (Bidirectional)  [(None, 13, 800),    1443200     ['embedding[0][0]']          

In [94]:
import keras
import tensorflow as tf

In [95]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [96]:
model.fit([encoder_inp, decoder_inp], decoder_final_output, epochs=40, batch_size=24, validation_split=0.15)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x1d39237b340>

**INFERECE**

In [97]:
#INFERECE
model.save('chatbot.h5')
model.save_weights('chatbot_weights.h5')

In [101]:
#Attention inference
enc_model = tf.keras.models.Model(enc_inp, [encoder_outputs, enc_states])

In [100]:
decoder_state_input_h = tf.keras.layers.Input(shape=( 400 * 2,))
decoder_state_input_c = tf.keras.layers.Input(shape=( 400 * 2,))

In [102]:
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

In [103]:
decoder_outputs, state_h, state_c = dec_lstm(dec_embed , initial_state=decoder_states_inputs)

In [104]:
decoder_states = [state_h, state_c]

In [105]:
dec_model = tf.keras.models.Model([dec_inp, decoder_states_inputs],
                                      [decoder_outputs] + decoder_states)

In [107]:
from keras_preprocessing.sequence import pad_sequences
print("##########################################")
print("#       start chatting ver. 1.0          #")
print("##########################################")

##########################################
#       start chatting ver. 1.0          #
##########################################


In [172]:
prepro1 = input("you : ")
prepro1 = clean_text(prepro1)
prepro = [prepro1]

txt = []
for x in prepro:
    lst = []
    for y in x.split():
        try:
            lst.append(vocab[y])
        except:
            lst.append(vocab['<OUT>'])
    txt.append(lst)
txt = pad_sequences(txt, 13, padding='post')


###
enc_op, stat = enc_model.predict( txt )

empty_target_seq = np.zeros( ( 1 , 1) )
empty_target_seq[0, 0] = vocab['<SOS>']
stop_condition = False
decoded_translation = ''


while not stop_condition :
    
    dec_inp = Input(shape=(13, 1, ))
    enc_inp = Input(shape=(13, 1, ))

    dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + stat )
    ###
    ###########################
    #attn_op, attn_state = attn_layer([enc_op, dec_outputs])
    #decoder_concat_input = Concatenate(axis=-1)([dec_outputs, attn_op])
    #decoder_concat_input = dec_dense(decoder_concat_input)
    ###########################

    sampled_word_index = np.argmax( dec_outputs[0, -1, :] )

    sampled_word = inv_vocab[sampled_word_index] + ' '

    if sampled_word != '<EOS> ':
        decoded_translation += sampled_word           


    if sampled_word == '<EOS> ' or len(decoded_translation.split()) > 13:
        stop_condition = True

    empty_target_seq = np.zeros( ( 1 , 1 ) )  
    empty_target_seq[ 0 , 0 ] = sampled_word_index
    stat = [ h , c ] 

print("chatbot attention : ", decoded_translation )
print("==============================================")

you : hey
chatbot attention :  remember plans female any credit offense watch john john john john want john john 


**CONCLUSION**

**I use seq2seq model for this project and  got the accuaracy as 0.89 which is quite good one.I tried to build attention mechanishm in this project but about 70% work very well the other part will be done as the future modifications.**