In [1]:
import pandas as pd
import pickle

# Load the question/answer data; the first CSV column is used as the index.
# (Original author's note: the frame had 3725 rows x 2 columns.)
df = pd.read_csv('conversation.csv', index_col=0)

# The data contains the mis-encoded sequence 'Ã‚â€”' (a UTF-8 encoding issue).
# Replace it with a single space. regex=False forces a literal match on every
# pandas version (the default value of `regex` changed in pandas 2.0).
df["cleaned_question"] = df["question"].str.replace('Ã‚â€”', ' ', regex=False)
df["cleaned_answer"] = df["answer"].str.replace('Ã‚â€”', ' ', regex=False)

# Convert question and answer to a list of (question, answer) tuple pairs.
# zip() pairs the two columns row by row without the per-row Series that
# iterrows() builds, and avoids binding a variable named `tuple`, which would
# shadow the builtin for the rest of the kernel session.
pairs = list(zip(df["cleaned_question"], df["cleaned_answer"]))

# Serialize the list of pairs and save it to a pickle file (for use in app.py)
with open('pairs.pkl', 'wb') as f:
    pickle.dump(pairs, f)

In [2]:
import numpy as np
import re

# Sentences: raw questions, and answers wrapped in <START>/<END> tags
input_docs, target_docs = [], []
# Vocabulary accumulators (converted to sorted lists once the loop is done)
input_tokens, target_tokens = set(), set()

# Only the first 1000 pairs are used
for question, answer in pairs[:1000]:
  # The question is stored untouched
  input_docs.append(question)
  # Keep only word/apostrophe runs of the answer, then add the tags
  answer_words = re.findall(r"[\w']+", answer)
  tagged_answer = '<START> ' + " ".join(answer_words) + ' <END>'
  target_docs.append(tagged_answer)

  # Grow each vocabulary with the tokens that contain at least one
  # alphanumeric character
  input_tokens.update(
      word for word in re.findall(r"[\w']+", question)
      if any(ch.isalnum() for ch in word))
  target_tokens.update(
      word for word in tagged_answer.split()
      if any(ch.isalnum() for ch in word))

# Freeze the vocabularies into a deterministic (sorted) order
input_tokens = sorted(input_tokens)
target_tokens = sorted(target_tokens)

# Vocabulary sizes for the encoder and decoder sides
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)

# Longest sentence (counted in regex tokens) on each side
max_encoder_seq_length = max(
    len(re.findall(r"[\w']+", doc)) for doc in input_docs)
max_decoder_seq_length = max(
    len(re.findall(r"[\w']+", doc)) for doc in target_docs)

# token -> index lookups, following the order of the sorted vocabularies
input_features_dict = {
    token: position for position, token in enumerate(input_tokens)}
target_features_dict = {
    token: position for position, token in enumerate(target_tokens)}

# Zero-filled 3-D arrays indexed as [sentence, timestep, token index]
encoder_shape = (len(input_docs), max_encoder_seq_length, num_encoder_tokens)
decoder_shape = (len(input_docs), max_decoder_seq_length, num_decoder_tokens)
encoder_input_data = np.zeros(encoder_shape, dtype='float32')
decoder_input_data = np.zeros(decoder_shape, dtype='float32')
decoder_target_data = np.zeros(decoder_shape, dtype='float32')

#Populate encoder_input_data, decoder_input_data, and decoder_target_data
# with one-hot vectors: [sentence, timestep, token index] = 1.
for doc_index, (input_doc, target_doc) in enumerate(zip(input_docs, target_docs)):
  # Encode only tokens that are in the vocabulary. A token made up solely of
  # apostrophes/underscores matches the regex but is not a key of the
  # feature dicts when the vocabulary excludes tokens without an alphanumeric
  # character, so an unguarded lookup would raise KeyError.
  encoder_words = [word for word in re.findall(r"[\w']+", input_doc)
                   if word in input_features_dict]
  for timestep, token in enumerate(encoder_words):
    encoder_input_data[doc_index, timestep, input_features_dict[token]] = 1.

  decoder_words = [word for word in target_doc.split()
                   if word in target_features_dict]
  for timestep, token in enumerate(decoder_words):
    decoder_input_data[doc_index, timestep, target_features_dict[token]] = 1.
    # The target is the decoder input shifted one step earlier, so the token
    # at timestep 0 never appears in decoder_target_data.
    if timestep > 0:
      decoder_target_data[doc_index, timestep - 1, target_features_dict[token]] = 1.

In [3]:
from tensorflow import keras
from keras.layers import Input, LSTM, Dense, Masking
from keras.models import Model

# Hyperparameters
dimensionality = 256  # passed as the first argument of both LSTM layers below
batch_size = 50       # not used in this part of the cell
epochs = 1000         # not used in this part of the cell

#Encoder training setup
# Input of shape (None, num_encoder_tokens): the first dimension is left
# unspecified, the second is the encoder vocabulary size.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
# return_state=True makes the layer return its final states along with its output.
encoder_lstm = LSTM(dimensionality, return_state=True)
encoder_outputs, state_hidden, state_cell = encoder_lstm(encoder_inputs)
# Only the two states are passed on to the decoder; encoder_outputs is not
# used again in this cell.
encoder_states = [state_hidden, state_cell]

#Decoder training setup:
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# return_sequences=True: one output per timestep, which the Dense layer below consumes.
decoder_lstm = LSTM(dimensionality, return_sequences=True, return_state=True)
# The decoder starts from the encoder's final states; its own returned states
# are discarded here.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Softmax over the decoder vocabulary
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

#Build the training model:
# Inputs: [encoder_inputs, decoder_inputs]; output: the softmax decoder_outputs.
training_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

#Compile the model:
# RMSprop optimizer, categorical cross-entropy against the one-hot targets,
# accuracy reported as a metric.
# No `sample_weight_mode` is passed: no sample weights are supplied to fit(),
# so the argument had no effect, and Keras versions whose compile() does not
# accept it would raise a TypeError.
training_model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

print("Training the model:\n")
#Train the model:
# validation_split=0.2 holds out a fraction of the samples for validation.
training_model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2)

# Save for use in the .py file. The file name is kept as-is because code
# outside this notebook is expected to load it by this name.
training_model.save('training_model.h5')

#Display the model summary
training_model.summary()

Training the model:

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71

  saving_api.save_model(


# New section