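This notebook builds, trains, and runs a customer-service chatbot. Intent classification uses a bag-of-words neural network; a Word2Vec model is also trained on the prompts to explore word vectors.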

In [1]:

from os import path, name, system
from nltk.stem.lancaster import LancasterStemmer
from random import choice, randint
import numpy as np 
import tensorflow as tf
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk, pickle, json, re, string, tflearn, warnings
warnings.filterwarnings("ignore")


  return f(*args, **kwds)
Instructions for updating:
non-resource variables are not supported in the long term
curses is not supported on this machine (please install/reinstall curses for an optimal experience)


In [3]:
# Load the intent definitions; later cells read data[label]['patterns'] and
# data[label]['responses'].
# The encoding is pinned to UTF-8 so the file decodes the same way on every
# platform instead of depending on the locale's default codec.
with open('cs_prompts.json', encoding='utf-8') as file:
    data = json.load(file)

In [12]:
def preprocess_train_data(data):
    """Convert the intent dictionary into bag-of-words training arrays.

    Args:
        data: Mapping of intent label -> dict holding a 'patterns' list of
            example sentences (strings).

    Returns:
        A 4-tuple ``(words, labels, training, output)``:
            words: sorted list of the unique stemmed, lower-cased tokens found
                in the patterns (the tokens ',', '.', '?' and '!' are skipped).
            labels: sorted list of the intent labels.
            training: array with one row per pattern and one column per entry
                of ``words``; 1 where the word occurs in the pattern, else 0.
            output: array with one one-hot row per pattern marking its label's
                position in ``labels``.
    """
    stemmer = LancasterStemmer()

    # Punctuation tokens that must not become vocabulary entries.
    ignored_tokens = {',', '.', '?', '!'}

    labels = sorted(data.keys())

    # Tokenize every pattern once, remembering which label it belongs to.
    docs_x = []
    docs_y = []
    for label in data:
        for pattern in data[label]['patterns']:
            docs_x.append(nltk.word_tokenize(pattern))
            docs_y.append(label)

    # Vocabulary: unique stems of all non-punctuation tokens, in sorted order.
    words = sorted({
        stemmer.stem(token.lower())
        for doc in docs_x
        for token in doc
        if token not in ignored_tokens
    })

    # Dict lookup instead of labels.index() for every document.
    label_position = {label: i for i, label in enumerate(labels)}

    training = []
    output = []
    for doc, label in zip(docs_x, docs_y):
        # Set membership keeps the vocabulary pass linear in len(words).
        stemmed = {stemmer.stem(token.lower()) for token in doc}
        training.append([1 if w in stemmed else 0 for w in words])

        # One-hot target row (float zeros, matching np.zeros in the template).
        output_row = [0.0] * len(labels)
        output_row[label_position[label]] = 1
        output.append(output_row)

    return words, labels, np.array(training), np.array(output)

In [4]:
# Build the Word2Vec corpus: one whitespace-split token list per example
# pattern, across every intent in `data`.
sentences = [
    pattern.split()
    for intent in data.values()
    for pattern in intent['patterns']
]

In [8]:
# Train a skip-gram (sg=1) Word2Vec model on the tokenized patterns with
# 32-dimensional vectors and a context window of 3.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs` -- confirm the installed version.
w2v_model = Word2Vec(
    sentences,
    size=32,
    window=3,
    sg=1,
    iter=10,
)

In [9]:
# Spot-check the trained model by displaying the vector learned for 'account'.
w2v_model.wv['account']

array([ 0.00454417, -0.00302309,  0.01508945, -0.01440222,  0.00749125,
       -0.00686202,  0.01226617, -0.00274484, -0.0072758 , -0.00479306,
       -0.01184115,  0.01131314,  0.00633893, -0.01220932,  0.01088379,
        0.00815971,  0.01139953,  0.00808345,  0.01503804, -0.01106866,
       -0.0108741 , -0.01006933, -0.00208098,  0.00145122,  0.00448123,
       -0.00325523,  0.00813822,  0.00424854,  0.00994768, -0.0082657 ,
       -0.00750369,  0.00569355], dtype=float32)

In [10]:
# Expose the model's word vectors (`wv`) under a short name so a vector can be
# fetched by indexing with a word, e.g. vector_dict['account'].
vector_dict = w2v_model.wv

In [11]:
# Display the vector for 'account' through the alias; it should match the
# value shown for w2v_model.wv['account'].
vector_dict['account']

array([ 0.00454417, -0.00302309,  0.01508945, -0.01440222,  0.00749125,
       -0.00686202,  0.01226617, -0.00274484, -0.0072758 , -0.00479306,
       -0.01184115,  0.01131314,  0.00633893, -0.01220932,  0.01088379,
        0.00815971,  0.01139953,  0.00808345,  0.01503804, -0.01106866,
       -0.0108741 , -0.01006933, -0.00208098,  0.00145122,  0.00448123,
       -0.00325523,  0.00813822,  0.00424854,  0.00994768, -0.0082657 ,
       -0.00750369,  0.00569355], dtype=float32)

In [13]:
# Load the processed training data from the pickle cache when it is still
# current; otherwise process `data` again and rewrite the cache.
# The cache counts as stale when cs_prompts.json was modified after the cache
# was written, so edited prompts are never paired with an outdated vocabulary.
# NOTE: pickle.load can execute arbitrary code -- only load a data.pickle that
# this notebook produced itself.
CACHE_FILE = 'data.pickle'
PROMPTS_FILE = 'cs_prompts.json'

cache_is_current = path.exists(CACHE_FILE) and (
    not path.exists(PROMPTS_FILE)
    or path.getmtime(CACHE_FILE) >= path.getmtime(PROMPTS_FILE)
)

if cache_is_current:
    with open(CACHE_FILE, 'rb') as file:
        words, labels, training, output = pickle.load(file)
else:
    words, labels, training, output = preprocess_train_data(data)
    with open(CACHE_FILE, 'wb') as file:
        pickle.dump((words, labels, training, output), file)

In [16]:
# Run this cell to create and train a new model

# Let TensorFlow grow GPU memory on demand. Looping over the device list keeps
# the cell working on machines without a GPU (indexing element 0 of an empty
# list raised IndexError) and applies the same setting to every GPU.
physical_devices = tf.config.list_physical_devices('GPU')
try:
    for device in physical_devices:
        tf.config.experimental.set_memory_growth(device, True)
except RuntimeError as err:
    # Raised once the devices are already initialized, e.g. when re-running
    # this cell in the same kernel; the existing setting stays in effect.
    print(err)

# Network: bag-of-words input -> three fully connected layers of 8 units ->
# softmax over the intent labels. tflearn.regression is used with its default
# settings (the training log reports Adam as the optimizer).
tf.compat.v1.reset_default_graph()
net = tflearn.input_data(shape=[None, len(training[0])])
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, len(output[0]), activation='softmax')
net = tflearn.regression(net)

# Train and save the weights so the "load" cell can restore them later.
model = tflearn.DNN(net)
model.fit(training, output, n_epoch=250, batch_size=8, show_metric=True)
model.save('model.tflearn')


Training Step: 1599  | total loss: [1m[32m0.44627[0m[0m | time: 0.056s
| Adam | epoch: 200 | loss: 0.44627 - acc: 0.8385 -- iter: 56/60
Training Step: 1600  | total loss: [1m[32m0.43179[0m[0m | time: 0.064s
| Adam | epoch: 200 | loss: 0.43179 - acc: 0.8547 -- iter: 60/60
--
INFO:tensorflow:c:\Users\owner\Documents\GitHub\CustomerServiceBot-RW\cs-bagofwords\model.tflearn is not in all_model_checkpoint_paths. Manually adding it.


In [None]:
# Run this cell to load a previously trained model

# Let TensorFlow grow GPU memory on demand. Looping over the device list keeps
# the cell working on machines without a GPU (indexing element 0 of an empty
# list raised IndexError) and applies the same setting to every GPU.
physical_devices = tf.config.list_physical_devices('GPU')
try:
    for device in physical_devices:
        tf.config.experimental.set_memory_growth(device, True)
except RuntimeError as err:
    # Raised once the devices are already initialized, e.g. when re-running
    # this cell in the same kernel; the existing setting stays in effect.
    print(err)

# Rebuild the same layer stack as the training cell before restoring weights:
# bag-of-words input -> three fully connected layers of 8 units -> softmax.
tf.compat.v1.reset_default_graph()
net = tflearn.input_data(shape=[None, len(training[0])])
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, len(output[0]), activation='softmax')
net = tflearn.regression(net)

model = tflearn.DNN(net)
model.load('model.tflearn')


In [None]:
def bag_of_words(s, words, stemmer):
    """Encode a sentence as a bag-of-words vector over ``words``.

    Args:
        s: Input sentence (string); it is tokenized with nltk.word_tokenize.
        words: Vocabulary list; position i of the result corresponds to
            words[i].
        stemmer: Object with a ``stem(str)`` method, applied to each
            lower-cased token before matching against ``words``.

    Returns:
        1-D float NumPy array of length ``len(words)`` holding 1 where the
        vocabulary word occurs in the sentence and 0 elsewhere.
    """
    # Stem once into a set so each vocabulary entry is checked in O(1)
    # instead of rescanning the vocabulary for every token.
    present = {stemmer.stem(token.lower()) for token in nltk.word_tokenize(s)}

    bag = np.zeros(len(words))
    for i, w in enumerate(words):
        if w in present:
            bag[i] = 1
    return bag

In [None]:
def clear():
    """Clear the terminal by running the platform's shell command.

    Runs 'cls' when os.name is 'nt' (Windows) and 'clear' on every other
    platform (macOS, Linux). Returns None.
    """
    command = 'cls' if name == 'nt' else 'clear'
    system(command)

In [22]:
def chat(confidence_threshold=0.66):
    """Run an interactive console chat loop backed by the trained intent model.

    Each message has its punctuation removed, is encoded with bag_of_words,
    and is classified by the module-level ``model``. When the highest class
    probability exceeds ``confidence_threshold`` a random response for the
    predicted label is printed; otherwise the bot asks the user to rephrase.

    The loop ends when the user types 'quit', 'exit' or 'stop', or right after
    the bot answers a message classified as 'goodbye'.

    Args:
        confidence_threshold: Minimum top probability (exclusive) required to
            accept a prediction. Defaults to 0.66.

    Relies on the module-level names ``model``, ``words``, ``labels``, ``data``,
    ``bag_of_words`` and ``clear``.
    """
    clear()
    greetings = ["Hello! How can I help you today?", "Hello! What do you need help with today?", "Hi there, how can I help?"]
    print(choice(greetings))

    stemmer = LancasterStemmer()

    # Compiled once up front rather than on every message.
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))

    while True:
        inp = punctuation.sub('', input("You: "))
        # Echo the cleaned message so it appears in the printed transcript.
        print(inp)

        if inp.strip().lower() in ('quit', 'exit', 'stop'):
            break

        results = model.predict([bag_of_words(inp, words, stemmer)])

        # Low confidence: ask for a rephrase instead of guessing.
        if np.max(results) <= confidence_threshold:
            print("Bot: I'm sorry, I didn't quite get that. Can you rephrase your question?")
            continue

        prediction = labels[np.argmax(results)]
        response = choice(data[prediction]['responses'])
        print('Bot: ', response)

        # Compare by value: `is` tests object identity and is unreliable for
        # strings that come from loaded data.
        if prediction == 'goodbye':
            break



In [23]:
# Start the interactive session (enter 'quit', 'exit' or 'stop' to end it).
chat()

Hello! What do you need help with today?
how to open new account
Bot:  To open a new account, you'll need proof of identification and an initial deposit of at least $100.00. Please call the customer service line to speak with a representative. You can also use our website to create one using the Account Creation tool or to find a location near you.
what about closing an account
Bot:  To open a new account, you'll need proof of identification and an initial deposit of at least $100.00. Please call the customer service line to speak with a representative. You can also use our website to create one using the Account Creation tool or to find a location near you.
closing an account
Bot:  To open a new account, you'll need proof of identification and an initial deposit of at least $100.00. Please call the customer service line to speak with a representative. You can also use our website to create one using the Account Creation tool or to find a location near you.
i want to close my account
B