In [1]:
import pandas as pd
import zipfile
import io
import numpy as np

from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Bidirectional
from keras.layers import GRU
from keras.layers import Dropout
from keras.layers import BatchNormalization

from keras.callbacks import ModelCheckpoint, TensorBoard


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

Using TensorFlow backend.


In [2]:
def load_dataset(filename):
    """Load the labelled utterances from an Excel workbook.

    Parameters
    ----------
    filename : str or path-like
        Workbook readable by ``pandas.read_excel``. It must contain an
        ``"intent"`` column and a ``"text"`` column.

    Returns
    -------
    tuple
        ``(intent, unique_intent, text, df)`` where ``intent`` is the
        ``"intent"`` column (a Series), ``unique_intent`` is the list of
        distinct intents in order of first appearance, ``text`` is the
        ``"text"`` column as a list and ``df`` is the whole DataFrame.

    Raises
    ------
    KeyError
        If either expected column is missing from the workbook.
    """
    df = pd.read_excel(filename)
    intent = df["intent"]
    # dict.fromkeys de-duplicates exactly like set() but keeps first-seen
    # order, so the result no longer varies with string hash randomisation
    # between kernel restarts.
    unique_intent = list(dict.fromkeys(intent))
    text = list(df["text"])
    return (intent, unique_intent, text, df)


def load_word_vectors(filepath,vocab,outfile):
    """Extract the pre-trained vectors for the words in ``vocab`` from a zip.

    Every member of the archive is read as UTF-8 text with one
    ``word v1 v2 ... vd`` entry per line (space separated). Lines whose word
    is in ``vocab`` are copied verbatim to ``outfile`` and parsed into the
    returned mapping.

    Parameters
    ----------
    filepath : str or path-like
        Zip archive containing the vector file(s).
    vocab : container of str
        Words to keep; a ``set`` gives constant-time membership tests.
    outfile : str or path-like
        Destination for the filtered subset (overwritten, written as UTF-8).

    Returns
    -------
    dict
        ``{word: numpy.ndarray(dtype=float32)}`` for every kept word. Empty
        when the archive has no members or no word matches. If a word occurs
        more than once the last occurrence wins.

    Raises
    ------
    ValueError
        If a kept line contains a component that cannot be parsed as float.
    """
    # Created once, outside the member loop, so that an empty archive returns
    # {} instead of failing and several members accumulate instead of the
    # last one overwriting the others.
    data = {}
    with zipfile.ZipFile(filepath) as zfile:
        # Explicit encoding: the platform default (e.g. cp1252) cannot
        # represent every token found in UTF-8 vector files.
        with open(outfile, "w", encoding="utf-8") as sub_vector:
            for finfo in zfile.infolist():
                # The context manager closes the wrapper and the underlying
                # zip member stream.
                with io.TextIOWrapper(zfile.open(finfo), encoding='utf-8') as textStream:
                    for line_no, line in enumerate(textStream):
                        tokens = line.rstrip().split(' ')
                        # fastText-style .vec files start with a
                        # "<num_words> <dim>" header; never treat it as a word.
                        if (line_no == 0 and len(tokens) == 2
                                and all(tok.isdigit() for tok in tokens)):
                            continue
                        word = tokens[0]
                        if word in vocab:
                            # Stream matches straight to disk rather than
                            # buffering them all in memory first.
                            sub_vector.write(line)
                            data[word] = np.asarray(tokens[1:], dtype='float32')
    return data

        
def build_embedding_matrix(tok,vectors,embedding_size):
    """Assemble the weight matrix for an embedding layer.

    Row ``i`` holds the vector of the word that ``tok.word_index`` maps to
    ``i``. Rows for words missing from ``vectors`` (and any row no word maps
    to, such as row 0 when indices start at 1) are left as zeros.

    Parameters
    ----------
    tok : object
        Anything exposing a ``word_index`` mapping of word -> integer row.
    vectors : mapping
        word -> vector of length ``embedding_size``.
    embedding_size : int
        Number of columns of the matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(tok.word_index) + 1, embedding_size)``.
    """
    word_to_row = tok.word_index
    matrix = np.zeros((len(word_to_row) + 1, embedding_size))
    for token in word_to_row:
        vec = vectors.get(token)
        if vec is None:
            # No pre-trained vector for this word: keep the zero row.
            continue
        matrix[word_to_row[token]] = vec
    return matrix


def get_vocab(sentences):
    """Return the set of distinct tokens occurring in ``sentences``.

    Each sentence is split with Keras' ``text_to_word_sequence`` using
    ``lower=False``, so tokens keep their original casing.

    Parameters
    ----------
    sentences : iterable of str
        Raw sentences to tokenise.

    Returns
    -------
    set of str
        Every distinct token; empty when ``sentences`` is empty.
    """
    vocab = set()
    # Iterate over the argument: the previous code read the notebook-level
    # global `text` and silently ignored whatever was passed in.
    for sentence in sentences:
        vocab.update(text_to_word_sequence(sentence, lower=False))
    return vocab


def prepare_training_data(text,intent,max_length,embedding_size,vectors=None,random_state=None):
    """Turn raw sentences and intent labels into train/validation arrays.

    Sentences are tokenised case-sensitively (``Tokenizer(lower=False)``),
    converted to integer sequences and padded at the end to ``max_length``.
    Labels are integer-encoded and then one-hot encoded. The result is
    shuffled and split 80/20 into training and validation sets.

    Parameters
    ----------
    text : list of str
        Input sentences.
    intent : array-like
        One label per sentence.
    max_length : int
        Length passed to ``pad_sequences`` as ``maxlen``.
    embedding_size : int
        Dimensionality of the word vectors.
    vectors : mapping, optional
        word -> vector lookup used to build the embedding matrix. When
        omitted, the notebook-level ``glove_vectors`` is used, which is what
        this function always did before the parameter existed.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``; pass an integer to make the
        split reproducible. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(train_X, val_X, train_Y, val_Y, vocab_size, embedding_matrix)``.
    """
    if vectors is None:
        # Legacy fallback to the hidden global so existing calls still work.
        vectors = glove_vectors

    tok = Tokenizer(lower=False)
    tok.fit_on_texts(text)
    encoded_docs = tok.texts_to_sequences(text)
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

    embedding_matrix = build_embedding_matrix(tok,vectors,embedding_size)
    # +1 because Keras word indices start at 1; index 0 is the padding value.
    vocab_size = len(tok.word_index) + 1

    le = LabelEncoder()
    labels_encoded=le.fit_transform(intent)
    ohe = OneHotEncoder(sparse=False,categories='auto')
    # OneHotEncoder expects a 2-D column, hence the reshape.
    output_one_hot = ohe.fit_transform(labels_encoded.reshape(-1, 1))

    train_X, val_X, train_Y, val_Y = train_test_split(
        padded_docs, output_one_hot,
        shuffle = True, test_size = 0.2, random_state = random_state)
    return (train_X, val_X, train_Y, val_Y, vocab_size, embedding_matrix)


def build_model(embedding_matrix,max_length, use_pre_trained_vectors=True, num_classes=21):
    """Build and compile the bidirectional-GRU intent classifier.

    Architecture: Embedding -> Bidirectional(GRU(128)) -> Dense(64, relu) ->
    Dropout(0.5) -> Dense(64, relu) -> Dropout(0.5) -> BatchNormalization ->
    Dense(num_classes, softmax). Compiled with categorical cross-entropy,
    the Adam optimizer and the accuracy metric.

    Parameters
    ----------
    embedding_matrix : numpy.ndarray or None
        Matrix of shape ``(vocab_size, embedding_size)``. Its shape fixes the
        size of the embedding layer. When ``None``, the notebook-level
        ``vocab_size`` and ``embedding_size`` are used as before.
    max_length : int
        Length of the padded input sequences.
    use_pre_trained_vectors : bool, optional
        If true, the embedding layer is initialised with ``embedding_matrix``
        and frozen; otherwise it is randomly initialised and trainable.
    num_classes : int, optional
        Number of units of the softmax output layer. Defaults to 21, the
        value that used to be hard-coded.

    Returns
    -------
    keras.models.Sequential
        The compiled model.
    """
    if embedding_matrix is not None:
        # Take the sizes from the matrix itself so the layer cannot disagree
        # with its weights and no hidden notebook state is needed.
        num_words, embedding_dim = np.shape(embedding_matrix)
    else:
        # Legacy fallback: sizes come from the notebook-level globals.
        num_words, embedding_dim = vocab_size, embedding_size

    model = Sequential()
    if use_pre_trained_vectors:
        e = Embedding(num_words, embedding_dim, weights=[embedding_matrix],
                      input_length=max_length, trainable=False)
    else:
        e = Embedding(num_words, embedding_dim, input_length=max_length)

    model.add(e)
    model.add(Bidirectional(GRU(128)))
    model.add(Dense(64, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dense(num_classes, activation = "softmax"))
    model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
    return model

In [3]:
# Read the labelled utterances (columns "intent" and "text") from the workbook.
(intent, unique_intent, text, df) = load_dataset(
    "../data/chatbot_questions.xlsx"
)

In [4]:
# Distinct, case-preserving tokens used to filter the pre-trained vectors.
vocab = get_vocab(text)

In [5]:
# Keep only the pre-trained vectors whose word is in `vocab`; the matching
# lines are also written to the subset file given as the last argument.
glove_vectors = load_word_vectors(
    "../data/pre-trained-vectors/crawl-300d-2M.vec.zip",
    vocab,
    '../data/pre-trained-vectors/crawl-300d-2M.subset_for_vocab.vec',
)

In [6]:
# Hyper-parameters shared by data preparation and the model.
embedding_size = 300   # width of each pre-trained word vector
max_length = 20        # sequences are padded to this many tokens

(train_X, val_X, train_Y, val_Y,
 vocab_size, embedding_matrix) = prepare_training_data(
    text, intent, max_length, embedding_size)

In [7]:
# Build the classifier with a frozen embedding layer initialised from the
# pre-trained vectors.
model = build_model(
    embedding_matrix,
    max_length,
    use_pre_trained_vectors=True,
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [8]:

#model.summary()

In [9]:
# Checkpoint file name template; Keras fills in {epoch} and {val_acc}.
filepath = "../data/model/pre-trained_vectors/model-epoch-{epoch:02d}-val_acc-{val_acc:.2f}.hdf5"
log_path = "../logs/pre-trained/"

# Write a checkpoint only when validation accuracy reaches a new maximum.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)
# Log training metrics for TensorBoard.
tboard_callback = TensorBoard(log_dir=log_path)

callbacks_list = [checkpoint, tboard_callback]

In [10]:
# Train for 100 epochs, validating on the held-out split after each epoch;
# the callbacks handle checkpointing and TensorBoard logging.
model.fit(
    train_X,
    train_Y,
    epochs=100,
    batch_size=32,
    validation_data=(val_X, val_Y),
    callbacks=callbacks_list,
)

Instructions for updating:
Use tf.cast instead.
Train on 890 samples, validate on 223 samples
Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.14350, saving model to ../data/model/pre-trained_vectors/model-epoch-01-val_acc-0.14.hdf5
Epoch 2/100

Epoch 00002: val_acc improved from 0.14350 to 0.32287, saving model to ../data/model/pre-trained_vectors/model-epoch-02-val_acc-0.32.hdf5
Epoch 3/100

Epoch 00003: val_acc improved from 0.32287 to 0.44395, saving model to ../data/model/pre-trained_vectors/model-epoch-03-val_acc-0.44.hdf5
Epoch 4/100

Epoch 00004: val_acc improved from 0.44395 to 0.52915, saving model to ../data/model/pre-trained_vectors/model-epoch-04-val_acc-0.53.hdf5
Epoch 5/100

Epoch 00005: val_acc improved from 0.52915 to 0.60538, saving model to ../data/model/pre-trained_vectors/model-epoch-05-val_acc-0.61.hdf5
Epoch 6/100

Epoch 00006: val_acc improved from 0.60538 to 0.67713, saving model to ../data/model/pre-trained_vectors/model-epoch-06-val_acc-0.68.hdf5
Epo


Epoch 00034: val_acc did not improve from 0.92825
Epoch 35/100

Epoch 00035: val_acc did not improve from 0.92825
Epoch 36/100

Epoch 00036: val_acc improved from 0.92825 to 0.92825, saving model to ../data/model/pre-trained_vectors/model-epoch-36-val_acc-0.93.hdf5
Epoch 37/100

Epoch 00037: val_acc did not improve from 0.92825
Epoch 38/100

Epoch 00038: val_acc did not improve from 0.92825
Epoch 39/100

Epoch 00039: val_acc did not improve from 0.92825
Epoch 40/100

Epoch 00040: val_acc improved from 0.92825 to 0.93722, saving model to ../data/model/pre-trained_vectors/model-epoch-40-val_acc-0.94.hdf5
Epoch 41/100

Epoch 00041: val_acc did not improve from 0.93722
Epoch 42/100

Epoch 00042: val_acc did not improve from 0.93722
Epoch 43/100

Epoch 00043: val_acc did not improve from 0.93722
Epoch 44/100

Epoch 00044: val_acc improved from 0.93722 to 0.93722, saving model to ../data/model/pre-trained_vectors/model-epoch-44-val_acc-0.94.hdf5
Epoch 45/100

Epoch 00045: val_acc did not im


Epoch 00075: val_acc did not improve from 0.95516
Epoch 76/100

Epoch 00076: val_acc did not improve from 0.95516
Epoch 77/100

Epoch 00077: val_acc did not improve from 0.95516
Epoch 78/100

Epoch 00078: val_acc did not improve from 0.95516
Epoch 79/100

Epoch 00079: val_acc did not improve from 0.95516
Epoch 80/100

Epoch 00080: val_acc did not improve from 0.95516
Epoch 81/100

Epoch 00081: val_acc did not improve from 0.95516
Epoch 82/100

Epoch 00082: val_acc improved from 0.95516 to 0.95516, saving model to ../data/model/pre-trained_vectors/model-epoch-82-val_acc-0.96.hdf5
Epoch 83/100

Epoch 00083: val_acc did not improve from 0.95516
Epoch 84/100

Epoch 00084: val_acc did not improve from 0.95516
Epoch 85/100

Epoch 00085: val_acc did not improve from 0.95516
Epoch 86/100

Epoch 00086: val_acc did not improve from 0.95516
Epoch 87/100

Epoch 00087: val_acc did not improve from 0.95516
Epoch 88/100

Epoch 00088: val_acc did not improve from 0.95516
Epoch 89/100

Epoch 00089: va

<keras.callbacks.History at 0x2a81bde7f60>