In [1]:
import pandas as pd
import zipfile
import io
import numpy as np

from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Bidirectional
from keras.layers import GRU
from keras.layers import Dropout
from keras.layers import BatchNormalization

from keras.callbacks import ModelCheckpoint, TensorBoard


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

Using TensorFlow backend.


In [2]:
def load_dataset(filename):
    """Read the labelled questions spreadsheet.

    Parameters
    ----------
    filename : path of an Excel workbook containing at least the columns
        ``"intent"`` and ``"text"``.

    Returns
    -------
    tuple ``(intent, unique_intent, text, df)`` where ``intent`` is the
    ``"intent"`` column, ``unique_intent`` is a list of its distinct values
    (in arbitrary set order), ``text`` is the ``"text"`` column as a list and
    ``df`` is the full DataFrame.
    """
    frame = pd.read_excel(filename)
    labels = frame["intent"]
    sentences = list(frame["text"])
    distinct_labels = list(set(labels))
    return (labels, distinct_labels, sentences, frame)


def load_word_vectors(filepath,vocab):
    """Load pre-trained word vectors for the words in ``vocab`` from a zip archive.

    The first regular file in the archive is read as UTF-8 text in which every
    line is ``<word> <v1> <v2> ...`` separated by single spaces. An optional
    leading header line consisting of exactly two integers (as written by the
    word2vec/fastText ``.vec`` format: ``<count> <dim>``) is skipped so it is
    never mistaken for a word.

    Parameters
    ----------
    filepath : path or file-like object accepted by ``zipfile.ZipFile``.
    vocab : container of words to keep (membership is tested with ``in``).

    Returns
    -------
    dict mapping each kept word to a ``float32`` NumPy vector. An archive
    without any regular file yields an empty dict.
    """
    with zipfile.ZipFile(filepath) as zfile:
        for finfo in zfile.infolist():
            # Directory entries hold no data; reading one would return an
            # empty result before the real vector file is ever reached.
            if finfo.is_dir():
                continue
            data = {}
            # Context managers make sure the member stream is closed.
            with zfile.open(finfo) as ifile, \
                    io.TextIOWrapper(ifile, encoding='utf-8') as text_stream:
                for line_no, line in enumerate(text_stream):
                    tokens = line.rstrip().split(' ')
                    # Skip the "<count> <dim>" header: otherwise a numeric
                    # token in ``vocab`` would receive a bogus 1-element vector.
                    if (line_no == 0 and len(tokens) == 2
                            and all(t.isdigit() for t in tokens)):
                        continue
                    word = tokens[0]
                    if word in vocab:
                        data[word] = np.asarray(tokens[1:], dtype='float32')
            # Only the first regular file is used, as before.
            return data
    # No regular file in the archive.
    return {}

        
def build_embedding_matrix(tok,vectors,embedding_size):
    """Assemble the embedding weight matrix for a fitted tokenizer.

    Row ``i`` holds the vector of the word whose index in ``tok.word_index``
    is ``i``. Rows for words missing from ``vectors`` (and row 0, which no
    word index maps to here) stay all-zero.

    Parameters
    ----------
    tok : object exposing a ``word_index`` dict (word -> integer index).
    vectors : dict-like mapping word -> vector of length ``embedding_size``.
    embedding_size : number of columns of the matrix.

    Returns
    -------
    NumPy array of shape ``(len(tok.word_index) + 1, embedding_size)``.
    """
    word_to_row = tok.word_index
    matrix = np.zeros((len(word_to_row) + 1, embedding_size))
    for token, row in word_to_row.items():
        vec = vectors.get(token)
        if vec is None:
            # No pre-trained vector for this word: keep the zero row.
            continue
        matrix[row] = vec
    return matrix


def get_vocab(sentences):
    """Return the set of distinct tokens found in ``sentences``.

    Each sentence is split with Keras' ``text_to_word_sequence`` using
    ``lower=False``, so the original casing of the words is preserved.

    Parameters
    ----------
    sentences : iterable of strings.

    Returns
    -------
    set of str; empty when ``sentences`` is empty.
    """
    vocab = set()
    # Iterate over the argument itself; the previous version read the
    # notebook-level variable `text` and silently ignored `sentences`.
    for sentence in sentences:
        vocab.update(text_to_word_sequence(sentence, lower=False))
    return vocab


def prepare_training_data(text,intent,max_length,embedding_size,word_vectors=None,random_state=None):
    """Turn raw sentences and intent labels into train/validation arrays.

    Steps: fit a case-preserving ``Tokenizer`` on ``text``, convert every
    sentence to a sequence of word indices post-padded to ``max_length``,
    build the embedding matrix for the tokenizer's vocabulary, one-hot encode
    the labels and split everything 80/20 with shuffling.

    Parameters
    ----------
    text : list of sentences.
    intent : labels aligned with ``text``.
    max_length : ``maxlen`` passed to ``pad_sequences``.
    embedding_size : number of columns of the embedding matrix.
    word_vectors : optional dict word -> vector. When omitted, the
        notebook-level ``glove_vectors`` is used, which is what this function
        always did before the parameter existed.
    random_state : optional seed forwarded to ``train_test_split`` so the
        split can be reproduced. ``None`` keeps the unseeded behaviour.

    Returns
    -------
    tuple ``(train_X, val_X, train_Y, val_Y, vocab_size, embedding_matrix)``.
    """
    if word_vectors is None:
        # Backward-compatible fallback to the global defined in another cell.
        word_vectors = glove_vectors

    tok = Tokenizer(lower=False)
    tok.fit_on_texts(text)
    encoded_docs = tok.texts_to_sequences(text)
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    embedding_matrix = build_embedding_matrix(tok,word_vectors,embedding_size)
    vocab_size = len(tok.word_index) + 1

    le = LabelEncoder()
    labels_encoded = le.fit_transform(intent)
    # Dense one-hot rows in label-index order. Indexing an identity matrix
    # gives the same float64 array as OneHotEncoder(sparse=False) on these
    # 0..k-1 labels, without relying on the `sparse` keyword, which newer
    # scikit-learn releases renamed to `sparse_output`.
    output_one_hot = np.eye(len(le.classes_))[labels_encoded]

    train_X, val_X, train_Y, val_Y = train_test_split(
        padded_docs, output_one_hot,
        shuffle = True, test_size = 0.2, random_state = random_state)
    return (train_X, val_X, train_Y, val_Y, vocab_size, embedding_matrix)


def build_model(embedding_matrix,max_length, use_pre_trained_vectors=True, num_classes=21):
    """Build and compile the bidirectional-GRU intent classifier.

    Architecture: Embedding -> Bidirectional(GRU(128)) -> Dense(64, relu) ->
    Dropout(0.5) -> Dense(64, relu) -> Dropout(0.5) -> BatchNormalization ->
    Dense(num_classes, softmax), compiled with categorical cross-entropy,
    the Adam optimizer and the accuracy metric.

    Parameters
    ----------
    embedding_matrix : 2-D array ``(vocab_size, embedding_size)``. Its shape
        sizes the Embedding layer; its values are used as frozen weights when
        ``use_pre_trained_vectors`` is true. If ``None``, the sizes are read
        from the notebook-level ``vocab_size`` / ``embedding_size`` as before.
    max_length : ``input_length`` of the Embedding layer.
    use_pre_trained_vectors : when true, initialise the Embedding layer from
        ``embedding_matrix`` and keep it non-trainable; otherwise learn the
        embeddings from scratch.
    num_classes : width of the softmax output layer (defaults to the
        previously hard-coded 21).

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    if embedding_matrix is not None:
        # Size the layer from the matrix itself instead of hidden globals,
        # so the two can never disagree.
        n_tokens, n_dims = embedding_matrix.shape
    else:
        # Legacy path: rely on the values defined in an earlier cell.
        n_tokens, n_dims = vocab_size, embedding_size

    model = Sequential()
    if use_pre_trained_vectors:
        e = Embedding(n_tokens, n_dims, weights=[embedding_matrix], 
                      input_length=max_length, trainable=False)
    else:
        e = Embedding(n_tokens, n_dims, input_length=max_length)
    
    model.add(e)
    model.add(Bidirectional(GRU(128)))
    model.add(Dense(64, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dense(num_classes, activation = "softmax"))
    model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
    return model

In [3]:
# Read the labelled questions: the intent column, its distinct values,
# the question texts as a list, and the full DataFrame.
intent, unique_intent, text, df = load_dataset("../data/chatbot_questions.xlsx")

In [4]:
# Case-preserving set of tokens in the questions; passed to
# load_word_vectors below to keep only the vectors that are needed.
vocab=get_vocab(text)

In [5]:
# Load the pre-trained vectors restricted to the words in `vocab`.
# NOTE(review): the variable is called `glove_vectors`, but the archive name
# (crawl-300d-2M.vec) looks like a fastText file — confirm and consider renaming.
glove_vectors=load_word_vectors("../data/pre-trained-vectors/crawl-300d-2M.vec.zip",vocab)

In [6]:
# Sequences are padded to max_length tokens by prepare_training_data.
max_length=20
# Number of columns of the embedding matrix; presumably must equal the
# dimensionality of the vectors loaded above (file name suggests 300) — verify.
embedding_size = 300

# Tokenise, pad, one-hot encode the labels and split 80/20 into train/validation.
train_X, val_X, train_Y, val_Y, vocab_size, embedding_matrix = prepare_training_data(text,intent,max_length,embedding_size)

In [15]:
# Build the classifier with the Embedding layer initialised from
# `embedding_matrix` and frozen (use_pre_trained_vectors=True).
model = build_model(embedding_matrix,max_length,use_pre_trained_vectors=True)

In [16]:

#model.summary()

In [17]:
# Destinations for TensorBoard event files and for model checkpoints; the
# checkpoint file name is filled in with the epoch number and val_acc.
log_path = "../logs/pre-trained/"
filepath = "../data/model/pre-trained_vectors/model-epoch-{epoch:02d}-val_acc-{val_acc:.2f}.hdf5"

tboard_callback = TensorBoard(log_dir=log_path)
# Save a checkpoint only when validation accuracy reaches a new maximum.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint, tboard_callback]

In [18]:
# Train for 100 epochs in batches of 32, evaluating on the validation split
# after every epoch; the callbacks checkpoint the best model and log to TensorBoard.
model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y),callbacks=callbacks_list)

Train on 890 samples, validate on 223 samples
Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.26457, saving model to ../data/model/pre-trained_vectors/model-epoch-01-val_acc-0.26.hdf5
Epoch 2/100

Epoch 00002: val_acc improved from 0.26457 to 0.47982, saving model to ../data/model/pre-trained_vectors/model-epoch-02-val_acc-0.48.hdf5
Epoch 3/100

Epoch 00003: val_acc improved from 0.47982 to 0.56054, saving model to ../data/model/pre-trained_vectors/model-epoch-03-val_acc-0.56.hdf5
Epoch 4/100

Epoch 00004: val_acc improved from 0.56054 to 0.64574, saving model to ../data/model/pre-trained_vectors/model-epoch-04-val_acc-0.65.hdf5
Epoch 5/100

Epoch 00005: val_acc improved from 0.64574 to 0.68161, saving model to ../data/model/pre-trained_vectors/model-epoch-05-val_acc-0.68.hdf5
Epoch 6/100

Epoch 00006: val_acc improved from 0.68161 to 0.78027, saving model to ../data/model/pre-trained_vectors/model-epoch-06-val_acc-0.78.hdf5
Epoch 7/100

Epoch 00007: val_acc improved from 0.7


Epoch 00037: val_acc did not improve from 0.92377
Epoch 38/100

Epoch 00038: val_acc did not improve from 0.92377
Epoch 39/100

Epoch 00039: val_acc did not improve from 0.92377
Epoch 40/100

Epoch 00040: val_acc improved from 0.92377 to 0.93722, saving model to ../data/model/pre-trained_vectors/model-epoch-40-val_acc-0.94.hdf5
Epoch 41/100

Epoch 00041: val_acc did not improve from 0.93722
Epoch 42/100

Epoch 00042: val_acc did not improve from 0.93722
Epoch 43/100

Epoch 00043: val_acc did not improve from 0.93722
Epoch 44/100

Epoch 00044: val_acc did not improve from 0.93722
Epoch 45/100

Epoch 00045: val_acc did not improve from 0.93722
Epoch 46/100

Epoch 00046: val_acc did not improve from 0.93722
Epoch 47/100

Epoch 00047: val_acc did not improve from 0.93722
Epoch 48/100

Epoch 00048: val_acc did not improve from 0.93722
Epoch 49/100

Epoch 00049: val_acc improved from 0.93722 to 0.94170, saving model to ../data/model/pre-trained_vectors/model-epoch-49-val_acc-0.94.hdf5
Epoch


Epoch 00080: val_acc improved from 0.95067 to 0.95964, saving model to ../data/model/pre-trained_vectors/model-epoch-80-val_acc-0.96.hdf5
Epoch 81/100

Epoch 00081: val_acc did not improve from 0.95964
Epoch 82/100

Epoch 00082: val_acc did not improve from 0.95964
Epoch 83/100

Epoch 00083: val_acc did not improve from 0.95964
Epoch 84/100

Epoch 00084: val_acc did not improve from 0.95964
Epoch 85/100

Epoch 00085: val_acc did not improve from 0.95964
Epoch 86/100

Epoch 00086: val_acc did not improve from 0.95964
Epoch 87/100

Epoch 00087: val_acc did not improve from 0.95964
Epoch 88/100

Epoch 00088: val_acc did not improve from 0.95964
Epoch 89/100

Epoch 00089: val_acc did not improve from 0.95964
Epoch 90/100

Epoch 00090: val_acc did not improve from 0.95964
Epoch 91/100

Epoch 00091: val_acc did not improve from 0.95964
Epoch 92/100

Epoch 00092: val_acc did not improve from 0.95964
Epoch 93/100

Epoch 00093: val_acc did not improve from 0.95964
Epoch 94/100

Epoch 00094: va

<keras.callbacks.History at 0x27d326e9588>