In [None]:
import pandas as pd
import zipfile
import io
import numpy as np

from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Bidirectional
from keras.layers import GRU
from keras.layers import Dropout
from keras.layers import BatchNormalization

from keras.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [None]:
def load_dataset(filename):
    """Load the labelled utterances from an Excel workbook.

    Parameters
    ----------
    filename : str or path-like
        Workbook readable by ``pandas.read_excel``. It must contain the
        columns ``"intent"`` and ``"text"``.

    Returns
    -------
    tuple
        ``(intent, unique_intent, text, df)`` where ``intent`` is the
        ``"intent"`` column (a Series), ``unique_intent`` is the list of
        distinct intent values in order of first appearance, ``text`` is the
        ``"text"`` column as a plain list, and ``df`` is the full DataFrame.
    """
    df = pd.read_excel(filename)
    intent = df["intent"]
    # dict.fromkeys de-duplicates while keeping first-seen order, so the list
    # is identical on every run; iterating a set of strings gives an order
    # that can change between interpreter sessions.
    unique_intent = list(dict.fromkeys(intent))
    text = list(df["text"])
    return (intent, unique_intent, text, df)


def load_word_vectors(filepath,vocab):
    """Read word vectors for the words in ``vocab`` from a zipped text file.

    Only the first member of the archive is read. Each line is expected to be
    ``<word> <v1> <v2> ...`` separated by single spaces; lines whose first
    token is not in ``vocab`` are skipped, which keeps memory proportional to
    the vocabulary rather than to the vector file.

    Parameters
    ----------
    filepath : str or path-like
        Path to the ``.zip`` archive holding the vector file.
    vocab : container of str
        Words to keep; membership is tested with ``in``.

    Returns
    -------
    dict
        Maps each retained word to a ``float32`` NumPy array. Empty when the
        archive has no members or no word matches.
    """
    data = {}
    with zipfile.ZipFile(filepath) as zfile:
        members = zfile.infolist()
        if not members:
            # Return an empty mapping rather than None so callers can still
            # call ``.get`` on the result.
            return data
        # NOTE(review): a leading "<count> <dim>" header line, if the file has
        # one, is not skipped; it is only stored if its first token happens
        # to be in ``vocab``.
        with zfile.open(members[0]) as ifile, \
                io.TextIOWrapper(ifile, encoding='utf-8') as text_stream:
            for line in text_stream:
                tokens = line.rstrip().split(' ')
                word = tokens[0]
                if word in vocab:
                    data[word] = np.asarray(tokens[1:], dtype='float32')
    return data

        
def build_embedding_matrix(tok,vectors,embedding_size):
    """Assemble the embedding weight matrix for a fitted tokenizer.

    Row ``i`` holds the vector of the word whose index in ``tok.word_index``
    is ``i``. Words missing from ``vectors`` keep an all-zero row, and one
    extra row is allocated beyond ``len(tok.word_index)`` so that the index
    values can be used directly as row numbers.

    Parameters
    ----------
    tok : object exposing ``word_index`` (a word -> int mapping)
    vectors : mapping of word -> vector, queried with ``.get``
    embedding_size : int
        Number of columns of the matrix.

    Returns
    -------
    numpy.ndarray of shape ``(len(tok.word_index) + 1, embedding_size)``
    """
    word_to_row = tok.word_index
    matrix = np.zeros((len(word_to_row) + 1, embedding_size))
    for token in word_to_row:
        vec = vectors.get(token)
        if vec is None:
            # No vector for this word: leave its row at zero.
            continue
        matrix[word_to_row[token]] = vec
    return matrix


def get_vocab(sentences):
    """Collect the set of distinct tokens appearing in ``sentences``.

    Each sentence is split with Keras' ``text_to_word_sequence`` using
    ``lower=False``, so tokens keep their original case.

    Parameters
    ----------
    sentences : iterable of str
        The texts to tokenize.

    Returns
    -------
    set of str
        Every token found in any sentence; empty if ``sentences`` is empty.
    """
    vocab = set()
    # Iterate over the argument itself; the previous version read the
    # notebook-level ``text`` variable and ignored ``sentences``.
    for sentence in sentences:
        vocab.update(text_to_word_sequence(sentence, lower=False))
    return vocab


def prepare_training_data(text,intent,max_length,embedding_size,word_vectors=None,random_state=None):
    """Turn raw texts and intent labels into padded sequences and one-hot targets.

    Steps: fit a case-preserving ``Tokenizer`` on ``text``, convert the texts
    to integer sequences padded at the end to ``max_length``, build the
    embedding matrix for the fitted vocabulary, label-encode and one-hot
    encode ``intent``, then split 80/20 into training and validation sets.

    Parameters
    ----------
    text : list of str
        Input sentences.
    intent : array-like
        One label per sentence.
    max_length : int
        Length every sequence is padded to (``maxlen`` of ``pad_sequences``).
    embedding_size : int
        Dimensionality of the word vectors.
    word_vectors : mapping of word -> vector, optional
        Pre-trained vectors used to fill the embedding matrix. When omitted,
        the notebook-level ``glove_vectors`` variable is used, as before.
    random_state : int or None, optional
        Forwarded to ``train_test_split``; pass an int for a reproducible
        split. The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(train_X, val_X, train_Y, val_Y, vocab_size, embedding_matrix)``.
    """
    if word_vectors is None:
        # Backward compatibility: existing callers rely on the global set by
        # the vector-loading cell.
        word_vectors = glove_vectors

    tok = Tokenizer(lower=False)
    tok.fit_on_texts(text)
    encoded_docs = tok.texts_to_sequences(text)
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

    embedding_matrix = build_embedding_matrix(tok, word_vectors, embedding_size)
    # +1 matches the extra row build_embedding_matrix allocates.
    vocab_size = len(tok.word_index) + 1

    le = LabelEncoder()
    labels_encoded = le.fit_transform(intent)
    # One-hot encode by indexing an identity matrix with the integer labels.
    # This yields the same dense float64 array as
    # OneHotEncoder(sparse=False, categories='auto') on label-encoded input,
    # without the `sparse` keyword that newer scikit-learn no longer accepts.
    output_one_hot = np.eye(len(le.classes_))[labels_encoded]

    train_X, val_X, train_Y, val_Y = train_test_split(
        padded_docs, output_one_hot,
        shuffle=True, test_size=0.2, random_state=random_state)
    return (train_X, val_X, train_Y, val_Y, vocab_size, embedding_matrix)


def build_model(embedding_matrix,max_length, use_pre_trained_vectors=True, num_classes=21):
    """Build and compile the bidirectional-GRU intent classifier.

    Architecture: Embedding -> Bidirectional(GRU(128)) -> Dense(64, relu) ->
    Dropout(0.5) -> Dense(64, relu) -> Dropout(0.5) -> BatchNormalization ->
    Dense(num_classes, softmax), compiled with categorical cross-entropy,
    the Adam optimizer and the accuracy metric.

    Parameters
    ----------
    embedding_matrix : numpy.ndarray or None
        Matrix of shape ``(vocab_size, embedding_size)``. Its shape sizes the
        Embedding layer; its values are loaded as frozen weights only when
        ``use_pre_trained_vectors`` is true. If ``None`` (allowed only with
        ``use_pre_trained_vectors=False``), the notebook-level ``vocab_size``
        and ``embedding_size`` variables are used instead.
    max_length : int
        Input sequence length.
    use_pre_trained_vectors : bool, optional
        ``True`` initialises the Embedding layer from ``embedding_matrix``
        with ``trainable=False``; ``False`` creates it without those
        arguments.
    num_classes : int, optional
        Number of output units of the softmax layer. Defaults to 21, the
        value that was previously hard-coded.

    Returns
    -------
    keras.models.Sequential
        The compiled model.

    Raises
    ------
    ValueError
        If ``use_pre_trained_vectors`` is true but no matrix is given.
    """
    if embedding_matrix is not None:
        # Size the layer from the matrix itself instead of notebook globals,
        # so the function does not depend on cell execution order.
        n_tokens, n_dims = np.shape(embedding_matrix)
    elif use_pre_trained_vectors:
        raise ValueError("embedding_matrix is required when use_pre_trained_vectors=True")
    else:
        # Legacy fallback: read the notebook-level variables.
        n_tokens, n_dims = vocab_size, embedding_size

    if use_pre_trained_vectors:
        embedding_layer = Embedding(n_tokens, n_dims, weights=[embedding_matrix],
                                    input_length=max_length, trainable=False)
    else:
        embedding_layer = Embedding(n_tokens, n_dims, input_length=max_length)

    model = Sequential()
    model.add(embedding_layer)
    model.add(Bidirectional(GRU(128)))
    model.add(Dense(64, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dense(num_classes, activation = "softmax"))
    model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
    return model

In [None]:
# Load the labelled questions; `text` and `intent` are consumed by the
# vocabulary and training-data cells below.
intent, unique_intent, text, df = load_dataset("../data/chatbot_questions.xlsx")

In [None]:
# Distinct tokens of the corpus; used to filter the pre-trained vector file
# so only vectors for words that actually occur are kept in memory.
vocab=get_vocab(text)

In [None]:
# Load the pre-trained vectors restricted to `vocab`.
# prepare_training_data reads this variable by name, so it must be defined
# before that function is called.
# NOTE(review): the name says GloVe but the file is crawl-300d-2M.vec —
# presumably fastText vectors; confirm and consider renaming.
glove_vectors=load_word_vectors("../data/pre-trained-vectors/crawl-300d-2M.vec.zip",vocab)

In [None]:
# Length every tokenized sentence is padded to.
max_length=20
# Dimensionality of the word vectors (the vector file name says 300d).
embedding_size = 300

# build_model reads `vocab_size` and `embedding_size` as notebook-level
# variables, so this cell must run before the model is built.
train_X, val_X, train_Y, val_Y, vocab_size, embedding_matrix = prepare_training_data(text,intent,max_length,embedding_size)

In [None]:
# use_pre_trained_vectors=False selects the build_model branch that creates
# the Embedding layer without `weights=[embedding_matrix]` / `trainable=False`,
# so `embedding_matrix` is passed here but not loaded into the layer.
model = build_model(embedding_matrix,max_length,use_pre_trained_vectors=False)

In [None]:

#model.summary()

In [None]:
# Checkpoint file names embed the epoch number and the validation accuracy.
# NOTE(review): the model is compiled with metrics=["accuracy"]; some Keras
# versions log that metric as `val_accuracy` rather than `val_acc`. If so,
# both the `monitor` argument and the `{val_acc:.2f}` placeholder need
# updating — confirm against the installed version.
filepath="../data/model/train-embedding/model-epoch-{epoch:02d}-val_acc-{val_acc:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [None]:
# Train for 200 epochs in batches of 32, evaluating on the held-out split
# after each epoch and checkpointing through `callbacks_list`.
model.fit(train_X, train_Y, epochs = 200, batch_size = 32, validation_data = (val_X, val_Y),callbacks=callbacks_list)