In [1]:
import io
import os
import zipfile

import numpy as np
import pandas as pd

from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Bidirectional
from keras.layers import GRU
from keras.layers import Dropout
from keras.layers import BatchNormalization

from keras.callbacks import ModelCheckpoint, TensorBoard


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

Using TensorFlow backend.


In [2]:
def load_dataset(filename):
    """Read the labelled sentences from an Excel workbook.

    The sheet must contain an ``intent`` column (class label) and a ``text``
    column (sentence).

    Args:
        filename: Path of the workbook, passed straight to ``pd.read_excel``.

    Returns:
        A 4-tuple ``(intent, unique_intent, text, df)``:
        the ``intent`` column as a Series, a list of its distinct values
        (in arbitrary order), the ``text`` column as a plain list, and the
        complete DataFrame.
    """
    frame = pd.read_excel(filename)
    labels = frame["intent"]
    # A set removes duplicates; the resulting order is not meaningful.
    distinct_labels = [*set(labels)]
    sentences = [*frame["text"]]
    return (labels, distinct_labels, sentences, frame)


def load_word_vectors(filepath,vocab,outfile):
    """Extract the pre-trained vectors of the words in ``vocab`` from a zip archive.

    Every member of the archive is read as UTF-8 text with one
    space-separated ``word v1 v2 ...`` entry per line. Lines whose word is in
    ``vocab`` are copied unchanged to ``outfile`` (so the subset can be reused
    without re-reading the full archive) and parsed into a vector.

    Args:
        filepath: Path of the zip archive holding the vector file(s).
        vocab: Container of words to keep; only ``word in vocab`` is used.
        outfile: Path of the text file that receives the matching lines.
            It is overwritten and written as UTF-8.

    Returns:
        dict mapping each matched word to a ``float32`` numpy array. Matches
        from all archive members are merged; the dict is empty when the
        archive has no members or nothing matches.
    """
    # Created once, outside the member loop, so the result covers every
    # member and is defined even for an empty archive.
    data = {}
    with zipfile.ZipFile(filepath) as zfile, \
            open(outfile, "w", encoding='utf-8') as sub_vector:
        for finfo in zfile.infolist():
            # Both the raw member stream and its text wrapper are closed
            # as soon as the member has been consumed.
            with zfile.open(finfo) as ifile, \
                    io.TextIOWrapper(ifile, encoding='utf-8') as textStream:
                for line_number, line in enumerate(textStream):
                    tokens = line.rstrip().split(' ')
                    # fastText ``.vec`` files start with a "<count> <dim>"
                    # header; never treat it as a (1-dimensional) word vector.
                    if (line_number == 0 and len(tokens) == 2
                            and all(token.isdigit() for token in tokens)):
                        continue
                    word = tokens[0]
                    if word in vocab:
                        sub_vector.write(line)
                        data[word] = np.asarray(tokens[1:], dtype='float32')
    return data

        
def build_embedding_matrix(tok,vectors,embedding_size):
    """Assemble the embedding weight matrix for a fitted tokenizer.

    Args:
        tok: Object exposing ``word_index``, a mapping of word -> integer index.
        vectors: Mapping of word -> embedding vector (looked up with ``.get``).
        embedding_size: Number of columns of the matrix.

    Returns:
        A ``(len(tok.word_index) + 1, embedding_size)`` numpy array. Row ``i``
        holds the vector of the word whose index is ``i``; rows of words that
        have no vector (and row 0, which no word in this loop writes unless
        its index is 0) stay all-zero.
    """
    row_count = len(tok.word_index) + 1
    matrix = np.zeros((row_count, embedding_size))
    for token, row in tok.word_index.items():
        found = vectors.get(token)
        if found is None:
            # No pre-trained vector for this word: keep the zero row.
            continue
        matrix[row] = found
    return matrix


def get_vocab(sentences):
    """Collect the set of distinct words that occur in ``sentences``.

    Each sentence is split with Keras' ``text_to_word_sequence`` using
    ``lower=False``, so words that differ only by case are kept apart.

    Args:
        sentences: Iterable of strings to tokenize.

    Returns:
        set of every word produced by the tokenizer; empty for empty input.
    """
    vocab = set()
    # Iterate over the argument itself rather than the notebook-level `text`
    # variable, so the function works for any corpus it is given.
    for sentence in sentences:
        vocab.update(text_to_word_sequence(sentence, lower=False))
    return vocab


def prepare_training_data(text,intent,max_length,embedding_size,vectors=None,random_state=None):
    """Turn raw sentences and labels into padded index sequences and one-hot targets.

    Args:
        text: List of sentences.
        intent: Class label of each sentence (same order as ``text``); must
            support ``reshape`` after label encoding, i.e. anything
            ``LabelEncoder.fit_transform`` accepts.
        max_length: Length every encoded sentence is padded (at the end) or
            truncated to.
        embedding_size: Dimension of the pre-trained word vectors.
        vectors: Mapping word -> vector used to fill the embedding matrix.
            When omitted, the notebook-level ``glove_vectors`` variable is
            used, which keeps existing four-argument calls working.
        random_state: Seed forwarded to ``train_test_split``; ``None`` (the
            default) keeps the split unseeded as before.

    Returns:
        ``(train_X, val_X, train_Y, val_Y, vocab_size, embedding_matrix)``
        with an 80/20 shuffled train/validation split.
    """
    if vectors is None:
        # Backward compatibility: earlier callers relied on this global
        # instead of passing the vectors in explicitly.
        vectors = glove_vectors
    # Case is preserved so the word index matches the case-sensitive vectors.
    tok = Tokenizer(lower=False)
    tok.fit_on_texts(text)
    encoded_docs = tok.texts_to_sequences(text)
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    embedding_matrix = build_embedding_matrix(tok,vectors,embedding_size)
    # +1 because Tokenizer indices start at 1; index 0 is the padding value.
    vocab_size = len(tok.word_index) + 1
    le = LabelEncoder()
    labels_encoded=le.fit_transform(intent)
    ohe = OneHotEncoder(sparse=False,categories='auto')
    output_one_hot = ohe.fit_transform(labels_encoded.reshape(-1, 1))
    train_X, val_X, train_Y, val_Y = train_test_split(
        padded_docs, output_one_hot, shuffle = True, test_size = 0.2,
        random_state = random_state)
    return (train_X, val_X, train_Y, val_Y, vocab_size, embedding_matrix)


def build_model(embedding_matrix,max_length, label_count, use_pre_trained_vectors=True):
    """Build and compile the bidirectional-GRU sentence classifier.

    Architecture: Embedding -> Bidirectional(GRU(256)) -> Dense(64, relu) ->
    Dropout(0.5) -> Dense(64, relu) -> Dropout(0.5) -> BatchNormalization ->
    Dense(label_count, softmax), compiled with categorical cross-entropy,
    the Adam optimizer and the accuracy metric.

    Args:
        embedding_matrix: 2-D array of shape ``(vocab_size, embedding_size)``.
            Its shape sizes the embedding layer; its values initialise the
            layer only when ``use_pre_trained_vectors`` is true.
        max_length: Length of the (padded) input sequences.
        label_count: Number of output classes.
        use_pre_trained_vectors: If true, the embedding layer is loaded with
            ``embedding_matrix`` and frozen; otherwise it is trained from
            scratch.

    Returns:
        The compiled ``Sequential`` model.
    """
    if embedding_matrix is not None:
        # Size the layer from the matrix that was actually passed in, so the
        # function no longer depends on notebook-level variables.
        input_dim, output_dim = np.shape(embedding_matrix)
    else:
        # Legacy fallback for callers that pass no matrix and rely on the
        # notebook globals `vocab_size` / `embedding_size`.
        input_dim, output_dim = vocab_size, embedding_size

    model = Sequential()
    if use_pre_trained_vectors:
        e = Embedding(input_dim, output_dim, weights=[embedding_matrix],
                      input_length=max_length, trainable=False)
    else:
        e = Embedding(input_dim, output_dim, input_length=max_length)

    model.add(e)
    model.add(Bidirectional(GRU(256)))
    model.add(Dense(64, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dense(label_count, activation = "softmax"))
    model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
    return model

In [3]:
# Load labels, distinct labels, sentences and the raw frame from the workbook.
intent, unique_intent, text, df = load_dataset(
    "./data/cogcomp_org_Experimental_Data_for_Question_Classification.xlsx"
)

In [4]:
# Distinct words of the corpus; used to filter the pre-trained vectors.
vocab = get_vocab(text)

In [5]:
# Number of distinct intents; later passed to build_model as the output size.
label_count = len(unique_intent)

In [6]:
# Keep only the vectors of words in `vocab`; the subset is also written to disk.
glove_vectors = load_word_vectors(
    "./data/pre-trained-vectors/crawl-300d-2M-subword.zip",
    vocab,
    './data/pre-trained-vectors/crawl-300d-2M.subset_for_vocab.vec',
)

In [7]:
# Sequence length fed to the network and dimension of the word vectors.
max_length = 20
embedding_size = 300

(
    train_X,
    val_X,
    train_Y,
    val_Y,
    vocab_size,
    embedding_matrix,
) = prepare_training_data(text, intent, max_length, embedding_size)

In [8]:
# Classifier with a frozen embedding layer initialised from the pre-trained vectors.
model = build_model(
    embedding_matrix,
    max_length,
    label_count,
    use_pre_trained_vectors=True,
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [9]:

#model.summary()

In [10]:
# Checkpoint files are named after the epoch and the validation accuracy reached.
filepath="./data/model/pre-trained_vectors/model-epoch-{epoch:02d}-val_acc-{val_acc:.2f}.hdf5"
log_path="./logs/pre-trained/"
# The checkpoint callback may not create missing directories itself, so make
# sure the target folder exists before training starts (no-op if it does).
os.makedirs(os.path.dirname(filepath), exist_ok=True)
tboard_callback=TensorBoard(log_dir=log_path)
# Only write a checkpoint when validation accuracy beats the best value so far.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint,tboard_callback]

In [None]:
# Train with checkpointing and TensorBoard logging on the held-out validation split.
model.fit(
    train_X,
    train_Y,
    epochs=500,
    batch_size=32,
    validation_data=(val_X, val_Y),
    callbacks=callbacks_list,
)

Instructions for updating:
Use tf.cast instead.
Train on 4361 samples, validate on 1091 samples
Epoch 1/500

Epoch 00001: val_acc improved from -inf to 0.38313, saving model to ./data/model/pre-trained_vectors/model-epoch-01-val_acc-0.38.hdf5
Epoch 2/500

Epoch 00002: val_acc improved from 0.38313 to 0.45280, saving model to ./data/model/pre-trained_vectors/model-epoch-02-val_acc-0.45.hdf5
Epoch 3/500

Epoch 00003: val_acc improved from 0.45280 to 0.46471, saving model to ./data/model/pre-trained_vectors/model-epoch-03-val_acc-0.46.hdf5
Epoch 4/500

Epoch 00004: val_acc improved from 0.46471 to 0.51512, saving model to ./data/model/pre-trained_vectors/model-epoch-04-val_acc-0.52.hdf5
Epoch 5/500

Epoch 00005: val_acc did not improve from 0.51512
Epoch 6/500

Epoch 00006: val_acc improved from 0.51512 to 0.53712, saving model to ./data/model/pre-trained_vectors/model-epoch-06-val_acc-0.54.hdf5
Epoch 7/500

Epoch 00007: val_acc improved from 0.53712 to 0.56095, saving model to ./data/mod


Epoch 00034: val_acc did not improve from 0.75435
Epoch 35/500

Epoch 00035: val_acc did not improve from 0.75435
Epoch 36/500

Epoch 00036: val_acc improved from 0.75435 to 0.77635, saving model to ./data/model/pre-trained_vectors/model-epoch-36-val_acc-0.78.hdf5
Epoch 37/500

Epoch 00037: val_acc did not improve from 0.77635
Epoch 38/500

Epoch 00038: val_acc did not improve from 0.77635
Epoch 39/500

Epoch 00039: val_acc did not improve from 0.77635
Epoch 40/500

Epoch 00040: val_acc improved from 0.77635 to 0.78185, saving model to ./data/model/pre-trained_vectors/model-epoch-40-val_acc-0.78.hdf5
Epoch 41/500

Epoch 00041: val_acc did not improve from 0.78185
Epoch 42/500

Epoch 00042: val_acc improved from 0.78185 to 0.78735, saving model to ./data/model/pre-trained_vectors/model-epoch-42-val_acc-0.79.hdf5
Epoch 43/500

Epoch 00043: val_acc did not improve from 0.78735
Epoch 44/500

Epoch 00044: val_acc did not improve from 0.78735
Epoch 45/500

Epoch 00045: val_acc did not impro