In [1]:
import pandas as pd
import zipfile
import io
import numpy as np

from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Bidirectional
from keras.layers import GRU
from keras.layers import Dropout
from keras.layers import BatchNormalization

from keras.callbacks import ModelCheckpoint, TensorBoard


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

Using TensorFlow backend.


In [2]:
def load_dataset(filename):
    """Read the labelled spreadsheet at ``filename`` with ``pd.read_excel``.

    The sheet must provide an ``intent`` column and a ``text`` column.

    Returns:
        A 4-tuple ``(intent, unique_intent, text, df)``:
        the ``intent`` column as pandas returns it, a list of its distinct
        values (built through a ``set``, so the order is arbitrary), the
        ``text`` column as a plain list, and the full frame.
    """
    frame = pd.read_excel(filename)
    labels = frame["intent"]
    return (
        labels,
        list(set(labels)),      # distinct labels, unordered
        list(frame["text"]),
        frame,
    )


def load_word_vectors(filepath,vocab,outfile):
    """Extract the pre-trained vectors for the words in ``vocab`` from a zip archive.

    Every member of the archive at ``filepath`` is read as UTF-8 text, one
    entry per line: the word followed by its vector components, separated by
    single spaces. Lines whose first token is in ``vocab`` are copied
    verbatim to ``outfile`` and parsed into the returned mapping.

    A leading "count dimension" header line, if the file has one, is not
    treated specially: it is kept only if its first token is in ``vocab``.

    Args:
        filepath: Path of the zip archive holding the vector file(s).
        vocab: Container of words to keep (only tested with ``in``).
        outfile: Path of the text file receiving the matching lines. It is
            overwritten and written as UTF-8, matching the input encoding.

    Returns:
        dict mapping each kept word to a ``float32`` numpy array. Vectors
        from all archive members are accumulated (a later duplicate
        overrides an earlier one); the dict is empty when the archive has
        no members or no word matches.
    """
    # Created once, outside the member loop, so the result covers every
    # member and is defined even for an empty archive.
    data = {}
    with zipfile.ZipFile(filepath) as zfile, \
            open(outfile, "w", encoding="utf-8") as sub_vector:
        for finfo in zfile.infolist():
            # Closing the text wrapper also closes the underlying member stream.
            with io.TextIOWrapper(zfile.open(finfo), encoding='utf-8') as text_stream:
                for line in text_stream:
                    tokens = line.rstrip().split(' ')
                    word = tokens[0]
                    if word in vocab:
                        # Stream the match straight to disk instead of buffering it.
                        sub_vector.write(line)
                        data[word] = np.asarray(tokens[1:], dtype='float32')
    return data

        
def build_embedding_matrix(tok,vectors,embedding_size):
    """Assemble the embedding weight matrix for a fitted tokenizer.

    Args:
        tok: Object exposing ``word_index``, a mapping of word -> integer row.
        vectors: Mapping of word -> vector, queried with ``.get``.
        embedding_size: Number of columns of the matrix.

    Returns:
        Array of shape ``(len(tok.word_index) + 1, embedding_size)``. Row
        ``i`` holds the vector of the word indexed ``i``; rows of words
        without a vector stay all-zero.
    """
    index_of = tok.word_index
    # One row more than there are words — presumably because the tokenizer
    # numbers words from 1 and leaves index 0 for padding (confirm).
    matrix = np.zeros((len(index_of) + 1, embedding_size))
    for token, row in index_of.items():
        vec = vectors.get(token)
        if vec is None:
            continue  # no pre-trained vector: keep the zero row
        matrix[row] = vec
    return matrix


def get_vocab(sentences):
    """Collect the set of distinct tokens occurring in ``sentences``.

    Each sentence is split with ``text_to_word_sequence(..., lower=False)``,
    so the original casing of the words is preserved.

    Args:
        sentences: Iterable of strings.

    Returns:
        set of every token produced for any sentence; empty for empty input.
    """
    vocab = set()
    # Iterate the argument itself: the previous version read the
    # notebook-level `text` variable and silently ignored `sentences`.
    for sentence in sentences:
        vocab.update(text_to_word_sequence(sentence, lower=False))
    return vocab


def prepare_training_data(text,intent,max_length,embedding_size,vectors=None,random_state=None):
    """Turn raw texts and labels into padded sequences, one-hot targets and an embedding matrix.

    Args:
        text: Sequence of input strings.
        intent: Labels aligned with ``text``.
        max_length: ``maxlen`` passed to ``pad_sequences`` (post-padding).
        embedding_size: Number of columns of the embedding matrix.
        vectors: Mapping of word -> pre-trained vector used to fill the
            embedding matrix. When omitted, the notebook-level
            ``glove_vectors`` is used, which keeps existing four-argument
            calls working.
        random_state: Forwarded to ``train_test_split``; pass an int for a
            reproducible split. ``None`` keeps the previous unseeded behaviour.

    Returns:
        ``(train_X, val_X, train_Y, val_Y, vocab_size, embedding_matrix)``
        where the X arrays are padded token-id sequences, the Y arrays are
        one-hot label rows, the split is 80/20, and ``vocab_size`` is
        ``len(word_index) + 1``.
    """
    if vectors is None:
        # Backward-compatible fallback for callers relying on the global.
        vectors = glove_vectors

    # Case is preserved (lower=False) so tokens can match case-sensitive vectors.
    tok = Tokenizer(lower=False)
    tok.fit_on_texts(text)
    encoded_docs = tok.texts_to_sequences(text)
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

    embedding_matrix = build_embedding_matrix(tok, vectors, embedding_size)
    vocab_size = len(tok.word_index) + 1

    # Labels -> integer ids -> one-hot rows.
    le = LabelEncoder()
    labels_encoded = le.fit_transform(intent)
    # NOTE(review): newer scikit-learn releases rename `sparse` to
    # `sparse_output` — confirm against the installed version.
    ohe = OneHotEncoder(sparse=False,categories='auto')
    output_one_hot = ohe.fit_transform(labels_encoded.reshape(-1, 1))

    train_X, val_X, train_Y, val_Y = train_test_split(
        padded_docs, output_one_hot,
        shuffle=True, test_size=0.2, random_state=random_state)
    return (train_X, val_X, train_Y, val_Y, vocab_size, embedding_matrix)


def build_model(embedding_matrix,max_length, label_count, use_pre_trained_vectors=True):
    """Build and compile the bidirectional-GRU classifier.

    Architecture: Embedding -> Bidirectional(GRU(256)) -> Dense(64, relu) ->
    Dropout(0.5) -> Dense(64, relu) -> Dropout(0.5) -> BatchNormalization ->
    Dense(label_count, softmax), compiled with categorical cross-entropy,
    the Adam optimizer and the accuracy metric.

    Args:
        embedding_matrix: 2-D array of shape (vocabulary size, embedding
            size). Its shape sizes the embedding layer; its values are used
            as frozen weights when ``use_pre_trained_vectors`` is true.
        max_length: ``input_length`` of the embedding layer.
        label_count: Number of units of the softmax output layer.
        use_pre_trained_vectors: When true, initialise the embedding from
            ``embedding_matrix`` and keep it non-trainable; otherwise the
            embedding is created without initial weights and is trainable.

    Returns:
        The compiled ``Sequential`` model.
    """
    if embedding_matrix is not None:
        # Size the layer from the weights themselves, so the model cannot
        # drift out of sync with notebook-level variables.
        n_tokens, n_dims = np.shape(embedding_matrix)
    else:
        # Fallback for callers that pass no matrix: previous behaviour,
        # reading the notebook-level `vocab_size` / `embedding_size`.
        n_tokens, n_dims = vocab_size, embedding_size

    model = Sequential()
    if use_pre_trained_vectors:
        e = Embedding(n_tokens, n_dims, weights=[embedding_matrix],
                      input_length=max_length, trainable=False)
    else:
        e = Embedding(n_tokens, n_dims, input_length=max_length)

    model.add(e)
    model.add(Bidirectional(GRU(256)))
    model.add(Dense(64, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dense(label_count, activation = "softmax"))
    model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
    return model

In [3]:
# Load the labelled questions: the label column, its distinct values, the texts as a list, and the raw frame.
intent, unique_intent, text, df = load_dataset("./data/cogcomp_org_Experimental_Data_for_Question_Classification.xlsx")

In [4]:
# Distinct tokens of the dataset; used below to filter the pre-trained vectors.
vocab=get_vocab(text)

In [5]:
# Number of distinct intent labels; later passed to build_model as the size of the softmax output layer.
label_count=len(unique_intent)

In [6]:
# Keep only the pre-trained vectors whose word is in `vocab`; the matching lines are also written to the subset .vec file.
glove_vectors=load_word_vectors("./data/pre-trained-vectors/crawl-300d-2M-subword.zip",vocab,'./data/pre-trained-vectors/crawl-300d-2M.subset_for_vocab.vec')

In [7]:
# Sequence length handed to pad_sequences as `maxlen` and to the embedding layer as `input_length`.
max_length=20
# Width of each embedding row; presumably matches the 300-d pre-trained vectors loaded above — confirm.
embedding_size = 300

# Tokenise, pad and one-hot encode the data, build the embedding matrix, and split 80/20 into train/validation.
train_X, val_X, train_Y, val_Y, vocab_size, embedding_matrix = prepare_training_data(text,intent,max_length,embedding_size)

In [8]:
# Build the classifier with the embedding layer initialised from (and frozen to) the pre-trained matrix.
model = build_model(embedding_matrix,max_length,label_count,use_pre_trained_vectors=True)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [9]:

#model.summary()

In [10]:
# Checkpoint file name template; {epoch} and {val_acc} are filled in per saved epoch.
# NOTE(review): the target directory presumably has to exist before training starts — confirm.
filepath="./data/model/pre-trained_vectors/model-epoch-{epoch:02d}-val_acc-{val_acc:.2f}.hdf5"
# Directory receiving the TensorBoard event logs.
log_path="./logs/pre-trained/"
tboard_callback=TensorBoard(log_dir=log_path)
# Save a checkpoint only when the monitored validation accuracy reaches a new maximum.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint,tboard_callback]

In [11]:
# Train for a fixed 500 epochs, evaluating on the validation split each epoch; checkpointing and TensorBoard logging run via the callbacks.
model.fit(train_X, train_Y, epochs = 500, batch_size = 32, validation_data = (val_X, val_Y),callbacks=callbacks_list)

Instructions for updating:
Use tf.cast instead.
Train on 4361 samples, validate on 1091 samples
Epoch 1/500

Epoch 00001: val_acc improved from -inf to 0.38313, saving model to ./data/model/pre-trained_vectors/model-epoch-01-val_acc-0.38.hdf5
Epoch 2/500

Epoch 00002: val_acc improved from 0.38313 to 0.45280, saving model to ./data/model/pre-trained_vectors/model-epoch-02-val_acc-0.45.hdf5
Epoch 3/500

Epoch 00003: val_acc improved from 0.45280 to 0.46471, saving model to ./data/model/pre-trained_vectors/model-epoch-03-val_acc-0.46.hdf5
Epoch 4/500

Epoch 00004: val_acc improved from 0.46471 to 0.51512, saving model to ./data/model/pre-trained_vectors/model-epoch-04-val_acc-0.52.hdf5
Epoch 5/500

Epoch 00005: val_acc did not improve from 0.51512
Epoch 6/500

Epoch 00006: val_acc improved from 0.51512 to 0.53712, saving model to ./data/model/pre-trained_vectors/model-epoch-06-val_acc-0.54.hdf5
Epoch 7/500

Epoch 00007: val_acc improved from 0.53712 to 0.56095, saving model to ./data/mod


Epoch 00034: val_acc did not improve from 0.75435
Epoch 35/500

Epoch 00035: val_acc did not improve from 0.75435
Epoch 36/500

Epoch 00036: val_acc improved from 0.75435 to 0.77635, saving model to ./data/model/pre-trained_vectors/model-epoch-36-val_acc-0.78.hdf5
Epoch 37/500

Epoch 00037: val_acc did not improve from 0.77635
Epoch 38/500

Epoch 00038: val_acc did not improve from 0.77635
Epoch 39/500

Epoch 00039: val_acc did not improve from 0.77635
Epoch 40/500

Epoch 00040: val_acc improved from 0.77635 to 0.78185, saving model to ./data/model/pre-trained_vectors/model-epoch-40-val_acc-0.78.hdf5
Epoch 41/500

Epoch 00041: val_acc did not improve from 0.78185
Epoch 42/500

Epoch 00042: val_acc improved from 0.78185 to 0.78735, saving model to ./data/model/pre-trained_vectors/model-epoch-42-val_acc-0.79.hdf5
Epoch 43/500

Epoch 00043: val_acc did not improve from 0.78735
Epoch 44/500

Epoch 00044: val_acc did not improve from 0.78735
Epoch 45/500

Epoch 00045: val_acc did not impro


Epoch 00075: val_acc did not improve from 0.80293
Epoch 76/500

Epoch 00076: val_acc did not improve from 0.80293
Epoch 77/500

Epoch 00077: val_acc did not improve from 0.80293
Epoch 78/500

Epoch 00078: val_acc did not improve from 0.80293
Epoch 79/500

Epoch 00079: val_acc did not improve from 0.80293
Epoch 80/500

Epoch 00080: val_acc did not improve from 0.80293
Epoch 81/500

Epoch 00081: val_acc improved from 0.80293 to 0.80477, saving model to ./data/model/pre-trained_vectors/model-epoch-81-val_acc-0.80.hdf5
Epoch 82/500

Epoch 00082: val_acc did not improve from 0.80477
Epoch 83/500

Epoch 00083: val_acc did not improve from 0.80477
Epoch 84/500

Epoch 00084: val_acc did not improve from 0.80477
Epoch 85/500

Epoch 00085: val_acc did not improve from 0.80477
Epoch 86/500

Epoch 00086: val_acc did not improve from 0.80477
Epoch 87/500

Epoch 00087: val_acc did not improve from 0.80477
Epoch 88/500

Epoch 00088: val_acc did not improve from 0.80477
Epoch 89/500

Epoch 00089: val


Epoch 00117: val_acc did not improve from 0.81760
Epoch 118/500

Epoch 00118: val_acc did not improve from 0.81760
Epoch 119/500

Epoch 00119: val_acc did not improve from 0.81760
Epoch 120/500

Epoch 00120: val_acc did not improve from 0.81760
Epoch 121/500

Epoch 00121: val_acc did not improve from 0.81760
Epoch 122/500

Epoch 00122: val_acc did not improve from 0.81760
Epoch 123/500

Epoch 00123: val_acc improved from 0.81760 to 0.82218, saving model to ./data/model/pre-trained_vectors/model-epoch-123-val_acc-0.82.hdf5
Epoch 124/500

Epoch 00124: val_acc did not improve from 0.82218
Epoch 125/500

Epoch 00125: val_acc did not improve from 0.82218
Epoch 126/500

Epoch 00126: val_acc did not improve from 0.82218
Epoch 127/500

Epoch 00127: val_acc did not improve from 0.82218
Epoch 128/500

Epoch 00128: val_acc did not improve from 0.82218
Epoch 129/500

Epoch 00129: val_acc did not improve from 0.82218
Epoch 130/500

Epoch 00130: val_acc did not improve from 0.82218
Epoch 131/500

E


Epoch 00160: val_acc did not improve from 0.82218
Epoch 161/500

Epoch 00161: val_acc did not improve from 0.82218
Epoch 162/500

Epoch 00162: val_acc did not improve from 0.82218
Epoch 163/500

Epoch 00163: val_acc did not improve from 0.82218
Epoch 164/500

Epoch 00164: val_acc did not improve from 0.82218
Epoch 165/500

Epoch 00165: val_acc did not improve from 0.82218
Epoch 166/500

Epoch 00166: val_acc did not improve from 0.82218
Epoch 167/500

Epoch 00167: val_acc did not improve from 0.82218
Epoch 168/500

Epoch 00168: val_acc did not improve from 0.82218
Epoch 169/500

Epoch 00169: val_acc did not improve from 0.82218
Epoch 170/500

Epoch 00170: val_acc did not improve from 0.82218
Epoch 171/500

Epoch 00171: val_acc did not improve from 0.82218
Epoch 172/500

Epoch 00172: val_acc did not improve from 0.82218
Epoch 173/500

Epoch 00173: val_acc did not improve from 0.82218
Epoch 174/500

Epoch 00174: val_acc did not improve from 0.82218
Epoch 175/500

Epoch 00175: val_acc did


Epoch 00203: val_acc did not improve from 0.82218
Epoch 204/500

Epoch 00204: val_acc improved from 0.82218 to 0.82218, saving model to ./data/model/pre-trained_vectors/model-epoch-204-val_acc-0.82.hdf5
Epoch 205/500

Epoch 00205: val_acc did not improve from 0.82218
Epoch 206/500

Epoch 00206: val_acc did not improve from 0.82218
Epoch 207/500

Epoch 00207: val_acc did not improve from 0.82218
Epoch 208/500

Epoch 00208: val_acc improved from 0.82218 to 0.82310, saving model to ./data/model/pre-trained_vectors/model-epoch-208-val_acc-0.82.hdf5
Epoch 209/500

Epoch 00209: val_acc did not improve from 0.82310
Epoch 210/500

Epoch 00210: val_acc did not improve from 0.82310
Epoch 211/500

Epoch 00211: val_acc did not improve from 0.82310
Epoch 212/500

Epoch 00212: val_acc did not improve from 0.82310
Epoch 213/500

Epoch 00213: val_acc did not improve from 0.82310
Epoch 214/500

Epoch 00214: val_acc did not improve from 0.82310
Epoch 215/500

Epoch 00215: val_acc did not improve from 0


Epoch 00246: val_acc did not improve from 0.82493
Epoch 247/500

Epoch 00247: val_acc did not improve from 0.82493
Epoch 248/500

Epoch 00248: val_acc did not improve from 0.82493
Epoch 249/500

Epoch 00249: val_acc did not improve from 0.82493
Epoch 250/500

Epoch 00250: val_acc did not improve from 0.82493
Epoch 251/500

Epoch 00251: val_acc did not improve from 0.82493
Epoch 252/500

Epoch 00252: val_acc did not improve from 0.82493
Epoch 253/500

Epoch 00253: val_acc did not improve from 0.82493
Epoch 254/500

Epoch 00254: val_acc did not improve from 0.82493
Epoch 255/500

Epoch 00255: val_acc did not improve from 0.82493
Epoch 256/500

Epoch 00256: val_acc did not improve from 0.82493
Epoch 257/500

Epoch 00257: val_acc did not improve from 0.82493
Epoch 258/500

Epoch 00258: val_acc did not improve from 0.82493
Epoch 259/500

Epoch 00259: val_acc did not improve from 0.82493
Epoch 260/500

Epoch 00260: val_acc did not improve from 0.82493
Epoch 261/500

Epoch 00261: val_acc did


Epoch 00289: val_acc did not improve from 0.82493
Epoch 290/500

Epoch 00290: val_acc did not improve from 0.82493
Epoch 291/500

Epoch 00291: val_acc did not improve from 0.82493
Epoch 292/500

Epoch 00292: val_acc did not improve from 0.82493
Epoch 293/500

Epoch 00293: val_acc did not improve from 0.82493
Epoch 294/500

Epoch 00294: val_acc did not improve from 0.82493
Epoch 295/500

Epoch 00295: val_acc improved from 0.82493 to 0.83043, saving model to ./data/model/pre-trained_vectors/model-epoch-295-val_acc-0.83.hdf5
Epoch 296/500

Epoch 00296: val_acc did not improve from 0.83043
Epoch 297/500

Epoch 00297: val_acc did not improve from 0.83043
Epoch 298/500

Epoch 00298: val_acc did not improve from 0.83043
Epoch 299/500

Epoch 00299: val_acc did not improve from 0.83043
Epoch 300/500

Epoch 00300: val_acc did not improve from 0.83043
Epoch 301/500

Epoch 00301: val_acc did not improve from 0.83043
Epoch 302/500

Epoch 00302: val_acc did not improve from 0.83043
Epoch 303/500

E


Epoch 00333: val_acc did not improve from 0.83043
Epoch 334/500

Epoch 00334: val_acc did not improve from 0.83043
Epoch 335/500

Epoch 00335: val_acc did not improve from 0.83043
Epoch 336/500

Epoch 00336: val_acc did not improve from 0.83043
Epoch 337/500

Epoch 00337: val_acc did not improve from 0.83043
Epoch 338/500

Epoch 00338: val_acc did not improve from 0.83043
Epoch 339/500

Epoch 00339: val_acc did not improve from 0.83043
Epoch 340/500

Epoch 00340: val_acc did not improve from 0.83043
Epoch 341/500

Epoch 00341: val_acc did not improve from 0.83043
Epoch 342/500

Epoch 00342: val_acc did not improve from 0.83043
Epoch 343/500

Epoch 00343: val_acc did not improve from 0.83043
Epoch 344/500

Epoch 00344: val_acc did not improve from 0.83043
Epoch 345/500

Epoch 00345: val_acc did not improve from 0.83043
Epoch 346/500

Epoch 00346: val_acc did not improve from 0.83043
Epoch 347/500

Epoch 00347: val_acc did not improve from 0.83043
Epoch 348/500

Epoch 00348: val_acc did


Epoch 00376: val_acc did not improve from 0.83043
Epoch 377/500

Epoch 00377: val_acc did not improve from 0.83043
Epoch 378/500

Epoch 00378: val_acc did not improve from 0.83043
Epoch 379/500

Epoch 00379: val_acc did not improve from 0.83043
Epoch 380/500

Epoch 00380: val_acc did not improve from 0.83043
Epoch 381/500

Epoch 00381: val_acc did not improve from 0.83043
Epoch 382/500

Epoch 00382: val_acc did not improve from 0.83043
Epoch 383/500

Epoch 00383: val_acc did not improve from 0.83043
Epoch 384/500

Epoch 00384: val_acc improved from 0.83043 to 0.83135, saving model to ./data/model/pre-trained_vectors/model-epoch-384-val_acc-0.83.hdf5
Epoch 385/500

Epoch 00385: val_acc did not improve from 0.83135
Epoch 386/500

Epoch 00386: val_acc did not improve from 0.83135
Epoch 387/500

Epoch 00387: val_acc improved from 0.83135 to 0.83226, saving model to ./data/model/pre-trained_vectors/model-epoch-387-val_acc-0.83.hdf5
Epoch 388/500

Epoch 00388: val_acc did not improve from 0


Epoch 00419: val_acc did not improve from 0.83410
Epoch 420/500

Epoch 00420: val_acc did not improve from 0.83410
Epoch 421/500

Epoch 00421: val_acc did not improve from 0.83410
Epoch 422/500

Epoch 00422: val_acc did not improve from 0.83410
Epoch 423/500

Epoch 00423: val_acc did not improve from 0.83410
Epoch 424/500

Epoch 00424: val_acc did not improve from 0.83410
Epoch 425/500

Epoch 00425: val_acc did not improve from 0.83410
Epoch 426/500

Epoch 00426: val_acc did not improve from 0.83410
Epoch 427/500

Epoch 00427: val_acc did not improve from 0.83410
Epoch 428/500

Epoch 00428: val_acc did not improve from 0.83410
Epoch 429/500

Epoch 00429: val_acc did not improve from 0.83410
Epoch 430/500

Epoch 00430: val_acc did not improve from 0.83410
Epoch 431/500

Epoch 00431: val_acc improved from 0.83410 to 0.83593, saving model to ./data/model/pre-trained_vectors/model-epoch-431-val_acc-0.84.hdf5
Epoch 432/500

Epoch 00432: val_acc did not improve from 0.83593
Epoch 433/500

E


Epoch 00461: val_acc did not improve from 0.83868
Epoch 462/500

Epoch 00462: val_acc did not improve from 0.83868
Epoch 463/500

Epoch 00463: val_acc did not improve from 0.83868
Epoch 464/500

Epoch 00464: val_acc did not improve from 0.83868
Epoch 465/500

Epoch 00465: val_acc did not improve from 0.83868
Epoch 466/500

Epoch 00466: val_acc did not improve from 0.83868
Epoch 467/500

Epoch 00467: val_acc did not improve from 0.83868
Epoch 468/500

Epoch 00468: val_acc did not improve from 0.83868
Epoch 469/500

Epoch 00469: val_acc did not improve from 0.83868
Epoch 470/500

Epoch 00470: val_acc did not improve from 0.83868
Epoch 471/500

Epoch 00471: val_acc did not improve from 0.83868
Epoch 472/500

Epoch 00472: val_acc did not improve from 0.83868
Epoch 473/500

Epoch 00473: val_acc did not improve from 0.83868
Epoch 474/500

Epoch 00474: val_acc did not improve from 0.83868
Epoch 475/500

Epoch 00475: val_acc did not improve from 0.83868
Epoch 476/500

Epoch 00476: val_acc did

<keras.callbacks.History at 0x22b6602f240>