In [1]:
import numpy as np
import json
import os

In [2]:
# Folder layout used by every later cell:
#   rdata_path - one sub-folder per intent class holding the raw JSON files
#   data_path  - flattened one-utterance-per-line text files (and glove50.txt)
rdata_path, data_path = './raw_data', './processed_data'

In [3]:
# Intent classes are the sub-folders of rdata_path.
# os.listdir returns entries in arbitrary order and also lists stray files, while the
# one-hot label index of each class is its position in this list. Keeping only the
# directories and sorting them makes the label mapping reproducible across runs.
classes = sorted(
    name for name in os.listdir(rdata_path)
    if os.path.isdir(os.path.join(rdata_path, name))
)

In [4]:
def process_data(jsfile, clas):
    """Flatten one intent's JSON file into newline-separated utterances.

    Parameters
    ----------
    jsfile : file-like
        Open text stream holding a JSON object of the form
        ``{clas: [{"data": [{"text": ...}, ...]}, ...]}``.
    clas : str
        Top-level key under which the list of commands is stored.

    Returns
    -------
    str
        One line per command; each line is the concatenation of the ``text``
        fields of that command's chunks and ends with ``'\\n'``.
    """
    d = json.load(jsfile)
    lines = []
    for command in d[clas]:
        # A chunk without a 'text' key (or with a null value) contributes nothing
        # instead of raising TypeError on str + None.
        lines.append(''.join(dic.get('text') or '' for dic in command['data']))
    # Join once at the end instead of growing a string inside the loop.
    return ''.join(line + '\n' for line in lines)

In [5]:
# Convert every class's raw train / validation JSON into a plain-text file
# (one utterance per line) under data_path.
os.makedirs(data_path, exist_ok=True)  # open(..., 'w') fails if the folder is missing
for clas in classes:
    # The PlayMusic files are read and written as latin-1; every other class uses utf-8.
    enc = 'latin-1' if clas == 'PlayMusic' else 'utf-8'
    # (raw JSON file name, processed text file name) for each split.
    split_files = (
        ('train_' + clas + '_full.json', 'train_' + clas + '.txt'),
        ('validate_' + clas + '.json', 'validate_' + clas + '.txt'),
    )
    for json_name, txt_name in split_files:
        with open(os.path.join(rdata_path, clas, json_name), encoding=enc) as jsfile:
            text = process_data(jsfile, clas)
        with open(os.path.join(data_path, txt_name), 'w', encoding=enc) as txtfile:
            txtfile.write(text)

In [72]:
def _separate_contractions(line):
    """Strip the newline and put a space before 've, 's and n't so they become separate tokens."""
    # The original chain applied the "'s" replacement twice, which only turned
    # " 's" into "  's" (a doubled space); it is applied once here.
    return (line.replace('\n', '')
                .replace("'ve", " 've")
                .replace("'s", " 's")
                .replace("n't", " n't"))


# Read the processed text files and build parallel lists of utterances and one-hot labels.
train_txt, train_label = [], []
test_txt, test_labels = [], []
for i, clas in enumerate(classes):
    # One-hot vector whose hot position is the index of the class in `classes`.
    label = [0] * len(classes)
    label[i] = 1
    # Must match the encoding the text files were written with.
    enc = 'latin-1' if clas == 'PlayMusic' else 'utf-8'
    for split, texts, labels in (('train', train_txt, train_label),
                                 ('validate', test_txt, test_labels)):
        with open(data_path + '/' + split + '_' + clas + '.txt', encoding=enc) as txtfile:
            for line in txtfile:
                texts.append(_separate_contractions(line))
                labels.append(label)

# Keras may interpret a nested Python list as a list of separate targets;
# 2-D arrays of shape (n_samples, n_classes) are unambiguous.
train_label = np.asarray(train_label)
test_labels = np.asarray(test_labels)

In [7]:
# Whitespace-token count of every training utterance; the padding length is the
# 98th percentile of those counts, so only the longest ~2% get truncated.
ls = [len(sentence.split()) for sentence in train_txt]
maxLen = int(np.percentile(ls, 98))

In [8]:
# Parse the GloVe text file into {token: float32 vector}.
# Each line is split on whitespace: first field is the token, the rest are the coefficients.
embeddings_index = {}
with open(data_path + '/glove50.txt', encoding='utf8') as glove_file:
    for row in glove_file:
        fields = row.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Vocabulary cap for the tokenizer, and the vector size read off a known GloVe entry.
max_num_words = 40000
embedding_dim = len(embeddings_index['the'])

# The vocabulary is fitted on the training utterances only.
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(train_txt)
word_index = tokenizer.word_index

# Encode both splits as integer sequences, post-padded to maxLen.
train_sequences = pad_sequences(tokenizer.texts_to_sequences(train_txt),
                                maxlen=maxLen, padding='post')
test_sequences = pad_sequences(tokenizer.texts_to_sequences(test_txt),
                               maxlen=maxLen, padding='post')

In [10]:
# Global mean / std over all GloVe coefficients, used to initialise rows for
# words that have no pretrained vector.
# np.stack requires a real sequence; a dict_values view is rejected by current NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

(0.020940498, 0.6441043)

In [11]:
# Build the initial weights of the Embedding layer.
# The original referenced MAX_NB_WORDS / EMBEDDING_DIM, which are never defined in
# this notebook; the names set in the tokenizer cell are max_num_words / embedding_dim.
num_words = min(max_num_words, len(word_index)) + 1  # +1: the loop starts filling at row 1 or later
embedding_matrix = np.random.normal(emb_mean, emb_std, (num_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_num_words:
        # Skip rather than break so correctness does not depend on dict iteration order.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words missing from the GloVe index keep their random-normal initialisation.
        embedding_matrix[i] = embedding_vector

In [14]:
# Keras building blocks for the model cells below.
# (The last line originally ended with a stray '/', which is a SyntaxError.)
import numpy as np
from keras.models import Model, Sequential
from keras.layers import Dense, Input, Dropout, LSTM, Activation, Bidirectional, GlobalMaxPool1D, GlobalAvgPool1D
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
import keras.backend as K
from keras.layers import Flatten
from keras.layers import Reshape, Dropout, Concatenate
from keras.layers import Conv2D, MaxPool2D

In [22]:
# Stacked recurrent classifier: trainable embeddings initialised from embedding_matrix,
# a bidirectional LSTM returning the full sequence, a second LSTM that returns one
# vector, then a small dense head with one softmax unit per class.
model = Sequential([
    Embedding(num_words, embedding_dim, trainable=True, weights=[embedding_matrix]),
    Dropout(0.2),
    Bidirectional(
        LSTM(128, return_sequences=True, recurrent_dropout=0.1, dropout=0.1),
        'concat',
    ),
    Dropout(0.3),
    LSTM(128, return_sequences=False, recurrent_dropout=0.1, dropout=0.1),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(len(classes), activation='softmax'),
])

In [23]:
# Print the layer-by-layer table (layer type, output shape, parameter count).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 50)          573150    
_________________________________________________________________
dropout_5 (Dropout)          (None, None, 50)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 256)         183296    
_________________________________________________________________
dropout_6 (Dropout)          (None, None, 256)         0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dropout_7 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
__________

In [24]:
# First training configuration: Adam optimiser, categorical cross-entropy on the
# one-hot labels, accuracy reported as 'acc'.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [25]:
# Train for 16 epochs, validating on the held-out split after each epoch.
# Labels are converted to 2-D arrays (a nested Python list can be misread as a list of
# separate targets) and validation_data is passed as the documented (x, y) tuple.
model.fit(train_sequences, np.asarray(train_label), epochs=16,
          batch_size=64, shuffle=True,
          validation_data=(test_sequences, np.asarray(test_labels)))

Train on 13931 samples, validate on 701 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<keras.callbacks.History at 0x233b19d0eb8>

In [26]:
# Re-compile the same `model` object with plain SGD instead of Adam.
# NOTE(review): presumably meant to continue training from the weights learned above - confirm.
model.compile(
    optimizer='sgd',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [27]:
# Second training run with the SGD configuration (same data and schedule as the first).
# Labels are converted to 2-D arrays (a nested Python list can be misread as a list of
# separate targets) and validation_data is passed as the documented (x, y) tuple.
model.fit(train_sequences, np.asarray(train_label), epochs=16,
          batch_size=64, shuffle=True,
          validation_data=(test_sequences, np.asarray(test_labels)))

Train on 13931 samples, validate on 701 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16

KeyboardInterrupt: 

In [39]:
# Model outputs for every validation utterance; argmax over axis 1 is taken later
# to get the predicted class index.
test_preds = model.predict(
    test_sequences
)

In [42]:
# Reverse lookup of the tokenizer vocabulary: integer id -> word.
index_word = dict(
    (token_id, token) for token, token_id in word_index.items()
)

In [54]:
# Row indices of the validation utterances whose predicted class differs from the label.
predicted_class = np.argmax(test_preds, 1)
true_class = np.argmax(test_labels, 1)
false_preds = np.nonzero(np.not_equal(predicted_class, true_class))[0]

In [73]:
# Show every misclassified validation utterance with its true and predicted class names.
report_line = 'The command is: {}, The label is:{}, The prediction is:{}\n'
for idx in false_preds:
    true_name = classes[np.argmax(test_labels[idx])]
    predicted_name = classes[np.argmax(test_preds[idx])]
    print(report_line.format(test_txt[idx], true_name, predicted_name))

The command is: When is sunrise for AR, The label is:GetWeather, The prediction is:SearchScreeningEvent

The command is: Where is Belgium located, The label is:GetWeather, The prediction is:BookRestaurant

The command is: Live In L.aJoseph Meyer please, The label is:PlayMusic, The prediction is:SearchCreativeWork

The command is: Where can I see The Prime Ministers: The Pioneers, The label is:SearchScreeningEvent, The prediction is:SearchCreativeWork

The command is: I want to see Medal for the General, The label is:SearchScreeningEvent, The prediction is:SearchCreativeWork

The command is: I want to see Shattered Image., The label is:SearchScreeningEvent, The prediction is:SearchCreativeWork

The command is: I want to see Outcast., The label is:SearchScreeningEvent, The prediction is:SearchCreativeWork

