In [26]:
### 
# DEPENDENCIES 
###
import json

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from keras.layers import Dense, Input, LSTM, Embedding, Dropout,SpatialDropout1D, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
# The plotting code refers to pyplot as `plt`; the `pltb` alias is kept for any existing users.
from matplotlib import pyplot as plt
from matplotlib import pyplot as pltb
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report
%matplotlib notebook  
### 
# GLOBALS 
###
# Path of the BioASQ training JSON file handed to json_to_df (a file path, despite the _DIR suffix).
TRAINING_DATA_DIR="./datasets/BioASQ-trainingDataset6b.json"
# Fraction of the samples, taken from the front of X / y, that forms the training set;
# the remaining tail is used as the test set.
TRAIN_SPLIT = 0.9
# Word-vector dimensionality: passed as `size` to Word2Vec and as the Embedding output width.
W2V_SIZE=50
# Passed as Word2Vec's `sg` argument.
W2V_SKIP_GRAM=1
### 
# FUNCTIONS 
###
def parse_questions_types(data):
    """Separate question records into parallel body and type sequences.

    Every record found by iterating ``data['questions']`` is read through its
    ``'body'`` and ``'type'`` keys.

    Returns a ``zip`` iterator that yields two tuples: first all bodies, then
    all types, both in record order. It yields nothing when there are no
    records.
    """
    body_type_pairs = [(record['body'], record['type']) for record in data['questions']]
    # Transpose the list of (body, type) pairs into (bodies, types).
    return zip(*body_type_pairs)

def label_to_class(str_labels, label):
    """Return the integer class of ``label``: the position of its first
    occurrence in ``str_labels``.

    Raises ``ValueError`` (from ``index``) when ``label`` is not present.
    """
    class_id = str_labels.index(label)
    return class_id

def json_to_df(json_file_path):
    """Read a JSON file and wrap the parsed content in a DataFrame.

    Parameters
    ----------
    json_file_path : str or path-like
        Location of the JSON document to load.

    Returns
    -------
    pandas.DataFrame
        ``pd.DataFrame`` constructed directly from the object returned by
        ``json.load``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of inheriting the platform's locale
    # encoding, so non-ASCII question text loads the same way everywhere.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        return pd.DataFrame(json.load(f))
    
def tokenize(text):
    """Split ``text`` into lower-cased word tokens.

    The text is passed to ``word_tokenize`` and every resulting token is
    lower-cased; no token is filtered out.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lower-cased tokens in the order ``word_tokenize`` produced them.
    """
    # The part-of-speech tags were never used, so tagging is skipped: it
    # yielded the same words in the same order at a much higher cost.
    return [token.lower() for token in word_tokenize(text)]

def build_vocab_idx(tokenized_q):
    """Assign a consecutive integer id to every distinct token.

    ``tokenized_q`` is an iterable of token sequences. Ids start at 0 and are
    handed out in order of first appearance.

    Returns a dict mapping token -> id.
    """
    word_to_id = {}
    for tokens in tokenized_q:
        for token in tokens:
            # len() is evaluated before any insertion, so an unseen token
            # receives the next free id and a known token keeps its own.
            word_to_id.setdefault(token, len(word_to_id))
    return word_to_id

def one_hot_encode(q_types):
    """One-hot encode a sequence of labels.

    The columns follow the order of ``np.unique(q_types)``. Each row is a
    float vector with a single 1 in the column of that sample's label.

    Returns a NumPy array with one row per element of ``q_types``.
    """
    classes = list(np.unique(q_types))
    # Row k of the identity matrix is the one-hot vector of class k.
    identity = np.eye(len(classes))
    return np.array([identity[classes.index(label)] for label in q_types])

In [36]:
# --- Load, tokenize and index the questions -------------------------------
[questions, q_types] = parse_questions_types(json_to_df(TRAINING_DATA_DIR))
tokenized_q = [tokenize(q) for q in questions]
vocab_idx = build_vocab_idx(tokenized_q)
vector_model = Word2Vec(tokenized_q, min_count=0, sg=W2V_SKIP_GRAM, size=W2V_SIZE)
# The embedding matrix is indexed by vocab_idx ids, so size it from vocab_idx.
max_nb_words = len(vocab_idx)
max_seq_length = max([len(q) for q in tokenized_q])

# pad_sequences fills with 0 and vocab_idx also starts at 0. Shift every real
# token id up by one so that id 0 means "padding" only; otherwise padding is
# indistinguishable from the first vocabulary word.
PAD_OFFSET = 1
sequences = [[vocab_idx[t] + PAD_OFFSET for t in q] for q in tokenized_q]

# --- Features, labels and the train/test split ----------------------------
X = pad_sequences(sequences, maxlen=max_seq_length, padding="pre", truncating="post")
y = one_hot_encode(q_types)
n_classes = y.shape[1]

# NOTE(review): the split is positional (first TRAIN_SPLIT of the rows), and
# validation_split below also takes the tail of the training rows. This
# presumes the dataset is not ordered by question type -- confirm, or shuffle
# with a fixed seed before splitting.
train_q_idx = round(len(X)*TRAIN_SPLIT)
X_train, y_train = X[:train_q_idx], y[:train_q_idx]
X_test, y_test = X[train_q_idx:], y[train_q_idx:]

# --- Embedding matrix -----------------------------------------------------
# Row 0 is the all-zero padding vector; row (id + PAD_OFFSET) holds the
# Word2Vec vector of the word with that vocab_idx id.
wv_matrix = np.zeros((max_nb_words + PAD_OFFSET, W2V_SIZE))

for word, i in vocab_idx.items():
    wv_matrix[i + PAD_OFFSET] = vector_model.wv[word]

wv_layer = Embedding(max_nb_words + PAD_OFFSET,
                     W2V_SIZE,
                     mask_zero=False,
                     weights=[wv_matrix],
                     input_length=max_seq_length,
                     trainable=False)

# --- Model ----------------------------------------------------------------
# Inputs
sequence_input = Input(shape=(max_seq_length,), dtype='int32')
embedded_sequences = wv_layer(sequence_input)

# Bidirectional LSTM encoder
embedded_sequences = SpatialDropout1D(0.2)(embedded_sequences)
x = Bidirectional(LSTM(64, return_sequences=False))(embedded_sequences)

x = Dense(64, activation='relu')(x)

# Output: every question has exactly one type (y is one-hot and predictions
# are read with argmax), so use softmax with categorical cross-entropy rather
# than independent sigmoids. The width follows the number of classes in y.
x = BatchNormalization()(x)
preds = Dense(n_classes, activation='softmax')(x)

# build the model
model = Model(inputs=[sequence_input], outputs=preds)
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=0.001, clipnorm=.25, beta_1=0.7, beta_2=0.99),
              metrics=[])

hist = model.fit(X_train, y_train, validation_split=0.2, epochs=30, batch_size=256, shuffle=True)

# --- Training curves ------------------------------------------------------
history = pd.DataFrame(hist.history)
fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(history["loss"], 'r', label="loss")
ax.plot(history["val_loss"], 'b', label="val_loss")
ax.set_title("Loss with pretrained word vectors")
ax.set_xlabel("epoch")
ax.set_ylabel("loss")
ax.legend()
plt.show();

# --- Evaluation on the held-out tail --------------------------------------
y_hat = model.predict(X_test)
# Columns of y follow np.unique(q_types). Passing labels explicitly keeps the
# report aligned with target_names even if a class is absent from the test slice.
class_names = np.unique(q_types)
print(classification_report(np.argmax(y_test, axis=1),
                            np.argmax(y_hat, axis=1),
                            labels=np.arange(len(class_names)),
                            target_names=class_names))

ValueError: Layer spatial_dropout1d_11 was called with an input that isn't a symbolic tensor. Received type: <class 'numpy.ndarray'>. Full input: [array([[ 3.37759525e-01,  1.36463165e-01,  9.89217460e-02, ...,
        -4.89831604e-02, -1.76981464e-01,  1.16860926e-01],
       [ 1.36913821e-01,  6.65977672e-02,  6.44709319e-02, ...,
        -4.51426916e-02, -8.68188888e-02, -1.86762027e-02],
       [ 3.18445861e-01,  1.35832369e-01,  1.40888259e-01, ...,
        -7.52645209e-02, -1.98622227e-01,  1.64264441e-02],
       ...,
       [ 3.92016806e-02,  1.47319147e-02,  9.53727867e-03, ...,
        -6.55975984e-03, -2.37124078e-02,  2.07531339e-04],
       [ 4.59352396e-02,  1.40201114e-02,  2.28720028e-02, ...,
        -4.26497497e-03, -2.94809937e-02, -5.46028232e-03],
       [ 2.08940078e-02,  8.19153339e-03,  4.78963973e-03, ...,
        -7.11355032e-03, -1.76139902e-02, -7.87751470e-03]])]. All inputs to the layer should be tensors.