# qa-dense-autoencoder.py
# -*- coding: utf-8 -*-
from __future__ import division, print_function
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.cross_validation import train_test_split
import numpy as np
import os
import kaggle
# Directory holding the input files; the weight and bias files are written here too.
DATA_DIR = "../data/comp_data"
# Question/answer training file; read for sequence lengths and the vocabulary.
QA_TRAIN_FILE = "8thGr-NDMC-Train.csv"
# Story corpus the autoencoder is trained on.
STORY_FILE = "studystack_qa_cleaner_no_qm.txt"
# Output file names (inside DATA_DIR) for the encoding layer's weights and bias.
STORY_WEIGHTS = "dense-story-weights.txt"
STORY_BIAS = "dense-story-bias.txt"
# Number of units in the hidden (encoding) Dense layer.
EMBED_SIZE = 64
# Mini-batch size and number of epochs passed to fit().
BATCH_SIZE = 256
NBR_EPOCHS = 20
# Load the story corpus; its longest entry is one candidate for the common
# sequence length.
story_path = os.path.join(DATA_DIR, STORY_FILE)
stories = kaggle.get_stories(story_path)
story_maxlen = max(len(story) for story in stories)

# The question/answer pairs are not trained on in this script. They supply
# the longest question and answer lengths, and are also handed to
# build_vocab() below.
qa_path = os.path.join(DATA_DIR, QA_TRAIN_FILE)
qapairs = kaggle.get_question_answer_pairs(qa_path)
question_maxlen = max(len(pair[0]) for pair in qapairs)
answer_maxlen = max(len(pair[1]) for pair in qapairs)

# One sequence length that fits every story, question and answer.
seq_maxlen = max(story_maxlen, question_maxlen, answer_maxlen)

word2idx = kaggle.build_vocab(stories, qapairs, [])
vocab_size = len(word2idx)  # not referenced again below

# Vectorize the stories to seq_maxlen and hold out 30% of them for validation
# (fixed seed so the split is reproducible).
# NOTE(review): train_test_split is imported from sklearn.cross_validation,
# which newer scikit-learn releases replaced with sklearn.model_selection.
Xs = kaggle.vectorize_stories(stories, word2idx, seq_maxlen)
Xstrain, Xstest = train_test_split(Xs, test_size=0.3, random_state=42)
print(Xstrain.shape, Xstest.shape)
# Dense autoencoder with one hidden layer:
# seq_maxlen inputs -> EMBED_SIZE units (relu) -> seq_maxlen outputs (sigmoid).
signal = Input(shape=(seq_maxlen,))
encoder_layer = Dense(EMBED_SIZE, init="glorot_uniform", activation="relu")
encoded = encoder_layer(signal)
decoder_layer = Dense(seq_maxlen, init="glorot_uniform", activation="sigmoid")
decoded = decoder_layer(encoded)
autoencoder = Model(input=signal, output=decoded)

# NOTE(review): a sigmoid output with binary cross-entropy presumes targets in
# [0, 1] -- confirm that is what kaggle.vectorize_stories() produces.
autoencoder.compile(optimizer="adadelta", loss="binary_crossentropy")

# The input doubles as the target: the network learns to reconstruct each
# vectorized story, and the held-out stories are used for validation.
autoencoder.fit(
    Xstrain, Xstrain,
    batch_size=BATCH_SIZE,
    nb_epoch=NBR_EPOCHS,
    shuffle=True,
    validation_data=(Xstest, Xstest),
)
# Persist the parameters of the encoding layer (autoencoder.layers[1]); its
# weight matrix is the embedding that maps seq_maxlen inputs to EMBED_SIZE.
encoder_params = autoencoder.layers[1].get_weights()
weight_matrix, bias_vector = encoder_params
for out_name, params in ((STORY_WEIGHTS, weight_matrix),
                         (STORY_BIAS, bias_vector)):
    # Each array goes to its own plain-text file under DATA_DIR.
    np.savetxt(os.path.join(DATA_DIR, out_name), params)