In [293]:
import import_ipynb
from model import *

In [294]:
def padding(max_len, x):
    """Truncate or pad ``x`` along its first axis to exactly ``max_len`` entries.

    Parameters
    ----------
    max_len : int
        Required length of the first axis of the result.
    x : array-like
        Input with at least one dimension. Anything ``np.asarray`` accepts
        (ndarray, list of rows, ...) is allowed.

    Returns
    -------
    numpy.ndarray
        * If ``x`` already has ``max_len`` or more entries, the first
          ``max_len`` entries of ``x``.
        * If ``x`` is 1-D, ``x`` followed by zeros.
        * If ``x`` has two or more dimensions, ``x`` followed by rows filled
          with ``-1`` whose trailing shape matches ``x.shape[1:]``.
    """
    x = np.asarray(x)
    x_len = x.shape[0]
    if x_len >= max_len:
        return x[:max_len]

    missing = max_len - x_len
    if x.ndim > 1:
        # Multi-dimensional inputs are padded with -1 rows; using the full
        # trailing shape (not just shape[1]) keeps rank >= 3 inputs working.
        filler = np.full((missing,) + x.shape[1:], -1.0)
    else:
        # 1-D inputs are padded with zeros.
        filler = np.zeros(missing)
    return np.concatenate((x, filler))
        
def reshape_mask(max_len, x, dim=50):
    """Pad a mask to ``max_len`` entries and repeat each entry ``dim`` times.

    The mask is first passed through ``padding(max_len, x)``; every entry of
    the padded result is then repeated ``dim`` times so the mask gains a
    trailing axis of width ``dim``.

    Parameters
    ----------
    max_len : int
        Number of entries in the padded mask.
    x : numpy.ndarray
        Mask to pad and tile.
    dim : int, optional
        Number of repetitions per mask entry. Defaults to 50, the value that
        was previously hard-coded here.

    Returns
    -------
    list of list
        ``max_len`` inner lists, each holding ``dim`` copies of one padded
        mask entry.
    """
    padded_question_mask = padding(max_len, x)
    return [[qm] * dim for qm in padded_question_mask]
            
# Smoke-check `padding` on one sample's correct triplets and show the resulting shape.
# NOTE(review): `data` is not defined by any cell visible above this one, so this
# line relies on leftover kernel state — confirm where `data` comes from.
c = padding(500, data[4]["correct_triplets"])
print(c.shape)

(500, 3)


In [295]:
# Feature widths used when declaring the model's input shapes.
# Last-axis width of the question input.
dimOfQuestionVector = 300
# Last-axis width of the triplet input; prepare_data concatenates three mean
# vectors (s, p, o) per triplet — presumably 3 x 300, TODO confirm.
dimOfTripletVector = 900

In [309]:
# create inputs for model
import glob
import os
import json
import re

def clean(input):
    """Keep only ASCII letters in ``input`` and normalise whitespace.

    Every character outside ``a-z`` / ``A-Z`` is replaced by a space; the
    result is then split on whitespace and re-joined with single spaces, so
    the returned string has no leading, trailing or repeated spaces.

    Parameters
    ----------
    input : str
        Text to clean. (The name shadows the ``input`` builtin; it is kept
        so existing keyword callers continue to work.)

    Returns
    -------
    str
        The cleaned text; an empty string if ``input`` contains no letters.
    """
    # The previous debugging ``print(input)`` was removed: it wrote one line
    # to the cell output for every string cleaned.
    return ' '.join(re.sub(r'[^a-zA-Z]', ' ', input).split())

def clean_list(input):
    """Apply ``clean`` to every item of every triplet in ``input``.

    Parameters
    ----------
    input : iterable of iterables of str
        Each inner iterable is treated as one triplet of strings.

    Returns
    -------
    list of numpy.ndarray
        One array per triplet, holding the cleaned strings in their
        original order.
    """
    return [
        np.asarray([clean(part) for part in triplet])
        for triplet in input
    ]

def _pool_triplets(triplets):
    """Mean-pool the three parts of each triplet and concatenate them.

    For every triplet ``tr`` the parts ``tr[0]``, ``tr[1]`` and ``tr[2]`` are
    averaged over axis 0 and the three means are joined into one vector.
    Returns ``np.asarray`` of the per-triplet vectors.
    """
    pooled = [
        np.concatenate([np.mean(part, axis=0) for part in (tr[0], tr[1], tr[2])])
        for tr in triplets
    ]
    return np.asarray(pooled)


def prepare_data(data, question_max_words=60, triplets_max_numbers=500):
    """Turn raw samples into aligned model inputs and binary labels.

    Each element of ``data`` is read through the keys ``"question"``,
    ``"question_mask"``, ``"correct_triplets"`` and ``"wrong_triplets"`` and
    yields two training rows: the question paired with its correct triplets
    (label 1), then the same question paired with its wrong triplets (label 0).

    Parameters
    ----------
    data : iterable of mapping
        Samples exposing the four keys listed above.
        # assumes each triplet part is a (num_tokens, 300) array — TODO confirm
    question_max_words : int, optional
        Length the question and its mask are padded/truncated to.
    triplets_max_numbers : int, optional
        Number of triplet rows each triplet matrix is padded/truncated to.

    Returns
    -------
    tuple of four lists
        ``(question_vectors, question_masks, triplet_vectors, Y)``, all of the
        same length (two entries per input sample).
    """
    question_vectors, question_masks, triplet_vectors, Y = [], [], [], []

    for elem in data:
        padded_question = padding(question_max_words, elem["question"])
        tiled_mask = reshape_mask(question_max_words, elem["question_mask"])

        # Correct triplets first (label 1), then wrong triplets (label 0).
        labelled_sets = (
            (elem["correct_triplets"], 1),
            (elem["wrong_triplets"], 0),
        )
        for triplets, label in labelled_sets:
            triplet_vectors.append(
                padding(triplets_max_numbers, _pool_triplets(triplets))
            )
            # The same question/mask objects are shared by both rows.
            question_vectors.append(padded_question)
            question_masks.append(tiled_mask)
            Y.append(label)

    return question_vectors, question_masks, triplet_vectors, Y

def generate_model_inputs(pattern='/Users/surthi/Downloads/train_*.ipwk',
                          question_max_words=60, triplets_max_numbers=500):
    """Load every pickled training file matching ``pattern`` and build model inputs.

    Parameters
    ----------
    pattern : str, optional
        Glob pattern of the pickle files to load. Defaults to the location
        that was previously hard-coded, so existing no-argument calls behave
        as before.
    question_max_words : int, optional
        Forwarded to ``prepare_data``.
    triplets_max_numbers : int, optional
        Forwarded to ``prepare_data``.

    Returns
    -------
    tuple
        Whatever ``prepare_data`` returns for the combined samples:
        ``(question_vectors, question_masks, triplet_vectors, Y)``.
    """
    # Local import so the function does not depend on ``pickle`` having been
    # brought into scope by another cell or a star-import.
    import pickle

    data = []
    # Sorted so the sample order is reproducible across runs and machines;
    # glob itself returns files in arbitrary order.
    for input_file in sorted(glob.glob(pattern)):
        # ``with`` closes the handle even if unpickling raises.
        with open(input_file, 'rb') as f:
            # SECURITY: pickle.load can execute arbitrary code — only point
            # ``pattern`` at files you created yourself.
            loaded = pickle.load(f)
        # np.append flattens and concatenates, as the original loop did.
        data = np.append(data, loaded)

    return prepare_data(
        data,
        question_max_words=question_max_words,
        triplets_max_numbers=triplets_max_numbers,
    )

# Build the model inputs from the pickled training files on disk.
# NOTE(review): the results are bound to names with a `1` suffix, while later
# cells read `question_vectors`, `question_masks`, `triplet_vectors` and `Y`
# (no suffix) — confirm which set of names is intended.
question_vectors1, question_masks1, triplet_vectors1, Y1 = generate_model_inputs()#prepare_data(data, question_max_words=60, triplets_max_numbers=500)

['/Users/surthi/Downloads/train_2.ipwk']


In [313]:
# Per-sample input shapes (batch axis omitted) for the three model inputs.
# NOTE(review): `maxWordsPerSentence` and `maxTriplets` are not defined in the
# visible cells — presumably provided by `from model import *`; verify.
questions_input_shape = (maxWordsPerSentence, dimOfQuestionVector, )
# The trailing 50 matches the 50-unit sentence GRUs whose output the mask multiplies.
questions_mask_input_shape = (maxWordsPerSentence, 50, )
triplets_input_shape = (maxTriplets, dimOfTripletVector, )

# Expected shapes: question_vectors (N, 60, 300), question_masks (N, 60, 50), triplet_vectors (N, 500, 900), Y (N,)
# Sanity check: width of the first word vector of the first question.
# NOTE(review): the data cell binds `question_vectors1`, not `question_vectors` — confirm the name.
len(question_vectors[0][0])

300

In [292]:
# Build the question/triplet classifier graph and register it as `m.v3`.
# Statement order is kept as-is: the unnamed Multiply/Concatenate layers get
# their automatic names from creation order.

# question inputs
question_vectors_input = Input(shape=questions_input_shape, dtype='float32', name="question_indices_input")   
question_masks_input = Input(shape=questions_mask_input_shape, dtype='float32', name="question_masks_input")

# triplets input
triplets_input = Input(shape=triplets_input_shape, dtype='float32', name="triplets_input")  #(None, 500, 900)

# layers
# gru1 = Bidirectional(GRU( 50, return_sequences=True, name="Sentence_GRU1"))
# gru2 = Bidirectional(GRU( 50, return_sequences=False, name="Sentence_GRU2"))
# tgru1 = Bidirectional(GRU( 50, return_sequences=True, name="Triplet_GRU1"))
# tgru2 = Bidirectional(GRU( 50, return_sequences=False, name="Triplet_GRU2"))

# Unidirectional replacements for the bidirectional variants above.
# Both sentence GRUs return the full sequence so the per-word mask can be applied afterwards.
gru1 = (GRU( 50, return_sequences=True, name="Sentence_GRU1"))
gru2 = (GRU( 50, return_sequences=True, name="Sentence_GRU2"))
tgru1 = (GRU( 50, return_sequences=True, name="Triplet_GRU1"))
# return_sequences=False: the triplet branch ends in a single vector per sample.
tgru2 = (GRU( 50, return_sequences=False, name="Triplet_GRU2"))
# NOTE(review): `mean` is not defined in the visible cells — presumably a backend
# mean brought in by `from model import *`; verify.
mean_layer = Lambda(lambda xin: mean(xin, axis=1), name="AMeanLayer")

# Question branch: stacked GRUs -> element-wise mask -> reduce over axis 1 (the word axis).
X_question = gru2(gru1(question_vectors_input))
X_question = Multiply()([question_masks_input, X_question]) # (None, 60, 50)
# NOTE(review): if `mean` is a plain average, masked-out positions still count
# in the denominator — confirm this is intended.
X_question = mean_layer(X_question)

# Triplet branch: Dense projection to 50 features, then stacked GRUs.
X_triplets = Dense(50, activation='relu', name="Dense50T")(triplets_input)#tgru2(tgru1(triplets_input))
X_triplets = tgru2(tgru1(X_triplets))
# Merge both branches and classify with a sigmoid output unit.
X_concatenated = Concatenate()([X_question, X_triplets])
X_concatenated = Dense(64, activation='relu', name="Dense64")(X_concatenated)
Y_pred = Dense(1, activation='sigmoid', name="Dense1")(X_concatenated)

# NOTE(review): `m` is not defined in the visible cells — confirm it is an
# object that accepts attribute assignment.
m.v3 = Model(inputs=[question_vectors_input, question_masks_input, triplets_input], outputs=Y_pred)
print(m.v3.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
question_indices_input (InputLa (None, 60, 300)      0                                            
__________________________________________________________________________________________________
Sentence_GRU1 (GRU)             (None, 60, 50)       52650       question_indices_input[0][0]     
__________________________________________________________________________________________________
triplets_input (InputLayer)     (None, 500, 900)     0                                            
__________________________________________________________________________________________________
question_masks_input (InputLaye (None, 60, 50)       0                                            
__________________________________________________________________________________________________
Sentence_G

In [None]:
# Compile and train the binary classifier.
# NOTE(review): `model` is not defined in the visible cells (the graph above is
# stored as `m.v3`), and the data cell binds `question_vectors1` etc. — confirm
# which names should be used here.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Keras' fit() takes the inputs as `x` and the targets as `y`; the previous
# `fit(inputs=[...], Y)` was a SyntaxError (positional argument after keyword).
# The Python lists are converted to arrays so each input has a batch axis.
model.fit(
    x=[
        np.asarray(question_vectors),
        np.asarray(question_masks),
        np.asarray(triplet_vectors),
    ],
    y=np.asarray(Y),
)

In [70]:
# from keras.layers import Dense, Input, Dropout, LSTM, Activation, SimpleRNN, GRU, Concatenate, Multiply, Reshape, Flatten, Bidirectional

# # layers
# s1_bilstm = Bidirectional(GRU(128, return_sequences = True), name = "s1_bilstm")
# s2_bilstm = Bidirectional(GRU(128, return_sequences = False), name = "s2_bilstm")
# t_bilstm = Bidirectional(GRU(128, return_sequences = False), name = "t_bilstm")
# embedding_layer = pretrained_embedding_layer(word_to_vec_map, words_to_index)
# unstack_layer = Lambda(lambda xin: myUnstack(xin), name="AUnstackLayer")

# # sentence inputs
# sentence_indices = Input(shape=(maxWordsPerSentence,), dtype='int32', name="sentence_indices")   
# sentence_masks = Input(shape=(2*lstm_dim,), dtype='float32', name="sentence_masks")

# # sentence embeddings
# sentence_embeddings = embedding_layer(sentence_indices)       
# # X_sentence = Reshape((maxWordsPerSentence, -1), name="s_reshape")(sentence_embeddings)
# X_sentence = s1_bilstm(sentence_embeddings)
# X_sentence = Multiply()([sentence_masks, X_sentence]) # (None, 60, 256)
# X_sentence = s2_bilstm(X_sentence)

# # triplets
# triplets_input = Input(shape=(maxTriplets, maxWordsPerSentence,), dtype='int32', name="tripletz") 
# triplet_embeddings = embedding_layer(triplets_input)

# X_triplets = Reshape((maxTriplets, -1))(triplet_embeddings)
# X_triplets = t_bilstm(X_triplets)

# X_concatenated = Concatenate()([X_sentence, X_triplets])
# X_concatenated = Dense(256, activation='relu', name="Dense256")(X_concatenated)
# X_concatenated = Dense(64, activation='relu', name="Dense64")(X_concatenated)
# Y_pred = Dense(1, activation='sigmoid', name="Dense1")(X_concatenated)

# m.v2 = Model(inputs=[sentence_indices, sentence_masks, triplets_input], outputs=X_concatenated)
# print(m.v2.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sentence_indices (InputLayer)   (None, 60)           0                                            
__________________________________________________________________________________________________
embedding (Embedding)           multiple             20000050    sentence_indices[0][0]           
                                                                 tripletz[0][0]                   
__________________________________________________________________________________________________
tripletz (InputLayer)           (None, 500, 60)      0                                            
__________________________________________________________________________________________________
sentence_masks (InputLayer)     (None, 256)          0                                            
__________