In [1]:
import tensorflow as tf
from tensorflow import keras

import numpy as np
def read_glove_vecs(glove_file):
    with open(glove_file, 'r',encoding='UTF-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map
  
words_to_index, index_to_words, word_to_vec_map = read_glove_vecs('/Users/surthi/Documents/cs230project/glove.6B.50d.txt')
print("Loaded ", len(word_to_vec_map), " model")

Loaded  400000  model


In [2]:
import itertools
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    
    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this. 
    
    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    
    m = X.shape[0]                                   # number of training examples
    
    # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
    X_indices = np.ones((m, max_len))*-1
    
    for i in range(m):                               # loop over training examples
        
        # Convert the ith training sentence in lower case and split is into words. You should get a list of words.
        sentence_words =X[i].lower().split()
        
        # Initialize j to 0
        j = 0
        
        # Loop over the words of sentence_words
        for w in sentence_words:
            # Set the (i,j)th entry of X_indices to the index of the correct word.
            X_indices[i, j] = word_to_index[w] if w in word_to_index else -1
            # Increment j to j + 1
            j = j+1

    return X_indices
  
def sentence_to_target_mask(target, sentence, dim=50):
  target_words = target.split()
  types = []
  words_in_sentence = sentence.lower().split()
  for word in words_in_sentence:
    types.append([1. if word.lower() in target_words else 0.] * dim)
  return types

def triplets_to_indices(tr):
#     tr = triplets[0]['wrong_triplets']
    newrow = [['ljl1', 'ljl2', 'ljl3']]

    # padding to zero
    numberToAppend = 500 - len(tr)
    for i in range(numberToAppend):
        tr = itertools.chain(tr, newrow)
    
    # triplets to indices
    tr1 = []
    for t in tr:
        tr1.append(t[0] +  " " + t[1] + " " + t[2])  
    return sentences_to_indices(np.asarray(tr1),words_to_index, max_len = 60)

In [3]:
import glob
import os
import json

def read_triplets(triplets_file):
    data = []
    
    with open(triplets_file) as f:
        json_data = json.load(f);
        data.extend( json_data)

    print (len(data))

    #now split the data into 2 (one for correct and one for wrong id)
    final_data = []
    for item in data:
        dict = {}
        dict["question"] = item["question"]
        dict["name"]       = item["name"]
        dict["correct_id"] = item["correct_id"]
        dict["correct_triplets"] = item["correct_triplets"]
        dict["result"] = 1
        final_data.append(dict)
        
        dict1 = {}
        dict1["question"] = item["question"]
        dict1["name"]     = item["name"]
        dict1["wrong_id"] = item["wrong_id"]
        dict1["wrong_triplets"] = item["wrong_triplets"]
        dict1["result"] = 0
        final_data.append(dict1)

    print(len(final_data))
    data.clear() # free memory for data
    return final_data

inputData = read_triplets('/Users/surthi/Documents/cs230project/data_wiki_handshake/input_w_wiki_1.ipwk')
# question, name, correct_id, wrong_id, correct_triplets, wrong_triplets

10
20


In [4]:
inputData

[{'question': "Fitzroy North is a suburb in Melbourne, Victoria, Australia, 4 km north-east from Melbourne's central business district. Its Local Government Area are the Cities of Yarra and Moreland. At the ",
  'name': 'victoria',
  'correct_id': 'Q36687',
  'correct_triplets': [['Borough of Koroit', 'country', 'Australia'],
   ['Borough of Koroit', 'country', 'Borough of Koroit'],
   ['Borough of Koroit', 'instance of', 'Borough of Koroit'],
   ['Borough of Koroit',
    'instance of',
    'former local government area of Australia'],
   ['Borough of Koroit',
    'located in the administrative territorial entity',
    'Victoria'],
   ['Borough of Port Fairy', 'country', 'Australia'],
   ['Borough of Port Fairy', 'instance of', 'Borough of Port Fairy'],
   ['Borough of Port Fairy',
    'instance of',
    'former local government area of Australia'],
   ['Borough of Port Fairy', 'instance of', 'geographical object'],
   ['Borough of Port Fairy',
    'located in the administrative territ

In [12]:
from keras.layers.embeddings import Embedding
def pretrained_embedding_layer(word_to_vec_map, words_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """
    vocab_len = len(words_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_to_vec_map["cucumber"].shape[0]      # define dimensionality of your GloVe word vectors (= 50)
    
    # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
    emb_matrix = np.zeros((vocab_len, emb_dim))
    
    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for word, index in words_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Define Keras embedding layer with the correct output/input sizes, make it trainable. Use Embedding(...). Make sure to set trainable=False. 
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False, name="embedding")

    # Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
    embedding_layer.build((None,))
    
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [13]:
embedding_layer = pretrained_embedding_layer(word_to_vec_map, words_to_index)
print("weights[0][1][3] =", embedding_layer.get_weights()[0][1][3])

weights[0][1][3] = -0.3403


In [5]:
import numpy as np
np.random.seed(0)
from keras.models import Model
from keras.backend import mean, sum
from keras.layers import Dense, Input, Dropout, LSTM, Activation, GRU, Concatenate, Multiply, Reshape
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from keras.layers import Lambda
np.random.seed(1)
from keras.models import Sequential

def myUnstack(xin):
    return tf.unstack(xin, axis=1)

def ned_model(input_shape, mask_shape, triplets_shape, word_to_vec_map, word_to_index):
    """
    Function creating the context-vector-generator model's graph.
    
    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    model -- a model instance in Keras
    """

    # Create all the layers
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    sentence_gru1 = GRU( 50, return_sequences=True, name="ASentence_GRU1")
    sentence_gru2 = GRU( 50, return_sequences=True, name="ASentence_GRU2")
    triplets_gru1 = GRU( 50, batch_size=500, input_shape=triplets_shape, return_sequences=True, name="ATriplets_GRU1")
    triplets_gru2 = GRU( 50, return_sequences=True, name="ATriplets_GRU2")
    mean_layer = Lambda(lambda xin: mean(xin, axis=1), name="AMeanLayer")
    unstack_layer = Lambda(lambda xin: myUnstack(xin), name="AUnstackLayer")
    
    # sentence inputs
    sentence_indices = Input(shape=input_shape, dtype='int32', name="sentence_indices")   
    sentence_masks = Input(shape=mask_shape, dtype='float32', name="sentence_masks")
    
    # sentence embeddings
    sentence_embeddings = embedding_layer(sentence_indices)       
    X_sentence = sentence_gru2(sentence_gru1(sentence_embeddings))
    X_sentence = Multiply()([sentence_masks, X_sentence])
    X_sentence = mean_layer(X_sentence)
    
    #X_sentence = mean(X_sentence, axis=1) #1*60*50 if axis=0 it means it took average of all the X it recieved in this batch
    
    #X_sentence = tf.reduce_mean(tf.multiply(sentence_masks, X_sentence), axis=0)
    
    # triplets input
    triplets_indices = Input(shape=triplets_shape, dtype='int32', name="Atriplets_indices")   
   
    # triplets embeddings
    triplet_embeddings = embedding_layer(triplets_indices) 
    print(triplet_embeddings.get_shape())
    #r = Reshape((-1, 500, 300))(triplet_embeddings)#data.reshape((-1,20))
    unstacked_embeddings = unstack_layer(triplet_embeddings)
    o = Concatenate(axis=1)(unstacked_embeddings) # 0=(,60,50), 1 = (, 30000, 50), 2=(, 60, 25000)
    X_triplets = mean_layer(triplets_gru2(triplets_gru1(o)))
    
    # ATriplets_GRU1 (GRU)            (None, 60, 50)       3757650     concatenate_19[0][0]             for axis2 concatenate
    #outputs = [[[50]]]
#     embeddings = tf.unstack(triplet_embeddings, axis=1)
    #for index in range(len(unstacked_embeddings)):
     # outputs += triplets_gru1(unstacked_embeddings[index])
    
    #tf.keras.backend.stack(x,axis=0)

    #mean_outputs = mean_layer(outputs) # using it 
#     print(outputs.get_shape())
    
#     # now concat triplet and sentence embeddings - how to do?
    X_concatenated = Concatenate()([X_sentence, X_triplets])
    
#     # Propagate X through a Dense layer with softmax activation to get back a batch of 20-dimensional vectors.
    X_concatenated = Dense(20, activation='relu', name="Dense20")(X_concatenated)

    Y_pred = Dense(1, activation='sigmoid', name="Dense1")(X_concatenated)
    
    # Add a softmax activation
    #Y_pred = Activation('softmax')(X_concatenated)

    # Create Model instance which converts sentence_indices into X.
    model = Model(inputs=[sentence_indices, sentence_masks, triplets_indices], outputs=Y_pred)
#    model = Model(inputs=[triplets_indices], outputs=Y_pred)
  
  # talk 
    
    return model

Using TensorFlow backend.


In [17]:
# create inputs for model
def generate_model_inputs(inputFile, mask_dim):
    inputData = read_triplets(inputFile)
    triplet_indices = []
    Y = []
    sentence_indices = []
    sentence_masks = []
    for data in inputData:
        sentence_indices.append(sentences_to_indices(np.asarray([data['question']]),words_to_index, max_len = 60))
        sentence_masks.append(sentence_to_target_mask(data['name'], data['question'], dim=mask_dim))
        if 'wrong_triplets' in data:
            #print("wrong")
            triplet_indices.append(triplets_to_indices(data['wrong_triplets']))
            Y.append(0)
        else:
            #print("here")
            triplet_indices.append(triplets_to_indices(data['correct_triplets']))
            Y.append(1)
    return triplet_indices, Y, sentence_indices, sentence_masks

triplet_indices, Y, sentence_indices, sentence_masks = generate_model_inputs('/Users/surthi/Documents/cs230project/data_wiki_handshake/input_w_wiki_1.ipwk', mask_dim=50)
print(np.asarray(triplet_indices).shape)
print(np.asarray(Y).shape)
print(np.squeeze(np.asarray(sentence_indices)).shape)
print(np.asarray(sentence_masks).shape)

10
20
(20, 500, 60)
(20,)
(20, 60)
(20,)


In [16]:
maxWordsPerSentence=60
maxTriplets=500
model = ned_model((maxWordsPerSentence,), (maxWordsPerSentence,50,), (maxTriplets, maxWordsPerSentence,), word_to_vec_map, words_to_index)
model.summary()

(?, 500, 60, 50)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sentence_indices (InputLayer)   (None, 60)           0                                            
__________________________________________________________________________________________________
Atriplets_indices (InputLayer)  (None, 500, 60)      0                                            
__________________________________________________________________________________________________
embedding (Embedding)           multiple             20000050    sentence_indices[0][0]           
                                                                 Atriplets_indices[0][0]          
__________________________________________________________________________________________________
AUnstackLayer (Lambda)          [(None, 60, 50), (No 0           embedding[1][0]            

In [28]:
triplet_indices[0].shape

(500, 60)