In [None]:
import scipy.stats as stats
import numpy as np
import keras.backend as K
import tensorflow as tf
from OrderingModels import *
import pickle


In [None]:
# --- Paths (must be filled in before running) ---------------------------------
# Location of the sentences and permutations files created by the data
# preparation notebooks.
data_folder = ''
# Name of the trained model file.
modelFileName = ''

# --- Data dimensions ----------------------------------------------------------
max_seq_len = 10     # sentence slots per sample
max_sent_len = 40    # token ids per sentence after padding (word-level input)
max_num_word = 2000  # upper bound on the embedding vocabulary size
EMBEDDING_DIM = 300  # width of one word embedding vector

# --- Evaluation settings ------------------------------------------------------
split_at = 30     # samples from this index onwards form the test set
batch_size = 100  # NOTE(review): not referenced by the cells visible here

# --- Model variants -----------------------------------------------------------
usePointerBasedLSTM = False  # Work in progress
useWordLevelEmbeddings = False
usePretrainedWordEmbeddings = False

In [None]:
def calulateKendallTau(y_true, y_pred, filler_rank=None):
    """Return the mean Kendall tau between true and predicted orderings.

    Both inputs are reduced along their last axis with ``argmax`` and shifted
    by one, giving a 1-based rank per step. Wherever the *true* rank equals
    ``filler_rank`` the predicted rank is overwritten with ``filler_rank`` too,
    so those steps can never count against the prediction. Kendall's tau is
    then computed per sample (first axis) and averaged.

    Parameters
    ----------
    y_true, y_pred : array-like
        One-hot (or score) arrays of identical shape whose first axis indexes
        samples and whose last axis indexes candidate positions.
    filler_rank : int, optional
        1-based rank that marks padding steps in ``y_true``. Defaults to the
        module-level ``max_seq_len``, which is what this function always used.
        NOTE(review): if the filler occupies an extra slot appended after the
        ``max_seq_len`` real positions, its 1-based rank would be
        ``max_seq_len + 1`` - confirm the default matches the target encoding.

    Returns
    -------
    float
        Mean of the per-sample tau values. The result is NaN when the batch is
        empty or when any per-sample tau is NaN (scipy reports NaN for an
        undefined correlation, e.g. a constant ranking).
    """
    if filler_rank is None:
        filler_rank = max_seq_len

    true_ranks = np.argmax(y_true, axis=-1) + 1
    pred_ranks = np.argmax(y_pred, axis=-1) + 1

    # Force agreement on padding steps: keep the prediction only where the
    # ground truth is a real position.
    pred_ranks = np.where(true_ranks != filler_rank, pred_ranks, filler_rank)

    taus = [
        stats.kendalltau(true_row, pred_row)[0]
        for true_row, pred_row in zip(true_ranks, pred_ranks)
    ]
    return np.mean(taus)
    

In [None]:
# Target orderings: tab-separated integers, parsed into an integer array.
permutations_path = data_folder + 'permutations.txt'
y = np.loadtxt(permutations_path, delimiter='\t', dtype=int)

# Raw text, one list entry per line of the file (each entry keeps its line
# ending); the sentences within a line are split on tabs further down.
sentences_path = data_folder + 'sentences.txt'
with open(sentences_path, encoding='utf8') as f:
    sentences = list(f)

# Number of extra slots appended to the one-hot decision space on top of the
# max_seq_len sentence positions.
inc = 1
if usePointerBasedLSTM:
    # pointer lstm assumes output decision space has equal length to input.
    inc = 0

# One-hot encode every target index in one vectorised step: row k of the
# identity matrix is the one-hot vector for index k, so indexing it with the
# integer array `y` appends a one-hot axis of width max_seq_len + inc.
# An index outside that width raises IndexError, as the element-wise
# assignment it replaces did.
YY = np.eye(max_seq_len + inc, dtype=int)[y]

if useWordLevelEmbeddings:
    # Word-level input: every sentence becomes a fixed-length row of token ids.
    # NOTE(review): pad_sequences, Embedding and Constant are not imported by
    # name in the import cell; presumably `from OrderingModels import *`
    # provides them - confirm.
    tokenizer_file_extra = '_pretrained' if usePretrainedWordEmbeddings else ''

    # Parse the GloVe vectors only when they are consumed below. Loading them
    # unconditionally made the large file a hard requirement (and a slow
    # start-up cost) even for the trainable-embedding variant.
    embeddings_index = {}
    if usePretrainedWordEmbeddings:
        with open('./Pretrained/glove.6B/glove.6B.300d.txt', encoding="utf8") as f:
            for line in f:  # stream the file instead of holding every line in memory
                word, coefs = line.split(maxsplit=1)
                embeddings_index[word] = np.fromstring(coefs, 'f', sep=' ')

    # SECURITY: pickle.load can execute arbitrary code contained in the file;
    # only load tokenizer pickles that come from a trusted source.
    with open('tokenizer' + tokenizer_file_extra + '.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)

    X = []
    for line in sentences:
        # NOTE(review): `line` still carries its line ending, so the last
        # sentence of each sample is tokenised with it attached - left as is
        # in case the model was trained on identically prepared input.
        lineSents = line.split('\t')
        # Rows beyond the number of sentences in this sample stay all-zero.
        xx = np.zeros((max_seq_len, max_sent_len))
        for j, ls in enumerate(lineSents):
            ls_vec = tokenizer.texts_to_sequences([ls])
            ls_vec = pad_sequences(ls_vec, maxlen=max_sent_len)
            xx[j] = ls_vec[0]
        X.append(xx)

    X = np.asarray(X)

    word_index = tokenizer.word_index
    num_words = min(max_num_word, len(word_index) + 1)

    if usePretrainedWordEmbeddings:
        # Row i holds the GloVe vector of the word with tokenizer index i;
        # words without a GloVe entry keep an all-zero row.
        embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
        for word, i in word_index.items():
            if i >= max_num_word:
                continue
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        embedding_layer = Embedding(num_words, EMBEDDING_DIM, input_length=max_sent_len, embeddings_initializer=Constant(embedding_matrix), trainable=False)

    else:
        embedding_layer = Embedding(num_words, EMBEDDING_DIM, input_length=max_sent_len, trainable=True)

else:
    # Sentence-level input: each sample is max_seq_len single-element lists
    # holding the raw sentence strings; unused slots keep the " " placeholder.
    X = []
    for line in sentences:
        lineSents = line.split('\t')
        xx = [[" "] for _ in range(max_seq_len)]
        for j, ls in enumerate(lineSents):
            xx[j][0] = ls
        X.append(xx)

    X = np.asarray(X)
    
# Everything from index `split_at` onwards is used as the evaluation set.
held_out = slice(split_at, None)
x_test = X[held_out]
y_test = YY[held_out]

In [None]:
# Instantiate the model evaluated below (weights are loaded in the next cell).
# NOTE(review): BiLstmBased is presumably provided by
# `from OrderingModels import *` - confirm.
model = BiLstmBased(max_seq_len)

In [None]:
# Run inference inside an explicit TensorFlow session that is also registered
# as the Keras backend session.
with tf.Session() as session:
    K.set_session(session)
    # Run the variable and table initialisers before loading the trained
    # weights; presumably so the loaded values are not overwritten afterwards.
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    model.load_weights('./Models/' + modelFileName)
    # NOTE(review): predict is called without a batch_size argument, so the
    # `batch_size` value from the config cell is not used here.
    y_pred = model.predict(x_test)

In [None]:
# Greedy decoding of the score array `y_pred` into one-hot decisions: at each
# step the best-scoring position that has not been used yet is selected.
n_slots = max_seq_len + inc

y_pred_onehot = []
for sample_scores in y_pred:
    available = np.ones(n_slots, dtype=bool)
    sample_onehot = []
    for step_scores in sample_scores:
        # ensure only unique positions are selected: used positions are set to
        # -inf rather than multiplied by 0, so they cannot win the argmax even
        # when the remaining scores are zero or negative.
        choice = np.argmax(np.where(available, step_scores, -np.inf))

        onehot = np.zeros(n_slots, dtype=int)
        onehot[choice] = 1
        sample_onehot.append(onehot)

        available[choice] = False  # remove already selected positions in subsequent selections
        # Filler position can be selected at anytime.
        # NOTE(review): the last slot is re-enabled even when inc == 0
        # (usePointerBasedLSTM), where no extra slot is appended - confirm
        # that this is intended.
        available[-1] = True
    y_pred_onehot.append(sample_onehot)

y_pred_onehot = np.asarray(y_pred_onehot)

In [None]:
# Mean Kendall tau of the decoded predictions against the held-out targets;
# as the last expression of the cell, the value is displayed as its output.
calulateKendallTau(y_test,y_pred_onehot)