# Analysis of Neural Programmer

In [1]:
import copy
import itertools
import os
import pickle
import re
import string
import sys
import time
from random import shuffle

import autoreload
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from IPython.display import HTML, Image, clear_output, display
from scipy.spatial.distance import cosine

sys.path.append('../neural_programmer')

  from ._conv import register_converters as _register_converters


In [25]:
import notebook_utils
import data_utils
from neural_programmer import evaluate

In [3]:
%reload_ext autoreload
%autoreload 2

## Paths, parameters, etc. 

In [6]:
# Restrict the process to a single GPU on the multi-GPU machine
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

# Pretrained model checkpoint
MODEL_FILE = os.path.join('..', 'pretrained_model', 'model_92500')

# WikiTableQuestions data: original and perturbed variants
DATA_DIR, PERTURBED_DATA_DIR = '../wtq_data', '../perturbed_wtq_data'

# Root directory under which attribution files are written
OUT_DIR = '/tmp'

## Load data, build graph and restore pretrained weights

In [7]:
# Load the three data splits from DATA_DIR. `utility` is the shared helper
# object that later cells read vocabulary maps and FLAGS from.
train_data, dev_data, test_data, utility = notebook_utils.init_data(DATA_DIR)

Annotated examples loaded  14152
Annotated examples loaded  4344
entry match token:  9133 9133
entry match token:  9134 9134
# train examples  10178
# dev examples  2546
# test examples  3913


In [9]:
# Clear TensorFlow's default graph first so that re-running this cell builds
# the model into an empty graph instead of adding to the previous one.
tf.reset_default_graph()
sess, graph, params = notebook_utils.build_graph(utility)

forget gate bias
step:  0
step:  1
step:  2
step:  3
optimize params  ['unit', 'word', 'word_match_feature_column_name', 'controller', 'column_controller', 'column_controller_prev', 'controller_prev', 'question_lstm_ix', 'question_lstm_fx', 'question_lstm_cx', 'question_lstm_ox', 'question_lstm_im', 'question_lstm_fm', 'question_lstm_cm', 'question_lstm_om', 'question_lstm_i', 'question_lstm_f', 'question_lstm_c', 'question_lstm_o', 'history_recurrent', 'history_recurrent_bias', 'break_conditional']
grads:  Tensor("gradients_24/L2Loss_grad/mul:0", shape=(15, 256), dtype=float64) unit
grads:  Tensor("gradients_24/L2Loss_1_grad/mul:0", shape=(10800, 256), dtype=float64) word
grads:  Tensor("gradients_24/L2Loss_2_grad/mul:0", shape=(1,), dtype=float64) word_match_feature_column_name
grads:  Tensor("gradients_24/L2Loss_3_grad/mul:0", shape=(512, 256), dtype=float64) controller
grads:  Tensor("gradients_24/L2Loss_4_grad/mul:0", shape=(512, 256), dtype=float64) column_controller
grads:  Tens

In [10]:
# Restore the pretrained parameters stored at MODEL_FILE into the session.
sess, graph = notebook_utils.restore_model(sess, graph, params, MODEL_FILE)

INFO:tensorflow:Restoring parameters from ../pretrained_model/model_92500


In [14]:
# Sanity-check the restored model on the dev set. The last argument (92500)
# mirrors the step number in MODEL_FILE's name and is echoed in the printed
# accuracy line.
evaluate(sess, dev_data, utility.FLAGS.batch_size, graph, 92500)

dev set accuracy   after  92500  :  0.3720472446811481
2540.0 2546
--------


## Apply Integrated Gradients (IG) 

In [68]:
# write attributions to this folder
attrs_outdir = os.path.join(OUT_DIR, 'attributions')
# makedirs(exist_ok=True) is safe to re-run and also creates missing parent
# directories; the isdir()/mkdir() pair could race and failed when OUT_DIR
# itself did not exist yet.
os.makedirs(attrs_outdir, exist_ok=True)

# get embedding of dummy token (used as the "empty question" IG baseline)
embeddings = graph.params["word"].eval()
dummy_embedding = embeddings[utility.dummy_token_id, :]

# which data to use?
data = dev_data

# number of sample points for Riemann integral computation
num_points = 500

# hard coded stuff in the code
question_attention_mask_value = -10000.0

In [69]:
# Integrated Gradients (IG) for the predicted program of every example in `data`.
# For each batch this cell
#   1. runs inference to get the selected operator / column at every stage,
#   2. runs the same tables with an "empty" question (dummy-token embeddings,
#      zeroed match features) to get the table-specific default program,
#   3. for each example, accumulates a Riemann-sum approximation of IG along the
#      straight line from that baseline to the real input, and
#   4. writes `<question_id>_labels.tsv` and `<question_id>_attrs.txt` to
#      `attrs_outdir`.
batch_size = graph.batch_size
total_cols = graph.num_cols + graph.num_word_cols

for offset in range(0, len(data) - batch_size + 1, batch_size):
    feed_dict = data_utils.generate_feed_dict(data, offset, batch_size, graph)

    # first run inference to get operator and column sequences, and embeddings of question words
    fetches = [graph.final_correct_list, graph.final_operation_softmax,
               graph.final_column_softmax, graph.question_words_embeddings]
    correct_list, operation_softmax, column_softmax, question_words_embeddings = sess.run(
        fetches, feed_dict)

    # compute table-specific default programs for tables in this batch
    feed_copy = feed_dict.copy()
    for t in graph.question_words_embeddings:
        feed_copy[t] = np.concatenate(
            [np.expand_dims(dummy_embedding, 0)]*batch_size, 0)

    # Ideally the following line should be uncommented, but for attributions,
    # we choose to keep this variable fixed. Note that this induces some bias
    # in the attributions as the baseline is no longer an "empty" question, but
    # an empty question where the question length is implicitly encoded in this variable
    # feed_copy[graph.batch_question_attention_mask].fill(question_attention_mask_value)

    feed_copy[graph.batch_exact_match] = np.zeros_like(
        feed_copy[graph.batch_exact_match])
    feed_copy[graph.batch_column_exact_match] = np.zeros_like(
        feed_copy[graph.batch_column_exact_match])

    fetches = [graph.final_operation_softmax, graph.final_column_softmax]

    default_operation_softmax, default_column_softmax = sess.run(
        fetches, feed_copy)

    for batch_id in range(batch_size):
        wiki_example = data[offset+batch_id]

        # get operator indices
        op_indices = np.argmax(operation_softmax[batch_id, :, :], axis=1)
        col_indices = np.argmax(column_softmax[batch_id, :, :], axis=1)

        op_list = notebook_utils.softmax_to_names(
            operation_softmax[batch_id, :, :], utility.operations_set)
        col_list = notebook_utils.softmax_to_names(
            column_softmax[batch_id, :, :], notebook_utils.get_column_names(wiki_example))

        # The default program has to be read from this example's own row of the
        # baseline run. Every batch row carries its own example's table, so
        # reading row 0 would pair the first example's default program with
        # this example's column names (and with the row-`batch_id` softmax
        # values written to the labels file below).
        default_op_list = notebook_utils.softmax_to_names(
            default_operation_softmax[batch_id, :, :], utility.operations_set)
        default_col_list = notebook_utils.softmax_to_names(
            default_column_softmax[batch_id, :, :], notebook_utils.get_column_names(wiki_example))

        print([notebook_utils.rename(w) for w in op_list])
        print(col_list)

        # Sample points along the integral path and collect them as one batch:
        # every batch-shaped input is replaced by `batch_size` copies of this
        # example, and the program is pinned to the predicted one.
        scaled_feed = feed_dict.copy()
        for key in list(scaled_feed.keys()):
            value = feed_dict[key]
            if key.shape[0] == batch_size:  # this is a hack
                scaled_feed[key] = [value[batch_id] for i in range(batch_size)]
        scaled_feed[graph.op_ids] = op_indices
        scaled_feed[graph.col_ids] = col_indices

        # number of path samples, rounded down to a whole number of batches
        num_examples = batch_size * int(num_points/float(batch_size))
        scale = 1.0/num_examples

        # one row per stage; columns are the question words followed by the
        # two aggregated match features ('tm', 'cm')
        batch_op_attribution = np.zeros(
            [graph.max_passes, graph.question_length+2], dtype=np.float32)
        batch_col_attribution = np.zeros(
            [graph.max_passes, graph.question_length+2], dtype=np.float32)

        attr_op_softmax = []
        attr_col_softmax = []

        actual_num_numeric_cols = len(wiki_example.original_nc_names)
        actual_num_word_cols = len(wiki_example.original_wc_names)

        exact_match = wiki_example.exact_match
        exact_column_match = wiki_example.exact_column_match

        batch_question_embeddings = np.array(question_words_embeddings)[
            :, batch_id, :]  # shape: 62 x 256

        # Per-sample increments along the path. They do not depend on the
        # sample index, so they are computed once per example.
        qw_jump = [scale * (batch_question_embeddings[i] - dummy_embedding)
                   for i in range(graph.question_length)]

        # True for real table columns, False for padding (dummy) columns.
        # Dummy columns are held fixed along the path, so their jump is 0.
        is_real_col = ([i < actual_num_numeric_cols for i in range(graph.num_cols)] +
                       [i < actual_num_word_cols for i in range(graph.num_word_cols)])
        exact_match_jump = [
            scale*np.array(exact_match[i]) if is_real_col[i] else 0
            for i in range(total_cols)]
        exact_column_match_jump = [
            scale*np.array(exact_column_match[i]) if is_real_col[i] else 0
            for i in range(total_cols)]

        # split up set of points into batch_size'd batches
        for k in range(0, num_examples, batch_size):
            print('k:', k)
            steps = range(k, k+batch_size)

            # scale question words to points between dummy_embedding and actual embedding
            for i, t in enumerate(graph.question_words_embeddings):
                scaled_feed[t] = [dummy_embedding + j*qw_jump[i] for j in steps]

            # scale the match features of real columns from 0 to their actual
            # value; dummy columns keep their original value at every sample
            scaled_exact_match = [
                np.expand_dims(
                    [j*scale*np.array(exact_match[i]) if is_real_col[i]
                     else exact_match[i]
                     for j in steps], 1)
                for i in range(total_cols)]
            scaled_column_exact_match = [
                np.expand_dims(
                    [j*scale*np.array(exact_column_match[i]) if is_real_col[i]
                     else exact_column_match[i]
                     for j in steps], 1)
                for i in range(total_cols)]

            scaled_feed[graph.batch_exact_match] = np.concatenate(
                scaled_exact_match, 1)  # shape 20 x 40 x 100
            scaled_feed[graph.batch_column_exact_match] = np.concatenate(
                scaled_column_exact_match, 1)  # shape 20 x 40

            # compute gradients
            fetches = [graph.final_operation_softmax, graph.final_column_softmax, graph.operator_gradients,
                       graph.column_gradients]
            temp_op_softmax, temp_col_softmax, operator_gradients, column_gradients = sess.run(
                fetches, scaled_feed)  # operator gradient shape: 4 x 62 x 20 x 256

            attr_op_softmax.append(temp_op_softmax)
            attr_col_softmax.append(temp_col_softmax)

            # compute attributions: gradient * jump, summed over the samples of
            # this batch. The gradient lists hold `n` entries per stage, read
            # here as [question words, exact match, column exact match].
            for gradients, attribution in ((operator_gradients, batch_op_attribution),
                                           (column_gradients, batch_col_attribution)):
                n = len(gradients) // graph.max_passes
                for stage in range(graph.max_passes):
                    temp = [np.sum(gradients[n*stage][i]*qw_jump[i], axis=(0, 1))
                            for i in range(graph.question_length)]
                    temp += [np.sum([gradients[n*stage+1][0][:, i, :]*exact_match_jump[i]
                                     for i in range(total_cols)])]
                    temp += [np.sum([gradients[n*stage+2][0][:, i]*exact_column_match_jump[i]
                                     for i in range(total_cols)])]
                    attribution[stage, :] += temp

        # sanity check to make sure the integral summation adds up to function difference
        attr_op_softmax = np.concatenate(attr_op_softmax, axis=0)
        attr_col_softmax = np.concatenate(attr_col_softmax, axis=0)
        for tag, attribution, input_softmax, path_softmax, chosen in (
                ('OP', batch_op_attribution, operation_softmax, attr_op_softmax, op_indices),
                ('COL', batch_col_attribution, column_softmax, attr_col_softmax, col_indices)):
            for stage in range(graph.max_passes):
                lhs = np.sum(attribution[stage, :])
                input_fn_value = input_softmax[batch_id, stage, chosen[stage]]
                # first path sample (j = 0) is the baseline itself
                baseline_fn_value = path_softmax[0, stage, chosen[stage]]
                rhs = input_fn_value - baseline_fn_value
                print(tag, stage, ':', 'baseline=', baseline_fn_value, ', input_fn=',
                      input_fn_value, 'check: ', lhs, ' - ', rhs, ' = ', lhs-rhs)

        # index of the first real question word: the number of non-zero
        # entries in the attention mask
        question_begin = np.nonzero(
            wiki_example.question_attention_mask)[0].shape[0]

        attributions_matrix = np.zeros(
            [graph.question_length - question_begin + 2, 2 * graph.max_passes])
        row_labels = []  # question words, tm, cm
        col_labels = []  # operator and column selections
        col_label_softmaxes = []  # softmaxes of the selections

        for ix in range(question_begin, graph.question_length):
            word = utility.reverse_word_ids[wiki_example.question[ix]]
            if word == utility.unk_token:
                # show the original surface form next to the unknown-word token
                word = word + '-' + [str(w) for w in wiki_example.string_question if w !=
                                     wiki_example.question_number and w != wiki_example.question_number_1][ix - question_begin]
            word = notebook_utils.rename(word)
            row_labels.append(word)
        row_labels.extend(['tm', 'cm'])

        for stage in range(graph.max_passes):
            col_labels.append(notebook_utils.rename(
                op_list[stage]) + ' (' + notebook_utils.rename(default_op_list[stage]) + ')')
            col_labels.append(notebook_utils.rename(
                col_list[stage]) + ' (' + notebook_utils.rename(default_col_list[stage]) + ')')

            col_label_softmaxes.append(str(operation_softmax[batch_id, stage, op_indices[stage]]) + ' (' + str(
                default_operation_softmax[batch_id, stage, op_indices[stage]]) + ')')
            col_label_softmaxes.append(str(column_softmax[batch_id, stage, col_indices[stage]]) + ' (' + str(
                default_column_softmax[batch_id, stage, col_indices[stage]]) + ')')

            attributions_matrix[:, 2 * stage] = batch_op_attribution[stage, question_begin:]
            attributions_matrix[:, 2 * stage +
                                1] = batch_col_attribution[stage, question_begin:]

        question_string = ' '.join([notebook_utils.rename(str(w))
                                    for w in wiki_example.string_question])

        # save operator and column selections to file
        with tf.gfile.GFile(os.path.join(attrs_outdir, wiki_example.question_id + '_labels.tsv'), 'w') as outf:
            outf.write(question_string)
            outf.write('\n')
            outf.write(str(correct_list[batch_id] == 1.0))
            outf.write('\n')
            outf.write('\t'.join(row_labels) + '\n')
            outf.write('\t'.join(col_labels) + '\n')
            outf.write('\t'.join(col_label_softmaxes) + '\n')

        # save attributions to file
        np.savetxt(os.path.join(
            attrs_outdir, wiki_example.question_id + '_attrs.txt'), attributions_matrix)


['select', 'prev', 'first', 'print']
['team', 'team', 'wins', 'team']
k: 0
k: 20
k: 40
k: 60
k: 80
k: 100
k: 120
k: 140
k: 160
k: 180
k: 200
k: 220
k: 240
k: 260
k: 280
k: 300
k: 320
k: 340
k: 360
k: 380
k: 400
k: 420
k: 440
k: 460
k: 480
OP 0 : baseline= 0.18709323784186127 , input_fn= 0.9895706953516668 check:  0.8022113  -  0.8024774575098055  =  -0.00026617287235430886
OP 1 : baseline= 0.000754526421322655 , input_fn= 0.9590392317258396 check:  0.9605387  -  0.9582847053045169  =  0.0022539800172909352
OP 2 : baseline= 0.0032642627771376923 , input_fn= 0.9986749874390007 check:  0.9976185  -  0.9954107246618631  =  0.0022077717561361787
OP 3 : baseline= 0.9975274390468174 , input_fn= 0.9999996349762448 check:  0.0024711965  -  0.0024721959294273788  =  -9.994642331534465e-07
COL 0 : baseline= 0.0059443340639456456 , input_fn= 0.9998552390240457 check:  0.99448097  -  0.9939109049601  =  0.0005700625615674415
COL 1 : baseline= 0.0045525129306662184 , input_fn= 0.5224792493280775 che

## Create HTML with visualizations

## Apply Integrated Gradients on table-specific default programs

In [75]:
# write attributions to this folder
attrs_outdir = os.path.join(OUT_DIR, 'attributions_default_programs')
# makedirs(exist_ok=True) is safe to re-run and also creates missing parent
# directories; the isdir()/mkdir() pair could race and failed when OUT_DIR
# itself did not exist yet.
os.makedirs(attrs_outdir, exist_ok=True)

# get embedding of dummy token
embeddings = graph.params["word"].eval()
dummy_embedding = embeddings[utility.dummy_token_id, :]

# which data to use?
data = dev_data

# number of sample points for Riemann integral computation
num_points = 500

# hard coded stuff in the code
question_attention_mask_value = -10000.0

In [76]:
# Collect one example per unique table and turn it into an "empty question"
# example: all question tokens become the dummy token, every attention-mask
# entry is set to question_attention_mask_value, and both match features are
# zeroed.
unique_tables = {}
for wiki_example in data:
    if wiki_example.table_key in unique_tables:
        continue
    # Work on a shallow copy. `data` aliases dev_data, so overwriting these
    # attributes on the original object would blank out the real question for
    # every later use of dev_data (e.g. re-running the IG cell above). A
    # shallow copy is enough because the attributes are rebound, not mutated.
    table_example = copy.copy(wiki_example)
    table_example.exact_column_match = np.zeros_like(
        wiki_example.exact_column_match).tolist()
    table_example.exact_match = np.zeros_like(
        wiki_example.exact_match).tolist()
    table_example.question = [
        utility.dummy_token_id] * graph.question_length
    table_example.question_attention_mask = (
        question_attention_mask_value *
        np.ones_like(wiki_example.question_attention_mask)).tolist()
    unique_tables[table_example.table_key] = table_example
data = list(unique_tables.values())

In [74]:
# Integrated Gradients for the table-specific default programs: attributes the
# program selected for an empty question to the individual column names.
# The baseline replaces the column hidden vectors by zeros; the path scales
# them linearly up to their actual values.
#
# Ported to Python 3 (print(), integer division for list indices) and to the
# `notebook_utils.` helpers so it matches the rest of the notebook.
#
# NOTE(review): `outf` (an open, writable text file), `num_attributions` and
# `attribution_coverage` are not defined in any visible cell; they must be
# defined before this cell is run.
# NOTE(review): this cell indexes graph.operator_gradients / column_gradients
# as gradients w.r.t. the numeric and word column hidden vectors, which differs
# from the layout used by the question-word IG cell - confirm against
# notebook_utils.build_graph.
batch_size = graph.batch_size

for offset in range(0, len(data) - batch_size + 1, batch_size):

    feed_dict = data_utils.generate_feed_dict(data, offset, batch_size, graph)
    fetches = [graph.final_correct_list, graph.final_operation_softmax,
               graph.final_column_softmax, graph.column_hidden_vectors, graph.word_column_hidden_vectors]
    correct_list, operation_softmax, column_softmax, column_hidden_vectors, word_column_hidden_vectors = sess.run(
        fetches, feed_dict)

    # compute global default program (all column name vectors zeroed out)
    feed_copy = feed_dict.copy()
    feed_copy[graph.column_hidden_vectors] = np.zeros(
        graph.column_hidden_vectors.get_shape().as_list())
    feed_copy[graph.word_column_hidden_vectors] = np.zeros(
        graph.word_column_hidden_vectors.get_shape().as_list())
    _, default_operation_softmax, default_column_softmax, _, _ = sess.run(
        fetches, feed_copy)

    for batch_id in range(batch_size):
        wiki_example = data[offset + batch_id]

        # get op indices
        op_indices = np.argmax(operation_softmax[batch_id, :, :], axis=1)
        col_indices = np.argmax(column_softmax[batch_id, :, :], axis=1)

        op_list = notebook_utils.softmax_to_names(
            operation_softmax[batch_id, :, :], utility.operations_set)
        col_list = notebook_utils.softmax_to_names(
            column_softmax[batch_id, :, :], notebook_utils.get_column_names(wiki_example))

        print(op_list)
        print(col_list)

        # generate scaled feed: every batch-shaped input is replaced by
        # `batch_size` copies of this example, with the program pinned to the
        # predicted one
        scaled_feed = feed_dict.copy()
        for key in list(scaled_feed.keys()):
            value = feed_dict[key]
            if key.shape[0] == batch_size:
                scaled_feed[key] = [value[batch_id] for i in range(batch_size)]
        scaled_feed[graph.op_ids] = op_indices
        scaled_feed[graph.col_ids] = col_indices

        num_examples = 25 * batch_size
        scale = 1.0 / num_examples

        # one row per stage, one column per (numeric + word) table column
        batch_op_attribution = np.zeros(
            [graph.max_passes, graph.num_cols + graph.num_word_cols], dtype=np.float32)
        batch_col_attribution = np.zeros(
            [graph.max_passes, graph.num_cols + graph.num_word_cols], dtype=np.float32)
        attr_op_softmax = []
        attr_col_softmax = []

        actual_num_numeric_cols = len(wiki_example.original_nc_names)
        actual_num_word_cols = len(wiki_example.original_wc_names)

        # Per-sample increments along the path; independent of the sample
        # index, so computed once per example. Dummy (padding) columns are
        # held fixed, hence a jump of 0.
        numeric_column_name_jump = [
            scale * np.array(column_hidden_vectors[batch_id, i, :])
            if i < actual_num_numeric_cols else 0
            for i in range(graph.num_cols)]
        word_column_name_jump = [
            scale * np.array(word_column_hidden_vectors[batch_id, i, :])
            if i < actual_num_word_cols else 0
            for i in range(graph.num_word_cols)]

        for k in range(0, num_examples, batch_size):
            print('k:', k)
            steps = range(k, k + batch_size)

            # real columns are scaled from 0 towards their actual vector;
            # dummy columns keep their original vector at every sample
            scaled_numeric_column_names = [
                np.expand_dims(
                    [j * scale * np.array(column_hidden_vectors[batch_id, i, :])
                     if i < actual_num_numeric_cols
                     else np.array(column_hidden_vectors[batch_id, i, :])
                     for j in steps], 1)
                for i in range(graph.num_cols)]
            scaled_word_column_names = [
                np.expand_dims(
                    [j * scale * np.array(word_column_hidden_vectors[batch_id, i, :])
                     if i < actual_num_word_cols
                     else np.array(word_column_hidden_vectors[batch_id, i, :])
                     for j in steps], 1)
                for i in range(graph.num_word_cols)]

            scaled_feed[graph.column_hidden_vectors] = np.concatenate(
                scaled_numeric_column_names, 1)
            scaled_feed[graph.word_column_hidden_vectors] = np.concatenate(
                scaled_word_column_names, 1)

            # compute gradients
            fetches = [graph.final_operation_softmax, graph.final_column_softmax,
                       graph.operator_gradients, graph.column_gradients]
            temp_op_softmax, temp_col_softmax, operator_gradients, column_gradients = sess.run(
                fetches, scaled_feed)

            attr_op_softmax.append(temp_op_softmax)
            attr_col_softmax.append(temp_col_softmax)

            # compute attributions: gradient * jump, summed over this batch of
            # samples. `//` keeps `n` an int so it can be used as a list index.
            for gradients, attribution in ((operator_gradients, batch_op_attribution),
                                           (column_gradients, batch_col_attribution)):
                n = len(gradients) // graph.max_passes
                for stage in range(graph.max_passes):
                    temp = [np.sum(gradients[n * stage][0][:, i, :] *
                                   numeric_column_name_jump[i]) for i in range(graph.num_cols)]
                    temp += [np.sum(gradients[n * stage + 1][0][:, i, :] *
                                    word_column_name_jump[i]) for i in range(graph.num_word_cols)]
                    attribution[stage, :] += temp

        # sanity check: summed attributions should match f(input) - f(baseline)
        attr_op_softmax = np.concatenate(attr_op_softmax, axis=0)
        attr_col_softmax = np.concatenate(attr_col_softmax, axis=0)
        for tag, attribution, input_softmax, path_softmax, chosen in (
                ('OP', batch_op_attribution, operation_softmax, attr_op_softmax, op_indices),
                ('COL', batch_col_attribution, column_softmax, attr_col_softmax, col_indices)):
            for stage in range(graph.max_passes):
                lhs = np.sum(attribution[stage, :])
                input_fn_value = input_softmax[batch_id, stage, chosen[stage]]
                # first path sample (j = 0) is the baseline itself
                baseline_fn_value = path_softmax[0, stage, chosen[stage]]
                rhs = input_fn_value - baseline_fn_value
                print(tag, stage, ':', 'baseline=', baseline_fn_value, ', input_fn=',
                      input_fn_value, 'check: ', lhs, ' - ', rhs, ' = ', lhs - rhs)

        # For every stage, list the column names with the largest absolute
        # attribution: at most `num_attributions` of them, stopping once their
        # share of the total absolute attribution exceeds `attribution_coverage`.
        summaries = []
        for attribution, path_softmax, chosen in (
                (batch_op_attribution, attr_op_softmax, op_indices),
                (batch_col_attribution, attr_col_softmax, col_indices)):
            per_stage = [None] * graph.max_passes
            for stage in range(graph.max_passes):
                cumsum_attr = np.cumsum(
                    np.sort(np.abs(attribution[stage]))[::-1])
                total_attr = cumsum_attr[-1]
                if total_attr > 0.0:
                    cumsum_attr = (1.0 * cumsum_attr) / total_attr
                    take_attr = min(num_attributions,
                                    np.nonzero(cumsum_attr > attribution_coverage)[0][0] + 1,
                                    len(attribution[0]))
                else:
                    take_attr = 0
                attr_indices = np.argsort(np.abs(attribution[stage]))[
                    ::-1][:take_attr]
                attrs = [
                    'baseline (' + str(round(path_softmax[0, stage, chosen[stage]], 6)) + ')<br>']
                for ix in attr_indices:
                    attr_value = attribution[stage][ix]
                    # indices below num_cols are numeric columns, the rest are
                    # word columns; the label is the first token of the name
                    if ix < graph.num_cols:
                        word = utility.reverse_word_ids[wiki_example.column_ids[ix][0]]
                    else:
                        word = utility.reverse_word_ids[wiki_example.word_column_ids[ix-graph.num_cols][0]]
                    attrs.append('<b>' + word + '</b>' +
                                 '(' + str(round(attr_value, 3)) + ')')
                per_stage[stage] = ', '.join(attrs)
            summaries.append(per_stage)
        op_attributions, col_attributions = summaries

        question_string = ' '.join([str(w)
                                    for w in wiki_example.string_question])
        if correct_list[batch_id] == 1.0:
            question_string = '<font color="green">' + question_string + '</font>'
        else:
            question_string = '<font color="red">' + question_string + '</font>'

        question_id = '<a href=https://wikitables.googleplex.com/table?table_name=' + \
            wiki_example.table_key[4:] + '&version=np_eval_on_unshuffled_data_of_original_model_trained_on_unshuffled_data>' + \
            wiki_example.question_id + '</a>'
        default_op_list = notebook_utils.softmax_to_names(
            default_operation_softmax[batch_id, :, :], utility.operations_set)
        default_col_list = notebook_utils.softmax_to_names(
            default_column_softmax[batch_id, :, :], notebook_utils.get_column_names(wiki_example))
        print(default_op_list)
        print(default_col_list)

        # selected name, then the global-default name (green) together with the
        # probability the global default run assigns to the *selected* choice
        combined_op_list = []
        for stage, (op, dop) in enumerate(zip(op_list, default_op_list)):
            dop_softmax_final = default_operation_softmax[batch_id,
                                                          stage, op_indices[stage]]
            combined_op_list.append(
                op + '<br><font color="green">' + dop + '</font>' +
                '(' + str(round(dop_softmax_final, 3)) + ')')

        combined_col_list = []
        for stage, (col, dcol) in enumerate(zip(col_list, default_col_list)):
            dcol_softmax_final = default_column_softmax[batch_id,
                                                        stage, col_indices[stage]]
            combined_col_list.append(
                col + '<br><font color="green">' + dcol + '</font>' +
                '(' + str(round(dcol_softmax_final, 3)) + ')')

        line = '\t'.join([question_id, question_string] + op_attributions +
                         col_attributions + combined_op_list + combined_col_list)
        print(line)
        outf.write(line + '\n')
        outf.flush()
outf.close()

330