In [1]:
import numpy as np
import torch
import pickle
import os
from packages.vocab import Vocab
from packages.batch import Batch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
import torch.nn.functional as F
from models.copynet_dbg import CopyEncoder, CopyDecoder
from models.functions import numpy_to_var, to_np, to_var, visualize, decoder_initial, update_logger
import time
import sys
import math
# Seed PyTorch's random number generator with a fixed value so runs are repeatable.
torch.manual_seed(1000)

<torch._C.Generator at 0x7f02e8f1c7f8>

In [10]:
# Hyperparameters
# -- sizes --
embed_size, hidden_size, num_layers = 150, 300, 1
vocab_size = 5000
bin_size = 10

# -- schedule / optimisation settings --
num_epochs, batch_size = 40, 50
lr, weight_decay = 0.001, 0.99

# -- checkpoint selection --
use_saved = True    # whether to train from a previous model
continue_from = 14  # added to `epoch` when building checkpoint file names

# -- running counters --
prev_end = 0
step = 0   # number of steps taken
epoch = 0

In [4]:
# get vocabulary
# The two .npy files are unwrapped with `.item()`, i.e. each stores a single
# pickled Python object (used below as a mapping). NumPy >= 1.16.3 refuses to
# load such object arrays unless pickling is explicitly allowed, so pass
# allow_pickle=True (the keyword is also accepted by older NumPy releases).
# NOTE(security): unpickling can execute arbitrary code; only load dataset
# files that come from a trusted source.
vocab = Vocab(vocab_size)
vocab.w2i = np.load('js_dataset/w2i.npy', allow_pickle=True).item()
vocab.i2w = np.load('js_dataset/i2w.npy', allow_pickle=True).item()
# The vocabulary size is taken from the loaded word -> index mapping.
vocab.count = len(vocab.w2i)

In [6]:
# get training and test data
# `lines` keeps every raw line of the dataset file (newline included);
# `test` holds the same lines with surrounding whitespace stripped.
file_dir = 'js_dataset/var_dataset_3_shorter.txt'
with open(file_dir) as f:
    lines = list(f)
test = list(map(str.strip, lines))

In [7]:
# Batch helper used by the evaluation loop to turn raw lines into index arrays.
batch = Batch(file_list=[], max_in_len=30, max_out_len=30, max_oovs=12)
# Floor division keeps the minibatch count an integer on Python 3 as well
# (true division would store a float here); on Python 2 the value is unchanged.
batch.num_of_minibatch = len(lines) // batch_size

In [8]:
# get number of batches
# Only whole batches are counted; a trailing partial batch is not included.
num_samples = len(test)
num_batches = num_samples // batch_size

In [12]:
# Load the saved encoder and decoder for checkpoint number `epoch + continue_from`.
# NOTE(review): these files appear to hold whole pickled model objects (the
# loaded values are called directly as models below), which ties them to the
# PyTorch / project code version that wrote them - confirm versions match.
version = 'var_source_code2'
checkpoint_tag = (version, str(epoch + continue_from))
encoder = torch.load(f='170704/encoder_%s_%s.pckl' % checkpoint_tag)
decoder = torch.load(f='170704/decoder_%s_%s.pckl' % checkpoint_tag)



In [13]:
################################# testing ##################################
# Number of evaluation samples consumed so far.
samples_read = 0

# 1. for each epoch
# 1.3. initialize entire batch data (no need...
batch.init_batch()

# 1.4. for each minibatch
# Hand-picked evaluation samples. Each string contains a ':==:' marker;
# presumably it separates the model input from the expected output - confirm
# against Batch.process_minibatch.
test2 = [
    'var assert = require ( " assert " ) ; var util:==:require (  " util "  ) ;',
    'var object = this . _ object ; var keys:==:this . _ keys ;',
    'var pageX = event . pageX ; var pageY:==:event . pageY ;',
    ('var dayNamesShort = ( settings ? settings . dayNamesShort : null ) || '
     'this . _ defaults . dayNamesShort ; var dayNames:==:'
     '( settings ? settings . dayNames : null ) || this . _ defaults . dayNames ;'),
    'var require = arr . require ; var bppp:==:arr . bppp ;',
    ('var WebChannelDebug = goog . labs . net . webChannel .'
     ' WebChannelDebug ; var Wire:==:goog . labs . net . webChannel . Wire ;'),
    'var added = e . added ; var removed:==:e . removed ;',
    ('var readdirSync = require (  " fs "  ) . readdirSync ; '
     ' var statSync:==:require ( " fs " ) . statSync ;'),
    'var chunk = entry . chunk ; var encoding:==:entry . encoding ;',
]
# test2 = test

# Exact-match counter, number of samples to score, and collected report lines.
correct, total, print_list = 0, len(test2), []
def _take_until(indices, stop_idx, skip_idx=None):
    """Return the leading entries of `indices` that precede the first `stop_idx`.

    Args:
        indices: iterable of token indices for one sample.
        stop_idx: index value at which collection stops (not included).
        skip_idx: optional index value that is dropped wherever it occurs
            before the stop value.

    Returns:
        list of the kept indices, in their original order.
    """
    kept = []
    for token_idx in indices:
        if token_idx == stop_idx:
            break
        if skip_idx is not None and token_idx == skip_idx:
            continue
        kept.append(token_idx)
    return kept


# Greedy decoding over test2, one minibatch at a time; every sample produces
# five report lines in print_list and exact matches are counted in `correct`.
while samples_read < len(test2):

    # 1.4.1. reset the batch helper before processing a new minibatch
    batch.init_minibatch()

    # 1.4.2. obtain batch outputs
    # The slice is bounded by test2 (the list being iterated), not by the
    # unrelated `test` list: with a shorter `test` the slice could come back
    # truncated or empty, samples_read would stop advancing and this loop
    # would never terminate. Slicing clamps the upper bound to len(test2).
    data = test2[samples_read:samples_read+batch_size]
    inputs, outputs = batch.process_minibatch(data,vocab)
    samples_read+=len(data)

    # 1.4.3. inputs and outputs must be unk-ed to put into model w/ limited vocab
    unked_inputs = batch.unk_minibatch(inputs,vocab)
    unked_outputs = batch.unk_minibatch(outputs,vocab)
    x = numpy_to_var(unked_inputs)
    y = numpy_to_var(unked_outputs)

    # 1.5. encoded outputs
    encoded, _ = encoder(x)

    # 1.6.1. get initial input of decoder
    decoder_in, s, w = decoder_initial(x.size(0))
    decoder_in = y[:,0]

    # 1.7. for each decoder timestep
    # Shapes as noted by the original author:
    #   decoder_in (Variable): [b]
    #   encoded (Variable): [b x seq x hid]
    #   input_out (np.array): [b x seq]
    #   s (Variable): [b x hid]
    for j in range(y.size(1)-1): # for all sequences
        # 1.7.1. 1st state - create [out] from the first target token
        if j==0:
            out, s, w = decoder(input_idx=y[:,j], encoded=encoded,
                            encoded_idx=inputs, prev_state=s,
                            weighted=w, order=j)
        # remaining states - feed the previous prediction, append to [out]
        else:
            tmp_out, s, w = decoder(input_idx=unked_decoder_in.squeeze(), encoded=encoded,
                            encoded_idx=inputs, prev_state=s,
                            weighted=w, order=j)
            out = torch.cat([out,tmp_out],dim=1)
        # for debugging: stop if nan
        if math.isnan(w[-1][0][0].data[0]):
            print("NaN detected!")
            sys.exit()

        # 1.8.1. select next input
#         decoder_in = y[:,j] # train with ground truth
        # NOTE(review): this overwrites the first-step score of '(' for sample 0
        # before the argmax below, so it can change that sample's prediction and
        # therefore the reported accuracy. Looks like a debugging override -
        # confirm whether it should stay.
        if j==0:
            out[0,-1,vocab.w2i['(']]=1
        decoder_in = out[:,-1,:].max(1)[1] # train with prev outputs
        unked_decoder_in = batch.unk_minibatch(decoder_in.cpu().data.numpy(),vocab)
        unked_decoder_in = Variable(torch.LongTensor(unked_decoder_in).cuda())
    # 1.9.1. our targeted outputs should include OOV indices
    target_outputs = numpy_to_var(outputs[:,1:])

    # 1.9.2. get padded versions of target and output
    # NOTE(review): `target` and `pad_out` are not read again in this cell;
    # they are kept in case other cells inspect them.
    target = pack_padded_sequence(target_outputs,batch.output_lens.tolist(), batch_first=True)[0]
    pad_out = pack_padded_sequence(out,batch.output_lens.tolist(), batch_first=True)[0]

    # 1.10. build a readable report for every sample of the minibatch
    for idx in range(len(data)):
        # Index 0 ends the input; index 3 ends target/prediction and index 2
        # is dropped from the target (presumably padding / end / start
        # markers - TODO confirm against the vocabulary).
        input_print = _take_until(inputs[idx], 0)
        truth_print = _take_until(outputs[idx], 3, skip_idx=2)
        predict_print = _take_until(out[idx,:,:].max(1)[1].cpu().data.numpy(), 3)
        line0 = "\n==================================================================="
        line1 = 'Input1:       '+''.join(vocab.idx_list_to_word_list(input_print, batch.idx2oov_list[idx]))
        line2 = 'Output:       '+''.join(vocab.idx_list_to_word_list(truth_print, batch.idx2oov_list[idx]))
        line3 = 'Predict[UNK]: '+''.join(vocab.idx_list_to_word_list(predict_print))
        line4 = 'Predicted:    '+''.join(vocab.idx_list_to_word_list(predict_print, batch.idx2oov_list[idx]))
        # Tokens were joined without separators; re-insert the space after 'var'.
        line1 = line1.replace('var', 'var ')
        line1 = line1.replace(';',';\nInput2:       ')
        line2 = line2.replace('var', 'var ')
        line3 = line3.replace('var', 'var ')
        line4 = line4.replace('var', 'var ')
        # Both labels are 14 characters wide, so [14:] compares only the payloads.
        if line2[14:]==line4[14:]:
            correct+=1
            line4+='\n***CORRECT***'
        print_list.extend([line0,line1,line2,line3,line4])
# with open('test_results_%s_epoch_%d_acc_%1.3f.txt' 
#           %(version,epoch+continue_from,correct*1.0/total),'w') as f:
#     f.write('\n'.join(print_list))
# Exact-match accuracy over all samples in test2.
print(correct*1.0/total)

AttributeError: 'CopyEncoder' object has no attribute '_forward_pre_hooks'

In [18]:
# Display the report lines collected in print_list by the evaluation cell.
print_list

 'Input1:       var dayNamesShort=(settings?settings.dayNamesShort:null)||this._defaults.dayNamesShort;\nInput2:       var dayNames',
 'Output:       (settings?settings.dayNames:null)||this._defaults.dayNames;',
 'Predict[UNK]: (settings?settings.dayNames:null)||settings.dayNamesShort;',
 'Predicted:    (settings?settings.dayNames:null)||settings.dayNamesShort;',
 'Input1:       var WebChannelDebug=goog.labs.net.webChannel.WebChannelDebug;\nInput2:       var Wire',
 'Output:       goog.labs.net.webChannel.Wire;',
 'Predict[UNK]: goog.labs.net.<UNK>.<UNK>;',
 'Predicted:    goog.labs.net.Wire.Wire;',
 'Input1:       var readdirSync=require( "fs" ).readdirSync;\nInput2:        var statSync',
 'Output:       require("fs").statSync;',
 'Predict[UNK]: require("path").statSync;',
 'Predicted:    require("path").statSync;',
 'Input1:       var assert=require("assert");\nInput2:       var util',
 'Output:       require( "util" );',
 'Predict[UNK]: require("util");',
 'Predicted:    require("ut

In [None]:
# Scratch: values of `out` for batch entry 2, across the whole second axis,
# at the vocabulary index of the token 'fs'.
out[2,:,vocab.w2i['fs']]

In [None]:
# Scratch: pass the target indices of sample `idx` through batch.unk_minibatch
# and show them as a space-separated word string.
# Relies on `idx`, `outputs` and `batch` left over from the evaluation cell.
unked = batch.unk_minibatch(outputs[idx],vocab)
' '.join(vocab.idx_list_to_word_list(unked,batch.idx2oov_list[idx]))

In [None]:
# NOTE(review): `stories` and `summaries` are not defined anywhere in the
# visible notebook - presumably leftovers from another notebook; confirm
# before running. Only the last expression of the cell is displayed.
' '.join(vocab.idx_list_to_word_list(stories[idx],batch.idx2oov_list[idx]))
unked = batch.unk_minibatch(summaries[idx],vocab)
' '.join(vocab.idx_list_to_word_list(unked,batch.idx2oov_list[idx]))

In [None]:
# Scratch: render `summaries[idx]` as words, passing the sample's idx2oov entry.
# NOTE(review): `summaries` is not defined in the visible notebook - confirm.
' '.join(vocab.idx_list_to_word_list(summaries[idx],batch.idx2oov_list[idx]))

In [None]:
# Scratch: render `summaries[idx]` as words without passing an OOV list.
# NOTE(review): `summaries` is not defined in the visible notebook - confirm.
' '.join(vocab.idx_list_to_word_list(summaries[idx]))

In [None]:
# Scratch: show the oov2idx_list entry of `batch` for sample `idx`.
batch.oov2idx_list[idx]

In [None]:
# Scratch: allocate a 3 x 5 x 2 tensor; this constructor does not initialise
# the values. `a` is not used elsewhere in the visible notebook.
a = torch.Tensor(3,5,2)

In [None]:
# Scratch: look up the vocabulary index stored for the '<EOS>' token.
vocab.w2i['<EOS>']

In [None]:
# out : [b x seq x vocab] -> [b x seq]