In [1]:
from seq2seq.gSCAN_dataset import GroundedScanDataset
from seq2seq.model import Model
from seq2seq.rollout import Rollout
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import inspect
import numpy as np
device = torch.device(type='cuda')

In [2]:
training_set = GroundedScanDataset("gSCAN_data/data/compositional_splits/dataset.txt", "gSCAN_data/data/compositional_splits/", split="train", 
                                        input_vocabulary_file="training_input_vocab.txt", 
                                        target_vocabulary_file="training_target_vocab.txt", 
                                        generate_vocabulary=False, k=0) 

In [3]:
testing_set = GroundedScanDataset("gSCAN_data/data/compositional_splits/dataset.txt", "gSCAN_data/data/compositional_splits/", split="test", 
                                        input_vocabulary_file="training_input_vocab.txt", 
                                        target_vocabulary_file="training_target_vocab.txt", 
                                        generate_vocabulary=False, k=0) 

In [18]:
example = training_set.dataset.get_examples_with_image("train", True)

In [None]:
for i, e in enumerate(example):
    print(e)
    input_commands = e["input_command"]
    target_commands = e["target_command"]
    situation_image = e["situation_image"]
    plt.imshow(situation_image[:,:,3])
    if i == 2:
        break

In [21]:
test_example = testing_set.dataset.get_examples_with_image("test", True)

In [None]:
for i, e in enumerate(test_example):
    print(e)
    input_commands = e["input_command"]
    target_commands = e["target_command"]
    situation_image = e["situation_image"]
    plt.imshow(situation_image[:,:,3])
    break

In [4]:
training_set.read_dataset(max_examples=100,
                              simple_situation_representation=True)

In [5]:
cfg = {'data_path': 'gSCAN_data/data/compositional_splits/dataset.txt', 'data_directory': 'gSCAN_data/data/compositional_splits/', 'generate_vocabularies': False, 'input_vocab_path': 'training_input_vocab.txt', 'target_vocab_path': 'training_target_vocab.txt', 'embedding_dimension': 25, 'num_encoder_layers': 1, 'encoder_dropout_p': 0.3, 'encoder_bidirectional': True, 'training_batch_size': 50, 'test_batch_size': 1, 'max_decoding_steps': 30, 'num_decoder_layers': 1, 'decoder_dropout_p': 0.3, 'cnn_kernel_size': 7, 'cnn_dropout_p': 0.1, 'cnn_hidden_num_channels': 50, 'simple_situation_representation': True, 'decoder_hidden_size': 100, 'encoder_hidden_size': 100, 'learning_rate': 0.001, 'adam_beta_1': 0.9, 'adam_beta_2': 0.999, 'resume_from_file': '', 'max_training_iterations': 200000, 'output_directory': 'models', 'print_every': 100, 'evaluate_every': 1000, 'conditional_attention': True, 'auxiliary_task': False, 'weight_target_loss': 0.3, 'attention_type': 'bahdanau', 'k': 0, 'max_training_examples': None, 'seed': 42, 'kwargs': {'mode': 'train', 'split': 'test', 'max_testing_examples': None, 'splits': 'test', 'output_file_name': 'predict.json'}, 'device': device, 'lr_decay': 0.9, 'lr_decay_steps': 20000}

In [6]:
# model = Model(input_vocabulary_size=training_set.input_vocabulary_size,
#                   target_vocabulary_size=training_set.target_vocabulary_size,
#                   num_cnn_channels=training_set.image_channels,
#                   input_padding_idx=training_set.input_vocabulary.pad_idx,
#                   target_pad_idx=training_set.target_vocabulary.pad_idx,
#                   target_eos_idx=training_set.target_vocabulary.eos_idx,
#                   embedding_dimension=25, encoder_hidden_size=100, num_encoder_layers=1,
#                   encoder_dropout_p=0.3, encoder_bidirectional=True, num_decoder_layers=1,
#                   decoder_dropout_p=0.3, decoder_hidden_size=100, cnn_kernel_size=7, cnn_dropout_p=0.1,
#                   cnn_hidden_num_channels=50, output_directory="models", conditional_attention=True, auxiliary_task=False,
#                   simple_situation_representation=True, attention_type="bahdanau")
model = Model(input_vocabulary_size=training_set.input_vocabulary_size,
                  target_vocabulary_size=training_set.target_vocabulary_size,
                  num_cnn_channels=training_set.image_channels,
                  input_padding_idx=training_set.input_vocabulary.pad_idx,
                  target_pad_idx=training_set.target_vocabulary.pad_idx,
                  target_eos_idx=training_set.target_vocabulary.eos_idx,
                  **cfg)
model = model.cuda()



In [7]:
model.train()

Model(
  (situation_encoder): ConvolutionalNet(
    (conv_1): Conv2d(16, 50, kernel_size=(1, 1), stride=(1, 1))
    (conv_2): Conv2d(16, 50, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (conv_3): Conv2d(16, 50, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
    (dropout): Dropout(p=0.1, inplace=False)
    (relu): ReLU()
    (layers): Sequential(
      (0): ReLU()
      (1): Dropout(p=0.1, inplace=False)
    )
  )
  (visual_attention): Attention(
    (key_layer): Linear(in_features=150, out_features=100, bias=False)
    (query_layer): Linear(in_features=100, out_features=100, bias=False)
    (energy_layer): Linear(in_features=100, out_features=1, bias=False)
  )
  (encoder): EncoderRNN(
    EncoderRNN
     bidirectional=True 
     num_layers=1
     hidden_size=100
     dropout=0.3
     n_input_symbols=21
    
    (embedding): Embedding(21, 25, padding_idx=0)
    (dropout): Dropout(p=0.3, inplace=False)
    (lstm): LSTM(25, 100, dropout=0.3, bidirectional=True)
  )
  (enc_h

In [8]:
rollout = Rollout(model, 0.8)

In [9]:
for (input_batch, input_lengths, _, situation_batch, _, target_batch, target_lengths, agent_positions, target_positions) in training_set.get_data_iterator(batch_size=10):
    target_scores, target_position_scores = model(commands_input=input_batch, commands_lengths=input_lengths,
                                                          situations_input=situation_batch, target_batch=target_batch,
                                                          target_lengths=target_lengths)
    print(target_scores, target_position_scores)
    break

tensor([[[-2.1977, -2.2715, -2.2845, -2.3473, -1.8754, -2.3150, -2.4746,
          -2.3151, -1.8752],
         [-2.1626, -2.3288, -1.8263, -2.6755, -2.6097, -2.1257, -2.1233,
          -2.2709, -1.9525],
         [-2.0654, -2.4855, -2.1612, -2.2177, -2.3129, -2.1729, -2.1947,
          -2.1692, -2.0604],
         [-2.0411, -2.1134, -2.2031, -1.9930, -2.3282, -2.2059, -2.5773,
          -2.1337, -2.2972],
         [-2.1747, -2.2630, -2.2495, -1.9562, -2.1982, -2.4407, -2.1477,
          -2.1214, -2.2951],
         [-2.1939, -2.1876, -2.1612, -2.2024, -2.2294, -2.2020, -2.2232,
          -2.1877, -2.1893],
         [-2.1901, -2.1916, -2.1687, -2.2068, -2.2194, -2.2044, -2.2231,
          -2.1827, -2.1896],
         [-2.1877, -2.1950, -2.1716, -2.2082, -2.2146, -2.2051, -2.2222,
          -2.1817, -2.1900],
         [-2.1862, -2.1973, -2.1727, -2.2085, -2.2124, -2.2052, -2.2213,
          -2.1820, -2.1905],
         [-2.1852, -2.1987, -2.1731, -2.2084, -2.2114, -2.2050, -2.2205,
         

  memory_lengths = torch.tensor(memory_lengths, dtype=torch.long, device=device)


In [9]:
samples = model.sample(F.log_softmax(target_scores, dim=-1).max(dim=-1)[1].detach()[:,:6], 
             commands_input=input_batch,
             commands_lengths=input_lengths,
             situations_input=situation_batch, 
             target_batch=target_batch,
             sos_idx=testing_set.target_vocabulary.sos_idx, eos_idx=testing_set.target_vocabulary.eos_idx)

In [11]:
target_scores.shape

torch.Size([10, 11, 9])

In [10]:
np.array(samples).T

array([[1, 0, 4, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 2, 8, 8, 8, 8, 8, 2, 1, 0, 8, 3, 4, 2, 0, 8, 3, 4, 8, 4, 8,
        2, 8, 2, 1, 0, 0, 0, 0, 0, 0],
       [1, 0, 5, 8, 8, 2, 8, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 8, 3, 8, 8, 2, 8, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 4, 5, 8, 8, 8, 8, 3, 4, 8, 2, 8, 2, 8, 2, 0, 2, 6, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 5, 8, 3, 8, 8, 8, 2, 8, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 4, 2, 8, 8, 3, 8, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 5, 8, 8, 2, 8, 5, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 5, 8, 4, 8, 8, 8, 7, 3, 5, 8, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0,

In [30]:
F.log_softmax(target_scores, dim=-1).max(dim=-1)[1].detach().chunk(11, dim=1)[1:].szie()

AttributeError: 'tuple' object has no attribute 'szie'

In [11]:
np.array(model.sample(F.log_softmax(target_scores, dim=-1).max(dim=-1)[1].detach()[:,:6], 
             commands_input=input_batch,
             commands_lengths=input_lengths,
             situations_input=situation_batch, 
             target_batch=target_batch,
             sos_idx=testing_set.target_vocabulary.sos_idx, eos_idx=testing_set.target_vocabulary.eos_idx)).T

array([[1, 0, 4, 8, 8, 8, 8, 5, 8, 2, 0, 8, 8, 2, 8, 2, 1, 0, 8, 3, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 2, 8, 8, 8, 8, 8, 3, 5, 3, 4, 2, 8, 4, 8, 2, 8, 4, 8, 2, 6,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 5, 8, 8, 2, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 8, 3, 8, 8, 8, 2, 2, 8, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 4, 5, 8, 8, 8, 8, 3, 8, 8, 3, 4, 5, 3, 2, 6, 2, 8, 1, 0, 8,
        2, 1, 0, 8, 2, 1, 0, 0, 0, 0],
       [1, 0, 5, 8, 3, 8, 8, 8, 3, 5, 3, 4, 8, 2, 8, 2, 8, 2, 1, 0, 8, 2,
        6, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 4, 2, 8, 8, 3, 8, 4, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 5, 8, 8, 2, 8, 2, 8, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 5, 8, 4, 8, 8, 5, 8, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0,

In [15]:
target_batch

tensor([[1, 3, 4, 5, 2, 0, 0, 0, 0, 0, 0],
        [1, 3, 4, 5, 4, 5, 2, 0, 0, 0, 0],
        [1, 3, 4, 5, 4, 5, 2, 0, 0, 0, 0],
        [1, 3, 4, 5, 4, 5, 2, 0, 0, 0, 0],
        [1, 3, 4, 5, 4, 5, 4, 5, 2, 0, 0],
        [1, 3, 4, 5, 4, 5, 4, 5, 2, 0, 0],
        [1, 3, 4, 5, 4, 5, 4, 5, 2, 0, 0],
        [1, 3, 4, 5, 4, 5, 4, 5, 2, 0, 0],
        [1, 3, 4, 5, 4, 5, 4, 5, 4, 5, 2],
        [1, 3, 4, 5, 4, 5, 4, 5, 4, 5, 2]], device='cuda:0')

In [18]:
F.log_softmax(target_scores, dim=-1).max(dim=-1)

torch.return_types.max(
values=tensor([[-1.7702, -1.9953, -1.9954, -1.9774, -1.9521, -2.1726, -2.1783, -2.1754,
         -2.1732, -2.1720, -2.1714],
        [-1.9499, -1.9965, -1.9423, -1.9491, -1.9146, -1.9209, -1.9605, -2.1648,
         -2.1753, -2.1814, -2.1796],
        [-1.6846, -1.9704, -1.7049, -1.8677, -2.0024, -1.9253, -1.8574, -2.1740,
         -2.1812, -2.1766, -2.1727],
        [-1.7634, -2.0047, -1.9297, -1.7189, -1.9774, -1.8752, -2.0173, -2.1670,
         -2.1744, -2.1758, -2.1734],
        [-1.6413, -1.8887, -1.9781, -1.7050, -1.9861, -1.9262, -1.8829, -1.8626,
         -1.8369, -2.1745, -2.1835],
        [-1.6450, -1.8827, -2.0399, -1.8861, -1.9740, -1.8769, -1.9888, -1.8354,
         -2.0403, -2.1730, -2.1838],
        [-1.6865, -1.8132, -1.9273, -2.0015, -1.9260, -1.8817, -1.9189, -1.8948,
         -1.9843, -2.1673, -2.1733],
        [-1.7344, -1.9161, -2.0697, -1.7953, -1.8654, -1.7896, -1.8749, -2.0628,
         -1.8690, -2.1618, -2.1725],
        [-1.8241, -1.9797

In [10]:
def reward_func(pred):
    return 0.2

In [11]:
rollout.get_reward(target_scores, 16, input_batch, input_lengths, situation_batch, target_batch, testing_set.target_vocabulary.sos_idx, testing_set.target_vocabulary.eos_idx, reward_func)

array([0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2])