In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

# Seed PyTorch's global random number generator with a fixed value so that
# runs started from a fresh kernel are repeatable.
torch.manual_seed(1013)

<torch._C.Generator at 0x7fc411f95ab0>

### Part 1: Prepare the data

In [33]:
import pandas as pd
# Load the utterance -> target pairs; both columns are used as parallel sequences.
df = pd.read_csv("codegen.csv")
data_x = df["utterance"]
data_y = df["targets"]

# Alternative loader for the plain-text "calculator.dataset" format, kept for
# reference only (it is not executed):
#
# with open("calculator.dataset", "r") as f:
#     lines = f.readlines()
#
# data_x, data_y = [], []
# for line in lines:
#     if (line[0] == "("):
#         data_y.append(line.strip())
#     elif (line != "\n"):
#         data_x.append(line.strip())

# split into test/train data
from sklearn.model_selection import train_test_split
# A fixed random_state makes the partition (and therefore the vocabularies built
# from the training split and the reported accuracy) identical on every run.
# The value mirrors the one passed to torch.manual_seed.
train_x, test_x, train_y, test_y = train_test_split(
    data_x, data_y, test_size=0.2, random_state=1013
)

# Hand-written probe example stored in the test split under index label 38.
# With .loc assignment this overwrites the row if label 38 is already in the
# test split and appends a new row otherwise.
test_x.loc[38] = "What is the minimum humidity?"
test_y.loc[38] = "min( WeatherHistory [ 'Humidity' ] )"


Build the input and output vocabularies from the training split.


In [34]:
from collections import Counter

class Vocabulary():
    """Two-way mapping between string tokens and integer indices.

    Indices 0, 1 and 2 are reserved for the NULL, UNK and <end> tokens; any
    other token receives the next free index the first time it is added.
    """
    END_OF_SENTENCE = '<end>'
    NULL = 'NULL'
    UNKNOWN = 'UNK'
    END_OF_SENTENCE_INDEX = 2

    def __init__(self):
        reserved = (self.NULL, self.UNKNOWN, self.END_OF_SENTENCE)
        self.tok2ind = {tok: idx for idx, tok in enumerate(reserved)}
        self.ind2tok = {idx: tok for idx, tok in enumerate(reserved)}

    def add(self, token):
        """Register `token` under the next free index; known tokens are left untouched."""
        if token in self.tok2ind:
            return
        next_index = len(self.tok2ind)
        self.ind2tok[next_index] = token
        self.tok2ind[token] = next_index

    def __len__(self):
        """Number of tokens stored, the three reserved ones included."""
        return len(self.tok2ind)

    def get_index(self, word):
        """Return the index of `word`, or the UNK index when the word is not stored."""
        return self.tok2ind.get(word, self.tok2ind[self.UNKNOWN])

    def get_word(self, i):
        """Return the token stored at index `i` (raises KeyError for an unused index)."""
        return self.ind2tok[i]

    def sentence_to_indices(self, sentence):
        """Split `sentence` on single spaces, append <end>, and map every token to its index."""
        tokens = sentence.split(' ') + [self.END_OF_SENTENCE]
        return [self.get_index(tok) for tok in tokens]

def build_vocab(examples, min_count=2):
    """Build a Vocabulary from whitespace-tokenised example strings.

    Args:
        examples: iterable of strings; each one is split on single spaces and
            empty / whitespace-only pieces are ignored.
        min_count: minimum number of occurrences (summed over all examples) a
            token needs in order to be added. The default of 2 keeps the
            original behaviour of dropping tokens that were seen only once.

    Returns:
        A Vocabulary holding the reserved tokens plus every sufficiently
        frequent token, added in order of first appearance.
    """
    counts = Counter()
    for ex in examples:
        counts.update(w for w in ex.split(' ') if w.strip())

    word_dict = Vocabulary()
    for w, freq in counts.items():
        if freq >= min_count:
            word_dict.add(w)
    return word_dict

# Both vocabularies are derived from the training split only.
input_vocab = build_vocab(train_x)
output_vocab = build_vocab(train_y)
# Show the two token -> index tables, one per line.
print(input_vocab.tok2ind, output_vocab.tok2ind, sep="\n")

{'NULL': 0, 'UNK': 1, '<end>': 2, 'What': 3, 'is': 4, 'the': 5, 'lowest': 6, 'temperature': 7, 'value?': 8, 'to': 9, 'how': 10, 'it': 11, 'Predict': 12, 'linear': 13, 'model': 14, 'when': 15, 'at': 16, '12': 17, 'degrees.': 18, 'Tell': 19, 'me': 20, 'what': 21, 'minimum': 22, 'What’s': 23, 'today?': 24, 'Can': 25, 'you': 26, 'average': 27, 'of': 28, 'all': 29, 'temperatures': 30, 'in': 31, 'weather': 32, 'history': 33, 'data': 34, 'base.': 35, 'temperature.': 36, "What's": 37, 'highest': 38, 'How': 39, 'does': 40, 'influence': 41, 'feeling_temperature?': 42, 'have': 43, 'affect': 44, 'on': 45, 'humidity?': 46, 'Give': 47, 'actual': 48, 'and': 49, 'feeling': 50, 'predict': 51, 'value': 52, 'humidity': 53, 'would': 54, 'relationship': 55, 'between': 56, 'Of': 57, 'listed': 58, 'values,': 59, 'which': 60, 'your': 61, 'prediction': 62, 'Calculate': 63, 'correlation': 64, 'variables': 65, 'correlated': 66, 'from': 67, 'temperature?': 68, 'feel': 69, 'like': 70, 'outside,': 71, 'tell': 72, '

Process training and test datasets.

In [35]:
from torch.utils.data import Dataset
from torch.utils.data.sampler import Sampler

class Example():
    """One (utterance, target) pair in raw, tokenised and indexed form.

    Attributes:
        x_str, y_str: the raw input / output strings.
        x_toks, y_toks: the strings split on single spaces.
        x_inds, y_inds: LongTensors built from the vocabularies'
            `sentence_to_indices` output.
        y_in_x_inds: FloatTensor of shape (len(y_toks), len(x_toks)) whose
            entry [i, j] is 1 when output token i equals input token j, else 0.
    """
    def __init__(self, x_str, y_str, input_vocab, output_vocab):
        self.x_str, self.y_str = x_str, y_str
        self.input_vocab, self.output_vocab = input_vocab, output_vocab

        self.x_toks = x_str.split(' ')
        self.y_toks = y_str.split(' ')

        self.x_inds = torch.LongTensor(input_vocab.sentence_to_indices(x_str))
        self.y_inds = torch.LongTensor(output_vocab.sentence_to_indices(y_str))

        # for copying: one row per output token, marking the input positions
        # that hold exactly the same token string
        copy_rows = []
        for y_tok in self.y_toks:
            copy_rows.append([int(x_tok == y_tok) for x_tok in self.x_toks])
        self.y_in_x_inds = torch.FloatTensor(copy_rows)

# Adapter so that a plain list of examples can be fed to PyTorch's data loader
class ReaderDataset(Dataset):
    """Dataset view over an in-memory sequence of examples."""

    def __init__(self, examples):
        self.examples = examples

    def __getitem__(self, index):
        """Return the example stored at position `index`."""
        return self.examples[index]

    def __len__(self):
        """Number of wrapped examples."""
        return len(self.examples)
    
# Wrap every (utterance, target) pair of each split in an Example and expose
# the resulting lists through ReaderDataset.
train_exs = [Example(x_str, y_str, input_vocab, output_vocab)
             for x_str, y_str in zip(train_x, train_y)]
train_dataset = ReaderDataset(train_exs)

test_exs = [Example(x_str, y_str, input_vocab, output_vocab)
            for x_str, y_str in zip(test_x, test_y)]
test_dataset = ReaderDataset(test_exs)

# Print the copy-alignment matrix of every test example.
for example in test_exs:
    print(example.y_in_x_inds)


    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
[torch.FloatTensor of size 11x7]


    0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0
    0     0     0     

Vectorize individual examples and organize them into batches.

In [36]:
# vectorize batch data
def vectorize(batch):
    """Collate a list of examples into zero-padded batch tensors.

    Each example must provide `x_inds` and `y_inds` (1-D index tensors) and
    `y_in_x_inds` (2-D copy-alignment matrix).

    Returns a tuple (x, x_lens, x_mask, y, y_mask, y_in_x_inds):
        x:           LongTensor (batch, max_input_length), padded with 0.
        x_lens:      LongTensor (batch,); entry i is i*max_input_length + len_i - 1,
                     i.e. the position of the last real input token of row i
                     once the batch is flattened to (batch*max_input_length, ...).
        x_mask:      ByteTensor (batch, max_input_length); 1 on padding, 0 on tokens.
        y:           LongTensor (batch, max_output_length), padded with 0.
        y_mask:      ByteTensor (batch, max_output_length); 1 on tokens, 0 on padding.
        y_in_x_inds: FloatTensor (batch, max_output_length, max_input_length),
                     each example's matrix in the top-left corner, zeros elsewhere.
    """
    batch_size = len(batch)
    in_lens = [ex.x_inds.size(0) for ex in batch]
    out_lens = [ex.y_inds.size(0) for ex in batch]
    max_input_length = max(in_lens)
    max_output_length = max(out_lens)

    x = torch.zeros(batch_size, max_input_length).long()
    x_mask = torch.ones(batch_size, max_input_length).byte()  # mask used in softmax
    # Flattened index of the last real token of every row (see docstring).
    x_lens = torch.LongTensor(
        [row * max_input_length + n_in - 1 for row, n_in in enumerate(in_lens)]
    )

    y = torch.zeros(batch_size, max_output_length).long()
    y_mask = torch.zeros(batch_size, max_output_length).byte()  # for masked_select

    # for copying
    y_in_x_inds = torch.FloatTensor(batch_size, max_output_length, max_input_length).zero_()

    for row, ex in enumerate(batch):
        n_in, n_out = in_lens[row], out_lens[row]
        x[row, :n_in] = ex.x_inds
        x_mask[row, :n_in] = 0
        y[row, :n_out] = ex.y_inds
        y_mask[row, :n_out] = 1
        n_rows, n_cols = ex.y_in_x_inds.size(0), ex.y_in_x_inds.size(1)
        y_in_x_inds[row, :n_rows, :n_cols] = ex.y_in_x_inds

    return x, x_lens, x_mask, y, y_mask, y_in_x_inds

# Training batches are drawn in random order; every batch is collated by vectorize().
train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=100,  ## the batch_size can be tuned
    sampler=train_sampler, num_workers=1, collate_fn=vectorize
)

# Test examples are visited one at a time, in their stored order.
test_sampler = torch.utils.data.sampler.SequentialSampler(test_dataset)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=1,  ## the batch_size can be tuned
    sampler=test_sampler, num_workers=1, collate_fn=vectorize
)

# Peek at the first training batch only.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    print(first_batch)

100
51
(
   25    26     1  ...      0     0     0
  123    58     7  ...      0     0     0
    3     4     5  ...      0     0     0
       ...          ⋱          ...       
    4   128   101  ...      0     0     0
   39     1    40  ...      0     0     0
   37     5    38  ...      0     0     0
[torch.LongTensor of size 100x22]
, 
   14
   28
   54
   74
   96
  116
  140
  159
  185
  210
  227
  249
  272
  291
  322
  336
  362
  386
  402
  430
  445
  466
  492
  517
  538
  556
  585
  604
  625
  646
  666
  687
  709
  735
  753
  776
  804
  819
  857
  866
  885
  907
  934
  954
  976
  997
 1021
 1039
 1062
 1088
 1108
 1132
 1150
 1173
 1194
 1216
 1241
 1267
 1288
 1308
 1325
 1349
 1370
 1392
 1416
 1439
 1456
 1487
 1503
 1529
 1546
 1567
 1591
 1613
 1633
 1658
 1677
 1703
 1723
 1746
 1764
 1788
 1814
 1836
 1859
 1883
 1901
 1931
 1941
 1966
 1989
 2013
 2030
 2056
 2080
 2096
 2122
 2142
 2162
 2185
[torch.LongTensor of size 100]
, 
    0     0     0  ...    

### Part 2: Build the seq2seq model

In [37]:
# stack bidirectional LSTM
class StackBRNN(nn.Module):
    """Stack of bidirectional LSTM layers applied to batch-first input.

    Args:
        input_dim: feature size of the input to the first layer.
        hidden_dim: hidden size of each direction; every layer therefore
            emits 2*hidden_dim features, which is what later layers consume.
        num_layers: number of stacked bidirectional LSTMs.
    """
    def __init__(self, input_dim, hidden_dim, num_layers=1):
        super().__init__()

        self.num_layers = num_layers
        self.rnns = nn.ModuleList([
            nn.LSTM(input_dim if layer == 0 else 2 * hidden_dim, hidden_dim, bidirectional=True)
            for layer in range(num_layers)
        ])

    def forward(self, x):
        """Map (batch_size, seq_len, input_dim) to (batch_size, seq_len, 2*hidden_dim)."""
        # The LSTMs are sequence-first, so swap the batch and sequence dims.
        hidden_seq = x.transpose(0, 1)  # (seq_len, batch_size, input_dim)

        for layer in range(self.num_layers):
            # nn.LSTM returns (output, (h_n, c_n)); only the per-step output is kept.
            hidden_seq, _ = self.rnns[layer](hidden_seq)

        # Swap back to batch-first.
        return hidden_seq.transpose(0, 1)  # (batch_size, seq_len, 2*hidden_dim)

#### Part 2.1: Define the basic seq2seq model

In [38]:
class Seq2Seq(nn.Module):
    """Encoder-decoder without attention.

    Args:
        embedding_dim: size of the input and output token embeddings.
        hidden_dim: decoder hidden size (the encoder emits 2*hidden_dim features).
        input_vocab, output_vocab: vocabularies; their lengths size the
            embedding tables and the output layer.
        copying: accepted for signature parity with AttentionSeq2Seq; it is
            neither stored nor used by this class.
    """
    def __init__(self, embedding_dim, hidden_dim, input_vocab, output_vocab, copying=False):
        super().__init__()
        self.input_vocab, self.output_vocab = input_vocab, output_vocab
        self.in_vocab_size = len(input_vocab)
        self.out_vocab_size = len(output_vocab)

        # Encoder side: embeddings (index 0 is padding) + bidirectional LSTM stack.
        self.in_embedding = nn.Embedding(self.in_vocab_size, embedding_dim, padding_idx=0)
        self.encoder = StackBRNN(embedding_dim, hidden_dim)

        # Decoder side.
        self.out_embedding = nn.Embedding(self.out_vocab_size, embedding_dim, padding_idx=0)

        # LSTMCell: inputs are input, (h_0, c_0); outputs are h_1, c_1
        self.decoder = nn.LSTMCell(embedding_dim, hidden_dim)

        # Projects the 2*hidden_dim encoder output down to hidden_dim.
        self.enc_to_dec = nn.Linear(2 * hidden_dim, hidden_dim)

        self.output_layer = nn.Linear(hidden_dim, self.out_vocab_size)

    def encode(self, x):
        """Embed the index batch `x` and run the encoder: (batch_size, seq_len, hidden_dim*2)."""
        return self.encoder(self.in_embedding(x))

    def decode(self, h_prev):
        """Return the output-vocabulary distribution computed from `h_prev[0]`.

        `h_prev` is the decoder state tuple (h, c); only h is read.
        """
        logits = self.output_layer(h_prev[0])
        return F.softmax(logits, dim=1)

<img src="attention.png">

In [39]:
class AttentionSeq2Seq(nn.Module):
    """Encoder-decoder with dot-product attention and an optional copy distribution.

    Args:
        embedding_dim: size of the input and output token embeddings.
        hidden_dim: decoder hidden size (the encoder emits 2*hidden_dim features).
        input_vocab, output_vocab: vocabularies; their lengths size the
            embedding tables and the output layer.
        copying: when True, `decode` appends the attention scores over the
            input positions to the vocabulary logits before the softmax.
    """
    def __init__(self, embedding_dim, hidden_dim, input_vocab, output_vocab, copying=False):
        super().__init__()
        self.input_vocab, self.output_vocab = input_vocab, output_vocab
        self.in_vocab_size = len(input_vocab)
        self.out_vocab_size = len(output_vocab)
        self.copying = copying

        context_dim = 2 * hidden_dim  # width of one encoder output / context vector

        # Encoder side: embeddings (index 0 is padding) + bidirectional LSTM stack.
        self.in_embedding = nn.Embedding(self.in_vocab_size, embedding_dim, padding_idx=0)
        self.encoder = StackBRNN(embedding_dim, hidden_dim)

        # Decoder side.
        self.out_embedding = nn.Embedding(self.out_vocab_size, embedding_dim, padding_idx=0)

        # LSTMCell: inputs are input, (h_0, c_0); outputs are h_1, c_1.
        # Its input is the concatenation of y_t and context_t.
        self.decoder = nn.LSTMCell(embedding_dim + context_dim, hidden_dim)

        # Projects the encoder output down to hidden_dim.
        self.enc_to_dec = nn.Linear(context_dim, hidden_dim)

        # Consumes the concatenation of h_t and context_t.
        self.output_layer = nn.Linear(hidden_dim + context_dim, self.out_vocab_size)

    def encode(self, x):
        """Embed the index batch `x` and run the encoder: (batch_size, seq_len, hidden_dim*2)."""
        return self.encoder(self.in_embedding(x))

    def decode(self, encoder_outputs, encoder_proj_outputs, x_mask, h_prev):
        """Attend over the encoder states and produce the next-token distribution.

        Args:
            encoder_outputs: (batch_size, seq_len, hidden_dim*2).
            encoder_proj_outputs: (batch_size, seq_len, hidden_dim).
            x_mask: (batch_size, seq_len); positions where it is set get a score of -inf.
            h_prev: decoder state tuple (h, c); only h, (batch_size, hidden_dim), is read.

        Returns:
            probs: (batch_size, out_vocab_size), or (batch_size, out_vocab_size + seq_len)
                when copying is enabled; the trailing seq_len entries then belong
                to the input positions.
            context_t: (batch_size, hidden_dim*2) attention-weighted sum of encoder_outputs.
        """
        query = h_prev[0]

        # (batch_size, seq_len, hidden_dim) x (batch_size, hidden_dim, 1) -> (batch_size, seq_len)
        scores = torch.bmm(encoder_proj_outputs, query.unsqueeze(2)).squeeze(2)
        # The mask is written straight into the score values (in place, via .data).
        scores.data.masked_fill_(x_mask.data, float('-inf'))
        alpha = F.softmax(scores, dim=1)

        # (batch_size, 1, seq_len) x (batch_size, seq_len, hidden_dim*2) -> (batch_size, hidden_dim*2)
        context_t = torch.bmm(alpha.unsqueeze(1), encoder_outputs).squeeze(1)

        logits = self.output_layer(torch.cat([query, context_t], 1))
        if self.copying:
            # Appending scores over the input
            logits = torch.cat([logits, scores], 1)

        return F.softmax(logits, dim=1), context_t

#### Part 2.2: Train the model

Now we can initialize and train the network:


In [41]:
def train(ex, model, optim):
    """Run one teacher-forced optimisation step on a batch and return its loss.

    Args:
        ex: batch tuple (x, x_lens, x_mask, y, y_mask, y_in_x_inds) as produced
            by vectorize().
        model: attention model exposing encode, enc_to_dec, decode,
            out_embedding, decoder, copying and out_vocab_size.
        optim: optimizer over the model's parameters.

    Returns:
        The batch loss as a Python number: the negative log-likelihood of the
        unmasked gold tokens, summed over the batch and divided by batch size.
    """
    model.train()

    # Variable(t.cuda()) if using GPU
    x, x_lens, x_mask, y, y_mask, y_in_x_inds = [Variable(t) for t in ex]

    encoder_outputs = model.encode(x)  # (batch_size, seq_len, hidden_dim*2)
    encoder_proj_outputs = model.enc_to_dec(encoder_outputs)  # (batch_size, seq_len, hidden_dim)

    # x_lens holds, for every row, the flattened index of its last real token,
    # so the initial decoder state is not taken from a padding position.
    batch_size, seq_len = x.size(0), x.size(1)
    h_0 = torch.index_select(encoder_proj_outputs.view(batch_size * seq_len, -1), 0, x_lens)
    c_0 = Variable(torch.zeros(h_0.size(0), h_0.size(1)))
    hidden = (h_0, c_0)

    p_y_seq = []
    for i in range(y.size(1)):
        gold_t = y[:, i]

        # Distribution for step i is computed from the state *before* gold_t is consumed.
        output, context_t = model.decode(encoder_outputs, encoder_proj_outputs, x_mask, hidden)  # with attention

        # Teacher forcing: the gold token drives the next hidden state.
        y_emb = model.out_embedding(gold_t)  # (batch_size, embedding_dim)
        hidden = model.decoder(torch.cat([y_emb, context_t], 1), hidden)

        p_y_t = output.gather(1, gold_t.unsqueeze(1))  # (batch_size, 1)

        if model.copying:
            # Probability mass on the input positions that hold the gold token.
            copy_dist = output[:, model.out_vocab_size:model.out_vocab_size + y_in_x_inds.size(2)]  # (batch_size, input_len)
            # (batch_size, 1, input_len) x (batch_size, input_len, 1)
            copying_p_y_t = torch.bmm(copy_dist.unsqueeze(1), y_in_x_inds[:, i].unsqueeze(2)).squeeze(2)
            p_y_t = p_y_t + copying_p_y_t

        p_y_seq.append(p_y_t)

    p_y_seq = torch.cat(p_y_seq, 1)  # (batch_size, seq_len)
    p_y_seq = torch.masked_select(p_y_seq, y_mask)

    # A probability that underflows to 0 would make log() return -inf and turn
    # the loss and gradients into inf/NaN; the tiny offset keeps them finite.
    eps = 1e-12
    loss = -torch.sum(torch.log(p_y_seq + eps)) / y.size(0)  # loss = -\sum_i log p(y|x)

    # Clear gradients and run backward
    optim.zero_grad()
    loss.backward()

    # Clip gradients, max_norm * v/||v|| if ||v|| > max_norm.
    # Prefer the in-place-named API where it exists; older releases only have
    # the un-suffixed (now deprecated) name.
    clip = getattr(torch.nn.utils, "clip_grad_norm_", None) or torch.nn.utils.clip_grad_norm
    clip(model.parameters(), max_norm=10.0)

    # Update parameters
    optim.step()

    # Newer PyTorch returns a 0-dim loss that cannot be indexed with [0];
    # .item() is used when available, with the legacy indexing as fallback.
    return loss.item() if hasattr(loss, "item") else loss.data[0]

# Attention-free alternative, kept for reference (train() above calls decode
# with the attention signature, so it matches AttentionSeq2Seq only):
#model = Seq2Seq(50, 20, input_vocab, output_vocab)
model = AttentionSeq2Seq(
    embedding_dim=50,
    hidden_dim=20,
    input_vocab=input_vocab,
    output_vocab=output_vocab,
    copying=True,
)

optim = torch.optim.Adam(model.parameters(), lr=0.001)

# training loop: the reported figure is the sum of the per-batch losses of the epoch
n_epochs = 100
for e in range(n_epochs):
    train_loss = sum((train(ex, model, optim) for ex in train_loader), 0.0)
    print("Epoch = %d | Loss = %.2f" % (e, train_loss))

100
51
Epoch = 0 | Loss = 57.10
100
51
Epoch = 1 | Loss = 56.03
100
51
Epoch = 2 | Loss = 56.21
100
51
Epoch = 3 | Loss = 55.40
100
51
Epoch = 4 | Loss = 54.28
100
51
Epoch = 5 | Loss = 53.18
100
51
Epoch = 6 | Loss = 53.46
100
51
Epoch = 7 | Loss = 52.65
100
51
Epoch = 8 | Loss = 52.51
100
51
Epoch = 9 | Loss = 52.09
100
51
Epoch = 10 | Loss = 49.58
100
51
Epoch = 11 | Loss = 49.14
100
51
Epoch = 12 | Loss = 48.22
100
51
Epoch = 13 | Loss = 47.77
100
51
Epoch = 14 | Loss = 46.48
100
51
Epoch = 15 | Loss = 45.55
100
51
Epoch = 16 | Loss = 43.88
100
51
Epoch = 17 | Loss = 43.53
100
51
Epoch = 18 | Loss = 42.03
100
51
Epoch = 19 | Loss = 40.59
100
51
Epoch = 20 | Loss = 40.77
100
51
Epoch = 21 | Loss = 40.02
100
51
Epoch = 22 | Loss = 38.97
100
51
Epoch = 23 | Loss = 38.17
100
51
Epoch = 24 | Loss = 37.23
100
51
Epoch = 25 | Loss = 35.92
100
51
Epoch = 26 | Loss = 36.26
100
51
Epoch = 27 | Loss = 35.38
100
51
Epoch = 28 | Loss = 34.72
100
51
Epoch = 29 | Loss = 34.13
100
51
Epoch = 30 | 

#### Part 2.3: Test the model

Testing mirrors training, except that the decoder is fed its own predictions instead of the gold tokens: greedy search picks the most likely token at every step to build the output sequence.

In [42]:
def test_batch(data_loader, model, max_len=15):
    model.eval()
    
    num_correct = 0
    for ex in data_loader:
        x, x_lens, x_mask, y, y_mask, y_in_x_inds = ex 
        
        x, x_lens, x_mask = Variable(x), Variable(x_lens), Variable(x_mask)
    
        encoder_outputs = model.encode(x) # (batch_size, seq_len, hidden_dim*2)
        encoder_proj_outputs = model.enc_to_dec(encoder_outputs) # (batch_size, seq_len, hidden_dim)
        
        ###CHANGE: make use of x_lens to index the last hidden states
        batch_size = x.size(0)
        seq_len = x.size(1)
        h_0 = torch.index_select(encoder_proj_outputs.view(batch_size*seq_len,-1),0,x_lens) # be careful when input sequences have paddings
    
        c_0 = Variable(torch.zeros(h_0.size(0), h_0.size(1)).zero_()) 
        hidden = (h_0, c_0)
        
        ###CHANGE: start with empty prediction
        seq = []
        for i in range(max_len):
            #output = model.decode(hidden) 
            
            ###CHANGE: update the decode function, move the code that uses y[:, i] down
            output, context_t = model.decode(encoder_outputs, encoder_proj_outputs, x_mask, hidden) # with attention
        
            sampleLogprobs, it = torch.max(output.data, 1)
            y_t = it.view(-1).long()
            seq.append(y_t)
            
            if model.copying:
                new_y_t = []
                for j in range(y_t.size(0)):
                    if y_t[j] < model.out_vocab_size:
                        new_y_t.append(y_t[j])
                    else:
                        k = x.data[j, y_t[j]-model.out_vocab_size]
                        w = model.input_vocab.get_word(k)
                        new_k = model.output_vocab.get_index(w)
                        new_y_t.append(new_k)
                y_t = torch.LongTensor(new_y_t)
            
            ###compute the next hidden state using the current output y_t
            y_prev = Variable(y_t)
            y_emb = model.out_embedding(y_prev) # y_emb: (batch_size, embedding_dim)
            hidden = model.decoder(torch.cat([y_emb, context_t], 1), hidden) 
            
            #hidden = model.decoder(y_emb, hidden)
        
        pred_y = torch.cat([_.unsqueeze(1) for _ in seq], 1)
        
        for idx in range(batch_size):
            gold_toks = []
            for wi in y[idx].tolist():
                gold_toks.append(model.output_vocab.get_word(wi))
            print("Gold: ", ' '.join(gold_toks))
        
            pred_toks = []
            for wi in pred_y[idx].tolist():
                #w = model.output_vocab.get_word(wi)
            
                if wi < model.out_vocab_size:
                    w = model.output_vocab.get_word(wi)
                else:
                    w = model.input_vocab.get_word(x.data[idx][wi-model.out_vocab_size])
                    #print("copying ", w)
                    
                pred_toks.append(w)
                
            print("Predict: ",' '.join(pred_toks))
            
            for i in range(len(gold_toks)):
                g_tok = gold_toks[i]
                p_tok = pred_toks[i]
                if (g_tok != p_tok):
                    break
                elif (g_tok == "<end>"):
                    num_correct += 1
                    
                    
    print("Test accuracy: {}".format(num_correct / len(data_loader)))
                
        
# Evaluate the trained model on the test split; test_batch prints every
# gold/predicted pair followed by the overall accuracy.
test_batch(test_loader, model)

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
Gold:  lm( WeatherHistory [ 'Temperature' ] , WeatherHistory [ 'Humidity' ] ) <end>
Predict:  lm( WeatherHistory [ 'Temperature' ] , WeatherHistory [ 'Temperature' ] , WeatherHistory [ 'Temperature' ]
Gold:  lm( WeatherHistory [ 'Temperature' ] , WeatherHistory [ 'Humidity' ] ) <end>
Predict:  lm( WeatherHistory [ 'Temperature' ] , WeatherHistory [ 'Temperature' ] , WeatherHistory [ 'Temperature' ]
Gold:  predict( mod , 12 ) <end>
Predict:  predict( mod , 12 ) <end> <end> , 12 ) <end> <end> , 12 )
Gold:  min( WeatherHistory [ 'Temperature' ] ) <end>
Predict:  max( WeatherHistory [ 'Temperature' ] ) <end> <end> 'Temperature' ] ) <end> <end> <end> )
Gold:  min( WeatherHistory [ 'Temperature' ] ) <end>
Predict:  max( WeatherHistory [ 'Temperature' ] ) <end> <end> 'Temperature' ] ) <end> <end> 'Temperature' ]
Gold:  mean( WeatherHistory [ 'Temperature' ] ) <end>
Predict:  mean( WeatherHistory [ 'Temperature' ] ) <end> <end> 'Temperature' ] ) 