In [67]:
import torch
import torch.nn as nn
import numpy as np
from utils.utils import load_data_set, gen_batch_sequence
from model.wordEmbedding import WordEmbedding

In [2]:
train_query, train_table = load_data_set('train')

Loading train dataset
Loaded 56355 queries and 18585 tables


In [3]:
selected_query_id = [34,56,12,43] #random
ret_tup = gen_batch_sequence(train_query,train_table,selected_query_id, 0, len(selected_query_id))

In [4]:
batch_query = ret_tup[0]
batch_table = ret_tup[1]
gt_quety = ret_tup[4]
print(batch_query)

[['during', 'which', 'years', 'was', 'marcus', 'banks', 'in', 'toronto', '?'], ['what', 'is', 'the', 'canton', 'of', 'grande', 'dixence', '?'], ['what', 'school', 'did', 'player', 'number', '6', 'come', 'from', '?'], ['what', 'time', 'was', 'the', 'highest', 'for', '2nd', 'finishers', '?']]


In [5]:
word_emb = WordEmbedding('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Function for running LSTM 

In [6]:
bert_op, input_lens = word_emb.gen_x_batch(batch_query, batch_table)
x_emb = bert_op.last_hidden_state
x_emb.shape

torch.Size([4, 47, 768])

In [7]:
bert_op.pooler_output.size()

torch.Size([4, 768])

In [8]:
nn.LSTM?

In [9]:
lstm = nn.LSTM(768,hidden_size = 100//2, num_layers = 4,bidirectional=True)

lstm.to("cuda")

LSTM(768, 50, num_layers=4, bidirectional=True)

In [10]:
h_n, (c_fwd, c_rev) = lstm(x_emb)

In [11]:
print(f"h_n: {h_n.shape}\nc_wd: {c_fwd.shape}\nc_rev: {c_rev.shape}")

h_n: torch.Size([4, 47, 100])
c_wd: torch.Size([8, 47, 50])
c_rev: torch.Size([8, 47, 50])


In [12]:
#  in the code for the reference paper they sorted the inputs (acc. to the size)
# TODO: Clean this function: 
def run_lstm(lstm, inp, inp_length, prev_hidden=None):
    '''
    Input: This function takes in 3 arguments 
        lstm : the name of the lstm variable that needs to be run
        inp  : the input in the for [Batch size , num_tok, last_layer]
        inp_length: an array that contains the length of each element in the batch size = batch size
        pre_hidden: hidden layer values of the previous lstm layer
    
    Ouptut: 
        Same as nn.LSTM
    '''
    ret_h, ret_c = lstm(inp,prev_hidden)
    return ret_h, ret_c

## Attention layer

In [49]:
x = torch.tensor([[1,2,3,4,5,6]])
x = x.squeeze()
print(x.shape)


torch.Size([6])


In [44]:
x= torch.tensor([[[1,2,3,4],
                  [5,6,7,8]],
                [[9,10,11,12],
                 [13,14,15,16]],
                [[9,10,11,12],
                 [13,14,15,16]]
                ]).to('cuda')
x.shape

torch.Size([3, 2, 4])

In [35]:
num_hidden = 100

### Scalar attention score

In [13]:
scalar_attention = nn.Linear(num_hidden, 1).to("cuda")

In [54]:
att_val = scalar_attention(h_n)
att_val = att_val.squeeze()
print(att_val.shape)

torch.Size([4, 47])


In [55]:
max_x_len = max(input_lens)
for idx, num in enumerate(input_lens): # reduce the importance of 0 values
    if num < max_x_len:
        att_val[idx,num:] = -100

In [82]:
softmax = nn.Softmax(dim = 1) # Probability distribution for the attention values
att = softmax(att_val)
print(att.shape)

for x in att:
    assert int(x.sum().ceil().tolist()) == 1 

torch.Size([4, 47])


In [140]:
print(h_n.shape)
att_matrix = att.unsqueeze(2).expand_as(h_n)
print(att_matrix.shape)
K_agg = (h_n*att_matrix).sum(1)
print(K_agg.shape)

torch.Size([4, 47, 100])
torch.Size([4, 47, 100])
torch.Size([4, 100])


In [150]:
print(att_matrix[0].shape)

torch.Size([47, 100])


In [83]:
agg_out = nn.Sequential(
            nn.Linear(num_hidden,num_hidden),
            nn.Tanh(),
            nn.Linear(num_hidden, 6)
).to('cuda')

In [126]:
agg_score = agg_out(K_agg)
print(agg_score)

tensor([[-0.0695,  0.1001,  0.1088,  0.0280,  0.0561, -0.0524],
        [-0.0667,  0.0973,  0.1071,  0.0263,  0.0583, -0.0472],
        [-0.0653,  0.0935,  0.1049,  0.0239,  0.0583, -0.0464],
        [-0.0636,  0.0896,  0.1036,  0.0177,  0.0548, -0.0475]],
       device='cuda:0', grad_fn=<AddmmBackward0>)


## Class for aggregation predictor

In [151]:
class AggregationPredictor(nn.Module):
    def __init__(self,input_layer,hidden_size,num_layers,gpu):
        
        self.agg_lstm = nn.LSTM(input_size=input_layer, hidden_size=hidden_size // 2,
                                num_layers=num_layers, batch_first=True,
                                dropout=0.3, bidirectional=True)
        
        self.agg_att = nn.Linear(in_features=hidden_size, out_features=1)
        self.soft_max = nn.softmax()
        self.agg_out = nn.Sequential(
            nn.Linear(input_size = hidden_size, out_features=hidden_size),
            nn.Tanh(),
            nn.Linear(input_size , hidden_size, 6)
        )
        
        if(gpu):
            self.agg_lstm = self.agg_lstm.to('cuda')
            self.agg_att  = self.agg_att.to('cuda')
            self.soft_max = self.soft_max.to('cuda')
            self.agg_out  = self.agg_out.to('cuda')
    
    def forward(x_input, x_len):
        B = x_input.shape[0]
        max_len = max(x_len)
        h_n, _ = run_lstm(agg.lstm,agg.lstm.shape[1]) # [B * longestinput * hidden_size] 
        #calculate the scalar attention score. [scalar, since one value for each input word.]
        att_val = self.agg_att(h_n)  #[B * longest_input * 1]
        att_val = att_val.squeeze()  #[B* longest_input]
        
        for index, l in x_len:
            if(l<max_len):
                att_val = att_val[index][l:] = -100
        
        att_prob_dist = self.soft_max(att_val) #[B * longest_input]
        att_prob_dist = att_prob_dist.unsqueeze(2).expand_as(h_n) #[B * longest_input * hidden_size]
        K_agg = (h_n* att_prob_dist).sum(1)
        agg_score = self.agg_out(K_agg)
        return agg_score
        
        

In [None]:
class Seq2SQL(nn.Module):
    def __init__(self, bert_model_name, hidden_size, num_depth ):
        super(Seq2SQL,self).__init__()
        
        self.gpu = torch.cuda.is_available()
        if(hidden_size&1!=0):
            raise ValueError('hidden size must be even, since this is a bidirectional network')
        self.hidden_size = hidden_size
        
        self.aggregator = AggregationPredictor()
        