In [1]:
import torch
import torch.nn as nn
import numpy as np
from utils.utils import load_data_set, gen_batch_sequence
from model.wordEmbedding import WordEmbedding

from tqdm import tqdm

In [2]:
train_query, train_table = load_data_set('train')

Loading train dataset
Loaded 56355 queries and 18585 tables


In [3]:
selected_query_id = [34,56,12,43] #random
ret_tup = gen_batch_sequence(train_query,train_table,selected_query_id, 0, len(selected_query_id))

In [4]:
batch_query = ret_tup[0]
batch_table = ret_tup[1]
gt_quety = ret_tup[4]
print(batch_query)

[['during', 'which', 'years', 'was', 'marcus', 'banks', 'in', 'toronto', '?'], ['what', 'is', 'the', 'canton', 'of', 'grande', 'dixence', '?'], ['what', 'school', 'did', 'player', 'number', '6', 'come', 'from', '?'], ['what', 'time', 'was', 'the', 'highest', 'for', '2nd', 'finishers', '?']]


In [5]:
word_emb = WordEmbedding('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
print(word_emb.bert_model.config.hidden_size)

768


In [7]:
q_seq, col_seq, col_num, ans_seq, query_seq, ground_truth_cond_seq, raw_data = \
        gen_batch_sequence(train_query, train_table,selected_query_id,0,len(selected_query_id))
print(raw_data)

[('During which years was Marcus Banks in Toronto?', ['Player', 'No.', 'Nationality', 'Position', 'Years in Toronto', 'School/Club Team'], 'SELECT years in toronto FROM table_ WHERE player EQL marcus banks'), ('What is the canton of grande dixence?', ['Name', 'Canton', 'Height (meters)', 'Crest length (meters)', 'Type', 'Year of construction', 'Name of the Lake'], 'SELECT canton FROM table_ WHERE name EQL grande dixence'), ('What school did player number 6 come from?', ['Player', 'No.', 'Nationality', 'Position', 'Years in Toronto', 'School/Club Team'], 'SELECT school/club team FROM table_ WHERE no . EQL 6'), ('What time was the highest for 2nd finishers?', ['Year', 'Tournaments played', 'Cuts made*', 'Wins', '2nd', 'Top 10s', 'Best finish', 'Earnings ($)', 'Money list rank', 'Scoring average', 'Scoring rank'], 'SELECT max ( 2nd ) FROM table_ WHERE')]


## Function for running LSTM 

In [8]:
bert_op, input_lens = word_emb.gen_x_batch(batch_query, batch_table)
x_emb = bert_op.last_hidden_state
x_emb.shape

torch.Size([4, 47, 768])

In [9]:
bert_op.pooler_output.size()

torch.Size([4, 768])

In [10]:
lstm = nn.LSTM(768,hidden_size = 100//2, num_layers = 4,bidirectional=True)

lstm.to("cuda")

LSTM(768, 50, num_layers=4, bidirectional=True)

In [11]:
h_n, (c_fwd, c_rev) = lstm(x_emb)

In [12]:
print(f"h_n: {h_n.shape}\nc_wd: {c_fwd.shape}\nc_rev: {c_rev.shape}")

h_n: torch.Size([4, 47, 100])
c_wd: torch.Size([8, 47, 50])
c_rev: torch.Size([8, 47, 50])


In [13]:
#  in the code for the reference paper they sorted the inputs (acc. to the size)
# TODO: Clean this function: 
def run_lstm(lstm, inp, inp_length, prev_hidden=None):
    '''
    Input: This function takes in 3 arguments 
        lstm : the name of the lstm variable that needs to be run
        inp  : the input in the for [Batch size , num_tok, last_layer]
        inp_length: an array that contains the length of each element in the batch size = batch size
        pre_hidden: hidden layer values of the previous lstm layer
    
    Ouptut: 
        Same as nn.LSTM
    '''
    ret_h, ret_c = lstm(inp,prev_hidden)
    return ret_h, ret_c

## Attention layer

In [14]:
x = torch.tensor([[1,2,3,4,5,6]])
x = x.squeeze()
print(x.shape)


torch.Size([6])


In [15]:
x= torch.tensor([[[1,2,3,4],
                  [5,6,7,8]],
                [[9,10,11,12],
                 [13,14,15,16]],
                [[9,10,11,12],
                 [13,14,15,16]]
                ]).to('cuda')
x.shape

torch.Size([3, 2, 4])

In [16]:
num_hidden = 100

### Scalar attention score

In [17]:
scalar_attention = nn.Linear(num_hidden, 1).to("cuda")

In [18]:
att_val = scalar_attention(h_n)
att_val = att_val.squeeze()
print(att_val.shape)

torch.Size([4, 47])


In [19]:
max_x_len = max(input_lens)
for idx, num in enumerate(input_lens): # reduce the importance of 0 values
    if num < max_x_len:
        att_val[idx,num:] = -100

In [20]:
softmax = nn.Softmax(dim = 1) # Probability distribution for the attention values
att = softmax(att_val)
print(att.shape)

for x in att:
    assert int(x.sum().ceil().tolist()) == 1 

torch.Size([4, 47])


In [21]:
print(h_n.shape)
att_matrix = att.unsqueeze(2).expand_as(h_n)
print(att_matrix.shape)
K_agg = (h_n*att_matrix).sum(1)
print(K_agg.shape)

torch.Size([4, 47, 100])
torch.Size([4, 47, 100])
torch.Size([4, 100])


In [22]:
print(att_matrix[0].shape)

torch.Size([47, 100])


In [23]:
agg_out = nn.Sequential(
            nn.Linear(num_hidden,num_hidden),
            nn.Tanh(),
            nn.Linear(num_hidden, 6)
).to('cuda')

In [24]:
agg_score = agg_out(K_agg)
print(agg_score)

tensor([[ 0.0924, -0.0693, -0.1062, -0.1177,  0.0166, -0.0138],
        [ 0.0932, -0.0712, -0.1023, -0.1223,  0.0198, -0.0156],
        [ 0.0926, -0.0736, -0.0985, -0.1263,  0.0206, -0.0199],
        [ 0.0901, -0.0777, -0.0945, -0.1282,  0.0180, -0.0265]],
       device='cuda:0', grad_fn=<AddmmBackward0>)


## Class for aggregation predictor

In [52]:
class AggregationPredictor(nn.Module):
    def __init__(self,input_layer,hidden_size,num_layers,gpu):
        super(AggregationPredictor,self).__init__()
        self.agg_lstm = nn.LSTM(input_size=input_layer, hidden_size=hidden_size // 2,
                                num_layers=num_layers, batch_first=True,
                                dropout=0.3, bidirectional=True)
        
        self.agg_att = nn.Linear(in_features=hidden_size, out_features=1)
        self.soft_max = nn.softmax()
        self.agg_out = nn.Sequential(
            nn.Linear(input_size = hidden_size, out_features=hidden_size),
            nn.Tanh(),
            nn.Linear(input_size , hidden_size, 6)
        )
        
        if(gpu):
            self.agg_lstm = self.agg_lstm.to('cuda')
            self.agg_att  = self.agg_att.to('cuda')
            self.soft_max = self.soft_max.to('cuda')
            self.agg_out  = self.agg_out.to('cuda')
    
    def forward(x_input, x_len):
        B = x_input.shape[0]
        max_len = max(x_len)
        h_n, _ = run_lstm(agg.lstm,agg.lstm.shape[1]) # [B * longestinput * hidden_size] 
        
        #calculate the scalar attention score. [scalar, since one value for each input word.]
        att_val = self.agg_att(h_n)  #[B * longest_input * 1]
        att_val = att_val.squeeze()  #[B* longest_input]
        
        for index, l in x_len:
            if(l<max_len):
                att_val = att_val[index][l:] = -100
        
        att_prob_dist = self.soft_max(att_val) #[B * longest_input]
        att_prob_dist = att_prob_dist.unsqueeze(2).expand_as(h_n) #[B * longest_input * hidden_size]
        K_agg = (h_n* att_prob_dist).sum(1)
        agg_score = self.agg_out(K_agg)
        return agg_score
        
        

In [50]:
class Seq2SQL(nn.Module):
    def __init__(self, bert_model_name, hidden_size, num_depth ):
        super(Seq2SQL,self).__init__()
        
        self.gpu = torch.cuda.is_available()
        if(hidden_size&1!=0):
            raise ValueError('hidden size must be even, since this is a bidirectional network')
        self.hidden_size = hidden_size
        
        self.word_emb   = WordEmbedding(bert_model_name)
        self.word_emb_size = self.word_emb.bert_model.config.hidden_size
        self.aggregator = AggregationPredictor(input_layer=self.word_emb_size, 
                                              hidden_size=hidden_size,num_layers = num_depth,gpu=True)
        self.CE = nn.CrossEntropyLoss()
        if(torch.cuda.is_available):
            self.to('cuda')
        
    def forward(self,queries, col):
        x_embed, x_lengths = self.word_emb.gen_x_batch(q_batch=queries,col_batch=col)
        
        agg_score = self.word_emb(x_embed, x_lengths)
        
        return (agg_score,)
    
    def loss(self, score):
        agg_score = score[0]
        loss = 0
        
        loss += self.CE(agg_score)
        return loss   
    
    def gen_query(self, score,query_batch, col_batch, raw_query, raw_col):
        agg_score = score
        B= len(query_batch)
        for b in range(B):
            print(score)
            

In [40]:
train_queries, train_tables = load_data_set('train')
val_queries, val_tables = load_data_set('dev')

Loading train dataset
Loaded 56355 queries and 18585 tables
Loading dev dataset
Loaded 8421 queries and 2716 tables


In [41]:
def epoch_train(model, optimizer,batch_size, sql_queries, table_data):
    model.train()
    num_queries = len(sql_queries)
    perm = np.random.permutation(num_queries)
    cumulative_loss = 0.0
    start = 0

    while start< num_queries:
        end = start + batch_size if start + batch_size < len(perm) else len(perm)

        q_seq, col_seq, col_num, ans_seq, query_seq, ground_truth_cond_seq, raw_data = \
            generate_batch_sequence(sql_data, table_data, perm, start, end)
        
        score = model(q_seq, col_seq)
        loss = model.loss(score)
        
        optimizer.zero_grad()
        loss.backward()
        
        optimizer.step()
        
        start = end
    return loss

In [42]:
def generate_batch_query(sql_data, idxes, start, end):
    query_gt = []
    table_ids = []
    for i in range(start, end):
        query_gt.append(sql_data[idxes[i]]['sql'])
        table_ids.append(sql_data[idxes[i]]['table_id'])
    return query_gt, table_ids

In [43]:
print(ans_seq[0])

(0, 4, 1, (0,), (0,))


In [44]:
def generate_batch_query(sql_data, idx, start, end):
    query_gt = []
    table_id = []
    for i in range(start,end):
        query_gt.append(sql_data[idx[i]]['sql'])
        table_id.append(sql_data[idx[i]]['table_id'])
    return query_gt, table_id

In [45]:
def epoch_acc(model, batch_size, sql_data, table_data, save_results = False):
    model.eval()
    perm = list(range(len(sql_data)))
    start = 0
    one_acc_num = 0.0
    while start < len(sql_data):
        end = start + batch_size if start + batch_size < len(perm) else len(perm)

        q_seq, col_seq, col_num, ans_seq, query_seq, ground_truth_cond_seq, raw_data =\
            generate_batch_sequence(sql_data, table_data, perm, start, end)
        
        raw_q_seq = [x[0] for x in raw_data]
        raw_col_seq = [x[1] for x in raw_data]
        
        query_gt, table_ids = generate_batch_query(sql_data, perm, start, end)
        ground_truth_sel_seq = [x[1] for x in ans_seq]
        
        score = model.forward(q_seq, col_seq, col_num, ground_truth_sel=ground_truth_sel_seq)
        pred_queries = model.gen_query(score, q_seq, col_seq,
                                       raw_q_seq, raw_col_seq)
        one_err, tot_err = model.check_accuracy(pred_queries, query_gt)
        
#         if save_results:
#             model.save_readable_results(pred_queries, query_gt, table_ids, table_data)

        one_acc_num += (end - start - one_err)
        tot_acc_num += (end - start - tot_err)

        start = end
    return tot_acc_num / len(sql_data), one_acc_num / len(sql_data)

In [53]:
LEARNING_RATE = 1e-5
TRAINING_EPOCHS = 100
BATCH_SIZE = 64

model = Seq2SQL('bert-base-uncased',100,2)
optimizer = torch.optim.Adam(model.parameters(),lr=LEARNING_RATE)

epoch_loses = []
i = 0;
#for i in range(TRAINING_EPOCHS):
print(f"Epoch {i+1}")

epoch_loss = epoch_train(model, optimizer, BATCH_SIZE, train_queries, train_tables)
epoch_loses.append(epoch_loss)

print(f"Loss : {epoch_loss}")

training_accuracy = epoch_acc(model, BATCH_SIZE, sql_data, table_data)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AttributeError: module 'torch.nn' has no attribute 'softmax'