In [12]:
from datetime import datetime
import os
from types import GeneratorType

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn

from model.word_embedding import WordEmbedding
from util.utils import load_data_set, gen_batch_sequence, run_lstm, col_name_encode, create_toy_dataset

## Class for Selection (SELECT Column) Prediction

In [2]:
class SelectionPredictor(nn.Module):
    """Scores each table column as the candidate column of the SELECT clause.

    The question tokens are encoded with a bidirectional LSTM and pooled into a
    single vector by a learned attention; that vector is combined with every
    encoded column name and projected to one scalar score per column.
    """

    def __init__(self, input_layer, hidden_size, num_layers, max_tok_num, gpu):
        """Build the encoders and projection layers.

        Args:
            input_layer: feature size of the token embeddings fed to both LSTMs.
            hidden_size: total (both directions) LSTM output size; each
                direction uses ``hidden_size // 2`` units.
            num_layers: number of stacked layers in both LSTMs.
            max_tok_num: stored on the instance; not used by the code in this class.
            gpu: when truthy, every sub-module is moved to ``'cuda'``.
        """
        super(SelectionPredictor, self).__init__()
        self.max_tok_num = max_tok_num

        # Question encoder.
        self.sel_lstm = nn.LSTM(input_size=input_layer, hidden_size=hidden_size // 2,
                                num_layers=num_layers, batch_first=True,
                                dropout=0.3, bidirectional=True)
        # One attention logit per question token.
        self.sel_att = nn.Linear(in_features=hidden_size, out_features=1)
        # Column-name encoder (handed to `col_name_encode`).
        self.sel_col_name_enc = nn.LSTM(input_size=input_layer, hidden_size=hidden_size // 2,
                                        num_layers=num_layers, batch_first=True,
                                        dropout=0.3, bidirectional=True)
        self.sel_out_K = nn.Linear(hidden_size, hidden_size)
        self.sel_out_col = nn.Linear(hidden_size, hidden_size)
        self.sel_out = nn.Sequential(nn.Tanh(), nn.Linear(hidden_size, 1))
        # Explicit dim: the attention logits are (batch, tokens) and must be
        # normalised over tokens. The implicit-dim form is deprecated.
        self.softmax = nn.Softmax(dim=1)

        if gpu:
            # Moves every registered sub-module in one call.
            self.to('cuda')

    def forward(self, x_input, x_len, col_inp_var, col_name_len, col_len, col_num):
        """Return a ``(batch, max_columns)`` tensor of column scores.

        Args:
            x_input: embedded question tokens, forwarded to `run_lstm`.
            x_len: per-example question lengths (sequence of ints).
            col_inp_var, col_name_len, col_len: forwarded unchanged to
                `col_name_encode`.
            col_num: per-example number of real columns (sequence of ints).

        Positions beyond an example's question length / column count are
        filled with -100 so they get (almost) no probability mass downstream.
        """
        max_x_len = max(x_len)

        # NOTE(review): e_col is assumed to be (batch, max_columns, hidden_size) - confirm
        # against util.utils.col_name_encode.
        e_col, _ = col_name_encode(col_inp_var, col_name_len, col_len, self.sel_col_name_enc)
        h_enc, _ = run_lstm(self.sel_lstm, x_input, x_len)

        # Drop only the trailing size-1 feature axis. A bare squeeze() would
        # also drop the batch axis when the batch holds a single example.
        att_val = self.sel_att(h_enc).squeeze(2)

        # Mask attention logits of padding tokens.
        for idx, num in enumerate(x_len):
            if num < max_x_len:
                att_val[idx, num:] = -100

        att = self.softmax(att_val)
        # Attention-weighted sum of the token encodings -> one vector per question.
        K_sel = (h_enc * att.unsqueeze(2).expand_as(h_enc)).sum(1)
        K_sel_expand = K_sel.unsqueeze(1)

        # Broadcast the question vector over all columns; again squeeze only
        # the last axis so single-column / single-example batches keep rank 2.
        sel_score = self.sel_out(
            self.sel_out_K(K_sel_expand) + self.sel_out_col(e_col)
        ).squeeze(2)

        # Mask scores of padding columns.
        max_col_num = max(col_num)
        for idx, num in enumerate(col_num):
            if num < max_col_num:
                sel_score[idx, num:] = -100

        return sel_score

In [3]:
class Seq2SQL(nn.Module):
    """Question-to-SQL model; this version only predicts the SELECT column."""

    def __init__(self, bert_model_name, hidden_size, num_depth):
        """Create the embedding layer and the selection predictor.

        Args:
            bert_model_name: name handed to `WordEmbedding`.
            hidden_size: total hidden size of the bidirectional encoders; must be even.
            num_depth: number of stacked LSTM layers in the predictor.

        Raises:
            ValueError: if `hidden_size` is odd.
        """
        super(Seq2SQL, self).__init__()

        self.gpu = torch.cuda.is_available()
        if hidden_size % 2 != 0:
            raise ValueError('hidden size must be even, since this is a bidirectional network')
        self.hidden_size = hidden_size

        self.word_emb = WordEmbedding(bert_model_name)
        self.word_emb_size = self.word_emb.bert_model.config.hidden_size
        # Pass the real CUDA availability instead of a hard-coded True so the
        # model can also be built on CPU-only machines.
        self.selection = SelectionPredictor(input_layer=self.word_emb_size,
                                            hidden_size=hidden_size, num_layers=num_depth,
                                            max_tok_num=200, gpu=self.gpu)
        self.CE = nn.CrossEntropyLoss()
        # `torch.cuda.is_available` without the call is a function object and
        # therefore always truthy; use the evaluated flag.
        if self.gpu:
            self.to('cuda')

    def forward(self, queries, col, col_num):
        """Embed questions and column names and score every column.

        Returns:
            A 1-tuple ``(sel_score,)`` with the selection predictor's output.
        """
        x_embed, x_lengths = self.word_emb.gen_x_batch(q_batch=queries, col_batch=col)
        col_inp_var, col_name_len, col_len = self.word_emb.gen_col_batch(col)

        sel_score = self.selection(x_embed.last_hidden_state, x_lengths,
                                   col_inp_var, col_name_len, col_len, col_num)

        return (sel_score,)

    def loss(self, score, truth_num):
        """Cross-entropy between the column scores and the ground-truth column.

        Args:
            score: tuple as returned by `forward`; only ``score[0]`` is used.
            truth_num: sequence whose items carry the target class at index 1
                (presumably the gold SELECT column index - verify against the
                batch generator).
        """
        sel_score = score[0]

        sel_truth = [truth[1] for truth in truth_num]
        # Put the targets on the same device as the logits rather than
        # assuming CUDA from a flag.
        sel_truth_var = torch.from_numpy(np.array(sel_truth)).long().to(sel_score.device)

        return self.CE(sel_score, sel_truth_var)

    def gen_query(self, score, query_batch, col_batch, raw_query, raw_col):
        """Turn scores into predictions: one ``{'sel': argmax}`` dict per example.

        Only `score` is used; the remaining parameters are accepted for
        interface compatibility.
        """
        sel_score = score[0]
        sel_pred = np.argmax(sel_score.data.cpu().numpy(), axis=1)
        return [{'sel': pred} for pred in sel_pred]

    def check_accuracy(self, pred_queries, ground_truth_queries):
        """Count wrong predictions.

        Returns:
            ``(np.array(sel_err), tot_err)`` - the number of examples whose
            ``'sel'`` differs from the ground truth, and the number of examples
            with at least one wrong component (identical here because ``'sel'``
            is the only component checked).
        """
        tot_err = sel_err = 0
        for pred_qry, ground_truth_qry in zip(pred_queries, ground_truth_queries):
            if pred_qry['sel'] != ground_truth_qry['sel']:
                sel_err += 1
                tot_err += 1

        return np.array(sel_err), tot_err

In [14]:
# Number of training queries kept in the reduced ("toy") training set.
TOY_DATASET_SIZE = 5000

# Load the full training and dev splits.
train_queries, train_tables = load_data_set('train')
val_queries, val_tables = load_data_set('dev')

# Shrink the training split so that an epoch finishes quickly.
train_queries, train_tables = create_toy_dataset(
    train_queries, train_tables, TOY_DATASET_SIZE
)

Loading train dataset
Loaded 56355 queries and 18585 tables
Loading dev dataset
Loaded 8421 queries and 2716 tables


## Function for Training the Model

In [5]:
def epoch_train(model, optimizer, batch_size, sql_queries, table_data):
    """Run one training epoch over a random permutation of `sql_queries`.

    Args:
        model: object exposing ``train()``, ``forward(q_seq, col_seq, col_num)``
            and ``loss(score, ans_seq)``.
        optimizer: optimizer exposing ``zero_grad()`` and ``step()``.
        batch_size: number of queries per optimisation step; must be positive.
        sql_queries: training queries (anything `gen_batch_sequence` accepts).
        table_data: table metadata forwarded to `gen_batch_sequence`.

    Returns:
        A detached 0-d tensor holding the mean per-query loss of the epoch
        (each batch loss weighted by its batch size). An empty dataset yields 0.

    Raises:
        ValueError: if `batch_size` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be positive')

    model.train()
    num_queries = len(sql_queries)
    if num_queries == 0:
        # Nothing to train on; avoid returning an unbound loss.
        return torch.tensor(0.0)

    perm = np.random.permutation(num_queries)
    cumulative_loss = 0.0

    for start in range(0, num_queries, batch_size):
        end = min(start + batch_size, num_queries)

        q_seq, col_seq, col_num, ans_seq, query_seq, ground_truth_cond_seq, raw_data = \
            gen_batch_sequence(sql_queries, table_data, perm, start, end)

        score = model.forward(q_seq, col_seq, col_num)
        loss = model.loss(score, ans_seq)
        # Weight by batch size so the short final batch is not over-represented.
        cumulative_loss += loss.item() * (end - start)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Return the epoch average rather than only the last batch's loss. It is
    # kept as a tensor so callers that use `.data.cpu()` continue to work.
    return torch.tensor(cumulative_loss / num_queries)

In [6]:
def generate_batch_query(sql_data, idx, start, end):
    """Collect ground-truth SQL and table ids for positions ``start..end-1``.

    `idx` maps batch positions to indices into `sql_data`.

    Returns:
        ``(query_gt, table_id)`` - two lists holding the ``'sql'`` and
        ``'table_id'`` entries of the selected records, in batch order.
    """
    selected = [sql_data[idx[pos]] for pos in range(start, end)]
    query_gt = [record['sql'] for record in selected]
    table_id = [record['table_id'] for record in selected]
    return query_gt, table_id

## Function for Finding Accuracy of Model

In [7]:
def epoch_acc(model, batch_size, sql_data, table_data, save_results=False):
    """Evaluate `model` on `sql_data` and return its accuracies.

    Args:
        model: object exposing ``eval()``, ``forward``, ``gen_query`` and
            ``check_accuracy``.
        batch_size: number of queries evaluated per forward pass; must be positive.
        sql_data: queries to evaluate; each record needs ``'sql'`` and ``'table_id'``.
        table_data: table metadata forwarded to `gen_batch_sequence`.
        save_results: accepted for interface compatibility; currently unused.

    Returns:
        ``(total_accuracy, selection_accuracy)`` as fractions of `sql_data`.
        An empty dataset yields ``(0.0, 0.0)``.

    Raises:
        ValueError: if `batch_size` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be positive')

    model.eval()
    num_queries = len(sql_data)
    if num_queries == 0:
        # Avoid dividing by zero below.
        return 0.0, 0.0

    perm = list(range(num_queries))
    one_acc_num = 0.0
    tot_acc_num = 0.0

    # Evaluation needs no gradients; disabling autograd saves memory and time.
    with torch.no_grad():
        start = 0
        while start < num_queries:
            end = min(start + batch_size, num_queries)

            q_seq, col_seq, col_num, ans_seq, query_seq, ground_truth_cond_seq, raw_data = \
                gen_batch_sequence(sql_data, table_data, perm, start, end)

            raw_q_seq = [x[0] for x in raw_data]
            raw_col_seq = [x[1] for x in raw_data]

            query_gt, table_ids = generate_batch_query(sql_data, perm, start, end)

            score = model.forward(q_seq, col_seq, col_num)
            pred_queries = model.gen_query(score, q_seq, col_seq,
                                           raw_q_seq, raw_col_seq)
            one_err, tot_err = model.check_accuracy(pred_queries, query_gt)

            # Correct count of this batch = batch size minus errors.
            one_acc_num += (end - start - one_err)
            tot_acc_num += (end - start - tot_err)

            start = end

    return tot_acc_num / num_queries, one_acc_num / num_queries

## Training the Model

In [10]:
# Hyper-parameters of the training run.
LEARNING_RATE = 1e-6
TRAINING_EPOCHS = 20
BATCH_SIZE = 128

# BERT-backed model with a 100-unit bidirectional encoder, two layers deep.
model = Seq2SQL(bert_model_name='bert-base-uncased', hidden_size=100, num_depth=2)
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Per-epoch history, consumed by the plotting cells further down.
epoch_loses = []
training_accuracies = []

for epoch in range(TRAINING_EPOCHS):
    print(f"Epoch {epoch+1}")

    # One optimisation pass over the training set.
    epoch_loss = epoch_train(model, optimizer, BATCH_SIZE, train_queries, train_tables)
    epoch_loses.append(epoch_loss)
    print(f"Loss : {epoch_loss}")

    # Accuracy on the same training data after the pass.
    training_accuracy = epoch_acc(model, BATCH_SIZE, train_queries, train_tables)
    training_accuracies.append(training_accuracy)
    print(training_accuracy)

In [None]:
def plot_curve(x_item, y_item, item_name, dataLength, format='png'):
    """Plot `y_item` against `x_item`, save the figure under ./Graphs and show it.

    Args:
        x_item: x values. An int ``n`` is expanded to ``0..n-1``; a generator
            is materialised into a list; anything else is passed to
            matplotlib unchanged.
        y_item: y values (one per epoch). A generator is materialised so its
            length can be used in the file name.
        item_name: label of the y axis, also part of the file name.
        dataLength: number of training queries, only used in the file name.
        format: image format handed to ``plt.savefig`` and used as extension.

    Returns:
        None.
    """
    if isinstance(x_item, GeneratorType):
        x_item = list(x_item)
    elif isinstance(x_item, int):
        x_item = list(range(x_item))
    if isinstance(y_item, GeneratorType):
        # len() below needs a sized container.
        y_item = list(y_item)

    plt.plot(x_item, y_item)
    plt.xlabel("EPOCHS")
    plt.ylabel(item_name)

    # A timestamp in the file name keeps successive runs from overwriting each other.
    day_time_str = datetime.now().strftime("%H%M%S-%d%m%Y")

    # savefig does not create missing directories.
    os.makedirs('./Graphs', exist_ok=True)
    plt.savefig(f'./Graphs/{day_time_str}_{len(y_item)}EP_{dataLength}trainingqueries_{item_name}.{format}',dpi=300, format=format)
    plt.show()
    return

In [None]:
# Convert the recorded loss tensors into plain Python numbers.
lossList = [loss_tensor.data.cpu().numpy().tolist() for loss_tensor in epoch_loses]
# Keep only the first component of each (total, selection) accuracy pair.
accuracy = [total_acc for total_acc, _ in training_accuracies]

plot_curve(TRAINING_EPOCHS, lossList, "Loss", len(train_queries), format='svg')
plot_curve(range(TRAINING_EPOCHS), accuracy, "Accuracy", len(train_queries), format='svg')