In [1]:
import pandas as pd
import string

In [2]:
chars = [i for i in string.ascii_uppercase + string.ascii_lowercase + string.digits + string.punctuation + ' ' + '\t' + '\n']

In [3]:
len(chars)

97

In [4]:
chars[:5]

['A', 'B', 'C', 'D', 'E']

In [5]:
chars_pos = lambda c: chars.index(c)

In [6]:
samples = pd.read_csv('data/dataset_random.csv', header=None)

In [7]:
num_samples = len(samples)

In [8]:
max_len_ptbr = len(max(samples[1][1:], key=len))
max_len_sql = len(max(samples[1][1:], key=len))

In [9]:
ptbr = samples[1][1:].tolist()
sql = samples[2][1:].tolist()

In [10]:
import numpy as np

In [11]:
ptbr_tokenized = np.zeros((num_samples, max_len_ptbr, len(chars)))
sql_tokenized = np.zeros((num_samples, max_len_sql, len(chars)))
target = np.zeros((num_samples, max_len_sql, len(chars)))

In [12]:
for i in range(len(samples)-1):
    for j, ch in enumerate(ptbr[i]):
        ptbr_tokenized[i, j, chars_pos(ch)] = 1
    
    for j, ch in enumerate(sql[i]):
        sql_tokenized[i, j, chars_pos(ch)] = 1
        
        if j > 0:
            target[i, j-1, chars_pos(ch)] = 1

In [13]:
import keras

from keras.layers import Input, LSTM, Dense
from keras.models import Model

Using TensorFlow backend.


In [14]:
e_input = Input((None, len(chars)))
e_lstm = LSTM(256, return_state=True)
e_output, h, c = e_lstm(e_input)
e_states = [h, c]
d_input = Input((None, len(chars)))
d_lstm = LSTM(256, return_sequences=True, return_state=True)
d_output, _, _ = d_lstm(d_input, initial_state=e_states)
d_dense = Dense(len(chars), activation='softmax')
d_output = d_dense(d_output)

In [15]:
model = Model(inputs=[e_input, d_input], outputs=[d_output])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [16]:
model.fit(x=[ptbr_tokenized, sql_tokenized], y=target, batch_size=64, epochs=50, validation_split=0.2)

Train on 19200 samples, validate on 4801 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f0d96680940>

In [17]:
model.save('models/ptbr2sql_without_attention.h5')

  '. They will not be included '


In [18]:
e_inference_model = Model(e_input, e_states)

In [19]:
d_state_h = Input((256,))
d_state_c = Input((256,))
d_input_states = [d_state_h, d_state_c]

In [20]:
d_output, d_h, d_c = d_lstm(d_input, initial_state=d_input_states)
d_states = [d_h, d_c]
d_output = d_dense(d_output)

In [21]:
d_inference_model = Model(inputs=[d_input] + d_input_states, outputs=[d_output] + d_states)

In [22]:
def decode(s):
    states = e_inference_model.predict(s)
    
    target = np.zeros((1, 1, len(chars)))
    target[0, 0, chars_pos('\t')] = 1
    
    sql = ''
    stop = False
    
    while not stop:
        d_out, d_h, d_c = d_inference_model.predict(x=[target] + states)
        
        max_index = np.argmax(d_out[0, -1, :])
        sampled = chars[max_index]
        sql += str(sampled)
        
        if(sampled == '\n' or (len(sql) > max_len_sql)):
            stop = True
            
        target = np.zeros((1, 1, len(chars)))
        target[0, 0, max_index] = 1
        
        states = [d_h, d_c]
    
    return sql

In [24]:
for t in range(0, 20):
    s = ptbr_tokenized[t]
    s = np.expand_dims(s, axis=0)
    
    print(s.shape)
    
    translated = decode(s)
    
    print("Input:", ptbr[t])
    print("Output:", translated)

(1, 100, 97)
Input: traga tudo da tabela x1v3NLJGBFq1F6k
Output: select * from DzgggQ3GQlhhcZ

(1, 100, 97)
Input: traga tudo da tabela 1fnR
Output: select * from fU3l

(1, 100, 97)
Input: traga tudo da tabela EakxSAl
Output: select * from USUYlx

(1, 100, 97)
Input: traga tudo da tabela Hlt11CGmim
Output: select * from kptQm3Q3xa

(1, 100, 97)
Input: traga tudo da tabela RpKjOXdT9ln20q
Output: select * from krzJtg1Ohfx5Mx

(1, 100, 97)
Input: traga tudo da tabela cLZ71zvw
Output: select * from 4oEHvm3aa

(1, 100, 97)
Input: traga tudo da tabela YRFGeqn7ESC9Ue6
Output: select * from YFgggggggMNFnN

(1, 100, 97)
Input: traga tudo da tabela 1oOlRWvQxjhM
Output: select * from 4Or7Xmz8otA3

(1, 100, 97)
Input: traga tudo da tabela 7tnWRpC3no
Output: select * from YFgggglNB

(1, 100, 97)
Input: traga tudo da tabela o9C9ui
Output: select * from DFji7d

(1, 100, 97)
Input: traga tudo da tabela o
Output: select * from o

(1, 100, 97)
Input: traga tudo da tabela yuhKGT9
Output: select * from kK