In [None]:
import re, itertools
from collections import Counter
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from transformers import BertTokenizer, BertModel
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
df_train = pd.read_csv('train.csv')
df_train.head()

In [None]:
ques_len = []
sql_len = []

# populate the lists with sentence lengths
for i in df_train['question']:
      ques_len.append(len(i.split()))  

for i in df_train['sql']:
      sql_len.append(len(i.split()))

length_df = pd.DataFrame({'question':ques_len, 'sql':sql_len})

length_df.hist(bins = 20)
plt.show()

In [4]:


import unicodedata

def unicode_to_ascii(s):
    normalized = unicodedata.normalize('NFD', s)
    return ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
     
def preprocess_text(text):
    text = unicode_to_ascii(text.lower().strip())
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
    text = re.sub(r"([?.!,¿])", r" \1 ", text)
    text = re.sub(r'[" "]+', " ", text)
    text = text.rstrip().strip()
    text = '<sos> ' + text + ' <eos>'

    return text

In [5]:
x = df_train.sql
y = df_train.question

In [6]:
print('Original sentence:',x[42])
x = [preprocess_text(w) for w in x]
y = [preprocess_text(w) for w in y]
print('Preprocessed sentence:',x[42])

Original sentence: SELECT Scoring average FROM table WHERE Money list rank = n/a
Preprocessed sentence: <sos> select scoring average from table where money list rank n a <eos>


In [7]:
x = pd.DataFrame(x,columns=['question'])
y = pd.DataFrame(y,columns=['sql'])

In [8]:
df_new = pd.concat([x,y],axis=1)

In [9]:
df_new.head()

Unnamed: 0,question,sql
0,<sos> select notes from table where current sl...,<sos> tell me what the notes are for south aus...
1,<sos> select current series from table where n...,<sos> what is the current series where the new...
2,<sos> select format from table where state ter...,<sos> what is the format for south australia ?...
3,<sos> select text background colour from table...,<sos> name the background colour for the austr...
4,<sos> select count fleet series quantity from ...,<sos> how many times is the fuel propulsion is...


In [10]:
def tokenize(sentences): 
    lang_tokenizer = Tokenizer( filters='')
    lang_tokenizer.fit_on_texts(sentences)
    sequences = lang_tokenizer.texts_to_sequences(sentences)
    max_length = max(len(s) for s in sequences)
    sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

    return sequences, lang_tokenizer, max_length

In [11]:

def load_sequences(df, size=None):
    x = df.question
    y = df.sql

    x,src_lang_tokenizer,max_length_src = tokenize(x)
    y,tgt_lang_tokenizer,max_length_trg = tokenize(y)

    return x, y, src_lang_tokenizer, tgt_lang_tokenizer, max_length_src, max_length_trg


In [12]:
x, y, src_lang_tokenizer, tgt_lang_tokenizer, max_length_src, max_length_trg = load_sequences(df_new)
print("src sequences:",x.shape)
print("tgt sequences:",y.shape)
print("source maxlen:",max_length_src)
print("target maxlen:",max_length_trg)

src sequences: (56355, 62)
tgt sequences: (56355, 50)
source maxlen: 62
target maxlen: 50


In [13]:
print("Original sentence:", df_train.question[42])
print("Text after preprocessing:", preprocess_text(df_train.question[42]))
print("Text after tokenization :", x[42])

Original sentence: When the money list rank was n/a, what was the scoring average?
Text after preprocessing: <sos> when the money list rank was n a , what was the scoring average ? <eos>
Text after tokenization : [   4    3 1860  109    1    2    6  178  472   24  166   50    5    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]


In [14]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')

# def get_bert_embed(data):
#     embedding = tokenizer.batch_encode_plus(data,
#                 padding=True,              # Pad to the maximum sequence length
#                 truncation=True,           # Truncate to the maximum sequence length if necessary
#                 # return_tensors='pt',      # Return PyTorch tensors
#                 add_special_tokens=True    # Add special tokens CLS and SEP
#                 )
#     return embedding

# df_new.loc[:,'Encoded_Question_input_ids'] = pd.Series(get_bert_embed(df_new['question'])['input_ids'])
# df_new.loc[:,'Encoded_query_input_ids'] = pd.Series(get_bert_embed(df_new['sql'])['input_ids'])


# training_tensor_X = torch.tensor(df_new.Encoded_Question_input_ids)
# training_tensor_y = torch.tensor(df_new.Encoded_query_input_ids)

In [15]:
# training_tensor_y

In [16]:
# df_new.question[0]

In [17]:
# encoded_df = df_new.iloc[:,2:]
# # encoded_df['Encoded_query_input_ids'][0]
# encoded_df

In [18]:
# tokenizer.decode(encoded_df.Encoded_Question_input_ids[56354], skip_special_tokens=True)

In [19]:
def convert(lang, tensor):
  for t in tensor:
    if t!=0:
      print ("%d ----> %s" % (t, lang.index_word[t]))

print ("Input Language; index to word mapping")
convert(src_lang_tokenizer, x[42])
print ()
print ("Target Language; index to word mapping")
convert(tgt_lang_tokenizer, y[42])

Input Language; index to word mapping
4 ----> <sos>
3 ----> select
1860 ----> scoring
109 ----> average
1 ----> from
2 ----> table
6 ----> where
178 ----> money
472 ----> list
24 ----> rank
166 ----> n
50 ----> a
5 ----> <eos>

Target Language; index to word mapping
2 ----> <sos>
13 ----> when
1 ----> the
264 ----> money
276 ----> list
51 ----> rank
10 ----> was
218 ----> n
9 ----> a
8 ----> ,
5 ----> what
10 ----> was
1 ----> the
1026 ----> scoring
47 ----> average
4 ----> ?
3 ----> <eos>


In [20]:

src_vocab_size = len(src_lang_tokenizer.word_index)+1 
tgt_vocab_size = len(tgt_lang_tokenizer.word_index)+1 
print(src_vocab_size)
print(tgt_vocab_size)

23971
25515


In [21]:
X_train,X_test,y_train,y_test = train_test_split(x, y, shuffle=False, test_size=0.2)
print(f"{len(X_train)=},{len(X_test)=},{len(y_train)=},{len(y_test)=}")

len(X_train)=45084,len(X_test)=11271,len(y_train)=45084,len(y_test)=11271


In [22]:
print(X_train[:1])
print()
print(X_test[:1])
print()
print(y_train[:1])
print()
print(y_test[:1])

[[    4     3   192     1     2     6   305 11119   142   339     5     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0]]

[[    4     3    15  7383  7384     1     2     6   864    94     7    18
  21209 21210     5     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0]]

[[  2  91  87   5   1 288  43  12 180 401   3   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]

[[    2    19  77

In [23]:
#Defining hyperparameters
buffer_size=len(X_train)
val_buffer_size = len(X_test)
BATCH_SIZE = 64
embedding_dim = 128
units = 1024 
steps_per_epoch = buffer_size//BATCH_SIZE
val_steps_per_epoch = val_buffer_size//BATCH_SIZE

In [24]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))

train_dataset = train_dataset.shuffle(buffer_size=buffer_size).batch(BATCH_SIZE)

val_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

val_dataset = val_dataset.batch(BATCH_SIZE)

In [25]:
example_input_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 62]), TensorShape([64, 50]))

In [26]:
class Encoder(tf.keras.Model):

    def __init__(self, vocab_size, emb_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.enc_units = enc_units 
        self.batch_sz = batch_sz 
        self.embedding = tf.keras.layers.Embedding(vocab_size, emb_dim,mask_zero=True)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform') 

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state 

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [27]:
encoder = Encoder(src_vocab_size, embedding_dim, units, BATCH_SIZE) 

sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 62, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [28]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units) # fully-connected dense layer-1
    self.W2 = tf.keras.layers.Dense(units) # fully-connected dense layer-2
    self.V = tf.keras.layers.Dense(1) # fully-connected dense layer-3

  def call(self, query, values):
   
    query_with_time_axis = tf.expand_dims(query, 1)
    score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))   
    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [29]:
attention_layer = BahdanauAttention(20) 
attention_result, attention_weights = attention_layer(sample_hidden, sample_output) 

print("Attention result shape (context vector): (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape (context vector): (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 62, 1)


In [30]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, emb_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz 
    self.dec_units = dec_units 
    self.attention = BahdanauAttention(self.dec_units)
    
    self.embedding = tf.keras.layers.Embedding(vocab_size, emb_dim) 
    
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform') 
    self.fc = tf.keras.layers.Dense(vocab_size)

  def call(self, x, hidden, enc_output):
    context_vector, attention_weights = self.attention(hidden, enc_output)
    x = self.embedding(x)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
    output, state = self.gru(x)
    output = tf.reshape(output, (-1, output.shape[2]))
    x = self.fc(output)
    return x, state , attention_weights

In [31]:
decoder = Decoder(tgt_vocab_size, embedding_dim, units, BATCH_SIZE)
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 25515)


In [32]:
import os
optimizer = tf.keras.optimizers.Adam() 

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')  



def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))  
  loss_ = loss_object(real, pred)  

  mask = tf.cast(mask, dtype=loss_.dtype) 
  loss_ *= mask

  return tf.reduce_mean(loss_)


checkpoint_dir = './training_checkpoints'  
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")  
checkpoint = tf.train.Checkpoint(optimizer=optimizer,  
                                 encoder=encoder,
                                 decoder=decoder)

In [33]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden) 

    dec_hidden = enc_hidden 

    dec_input = tf.expand_dims([tgt_lang_tokenizer.word_index['<sos>']] * BATCH_SIZE, 1) 

    for t in range(1, targ.shape[1]):
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output) 

      loss += loss_function(targ[:, t], predictions) 

      dec_input = tf.expand_dims(targ[:, t], 1) 

  batch_loss = (loss / int(targ.shape[1])) 

  variables = encoder.trainable_variables + decoder.trainable_variables 

  gradients = tape.gradient(loss, variables) 
  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [34]:
@tf.function
def val_step(inp, targ, enc_hidden):
    loss = 0 
    enc_output, enc_hidden = encoder(inp, enc_hidden) 
    dec_hidden = enc_hidden
    dec_input =  tf.expand_dims([tgt_lang_tokenizer.word_index['<sos>']] * BATCH_SIZE, 1)

    for t in range(1, targ.shape[1]): 
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output) 
        loss += loss_function(targ[:, t], predictions) 
        dec_input = tf.expand_dims(targ[:, t], 1) 

    batch_loss = (loss / int(targ.shape[1])) 

    return batch_loss 

In [35]:
import time

def train_and_validate(train_dataset, val_dataset, EPOCHS=10):
    for epoch in range(EPOCHS):
        start = time.time()

        #Step1: 
        enc_hidden = encoder.initialize_hidden_state()
        total_train_loss = 0
        total_val_loss = 0
        for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
            batch_loss = train_step(inp, targ, enc_hidden)
            total_train_loss += batch_loss 

            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                            batch,
                                                            batch_loss.numpy()))
       
        for (batch, (inp, targ)) in enumerate(val_dataset.take(val_steps_per_epoch)):    
            val_batch_loss = val_step(inp, targ, enc_hidden) 
            total_val_loss += val_batch_loss 

        if (epoch + 1) % 2 == 0:
            checkpoint.save(file_prefix = checkpoint_prefix)
        
        print('Total training loss is {:.4f}'.format(total_train_loss / steps_per_epoch))
        print('Total validation loss is {:.4f}'.format( total_val_loss / val_steps_per_epoch))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [None]:
train_and_validate(train_dataset, val_dataset)

In [1]:
import os
import tensorflow as tf

optimizer = tf.keras.optimizers.Adam() 
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')  

checkpoint_dir = './training_checkpoints'  
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")  
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

@tf.function
def train_step(inp, targ, enc_hidden, encoder, decoder):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden) 
        dec_hidden = enc_hidden 
        dec_input = tf.expand_dims([tgt_lang_tokenizer.word_index['<sos>']] * BATCH_SIZE, 1) 
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output) 
            loss += loss_function(targ[:, t], predictions) 
            dec_input = tf.expand_dims(targ[:, t], 1) 
    batch_loss = (loss / int(targ.shape[1])) 
    variables = encoder.trainable_variables + decoder.trainable_variables 
    gradients = tape.gradient(loss, variables) 
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

@tf.function
def val_step(inp, targ, enc_hidden, encoder, decoder):
    loss = 0 
    enc_output, enc_hidden = encoder(inp, enc_hidden) 
    dec_hidden = enc_hidden
    dec_input =  tf.expand_dims([tgt_lang_tokenizer.word_index['<sos>']] * BATCH_SIZE, 1)
    for t in range(1, targ.shape[1]): 
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output) 
        loss += loss_function(targ[:, t], predictions) 
        dec_input = tf.expand_dims(targ[:, t], 1) 
    batch_loss = (loss / int(targ.shape[1])) 
    return batch_loss 

import time

def train_and_validate(train_dataset, val_dataset, EPOCHS=10):
    for epoch in range(EPOCHS):
        start = time.time()

        enc_hidden = encoder.initialize_hidden_state()
        total_train_loss = 0
        total_val_loss = 0

        for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
            batch_loss = train_step(inp, targ, enc_hidden, encoder, decoder)
            total_train_loss += batch_loss 
            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))

        for (batch, (inp, targ)) in enumerate(val_dataset.take(val_steps_per_epoch)):    
            val_batch_loss = val_step(inp, targ, enc_hidden, encoder, decoder) 
            total_val_loss += val_batch_loss 

        if (epoch + 1) % 2 == 0:
            checkpoint.save(file_prefix = checkpoint_prefix)

        print('Total training loss is {:.4f}'.format(total_train_loss / steps_per_epoch))
        print('Total validation loss is {:.4f}'.format(total_val_loss / val_steps_per_epoch))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))


NameError: name 'train_dataset' is not defined