<a href="https://colab.research.google.com/github/sambitghsh/Projects/blob/main/my_final_sol.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#mounting the google drive
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd
import tensorflow as tf
import numpy as np
import time
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
import re
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.bleu_score import sentence_bleu


In [3]:
#creating the vocabulary
class vocabulary:
    def __init__(self, name):
        #PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, OOV_TOKEN = 0, 1, 2
        self.name = name
        self.token2index = {"PAD":0, "SOS":1, "EOS":2, "OOV":3}
        self.token2count = {}
        self.index2token = {0: "PAD", 1: "SOS", 2: "EOS", 3: "OOV"}
        self.num_tokens = 4
        self.num_tokenline = 0
        self.longest_token = 0
        self.source_text = []
        self.target_text = []
        self.source_normal_map = {}
        self.target_normal_map = {}

        
    def add_token(self, source_token, target_token):
        i = 0
        for token in source_token:
            token = token.strip()
            
            if token not in self.token2index:
                self.token2index[token] = self.num_tokens
                self.token2count[token] = 1
                self.index2token[self.num_tokens] = token
                self.num_tokens += 1
            else:
                self.token2count[token] += 1
                
        for token in target_token:
            token = token.strip()
            if token not in self.token2index:
                self.token2index[token] = self.num_tokens
                self.token2count[token] = 1
                self.index2token[self.num_tokens] = token
                self.num_tokens += 1
            else:
                self.token2count[token] += 1
                
             
    def add_source_line_token(self, source_line, target_line):
        source_line = source_line.to_numpy().reshape(-1, 1)
        target_line = target_line.to_numpy().reshape(-1, 1)
        
        for i in range(source_line.shape[0]):
            source_token, target_token = source_line[i], target_line[i]
            self.target_text.append(target_token[0])
            self.source_text.append(source_token[0])
            source_token = eval(str(source_token[0]))
            target_token = eval(str(target_token[0]))
            #source_token, self.source_normal_map[i] = rename_ids(source_token)
            #target_token, self.target_normal_map[i] = rename_ids(target_token)
            #self.target_text.append(target_token)
            #self.source_text.append(source_token)
            self.add_token(source_token, target_token)
            
        return self.token2index, self.token2count, self.index2token, self.num_tokens, self.source_text, self.target_text, self.source_normal_map, self.target_normal_map
      
    def to_word(self, index):
        return self.index2token[index]

    def to_index(self, token):
        return self.token2index[token.strip()]

In [4]:
def rename_ids(token):
    identifier = re.compile(r"^[^\d\W]\w*\Z", re.UNICODE)
    keywords = ['auto','double', 'int', 'struct','break', 'else', 'long', 'switch', 'case', 'enum', 
                'register', 'typedef', 'char', 'extern', 'union', 'continue', 'for', 'signed', 'void',
                'do', 'if', 'static', 'while', 'default', 'goto', 'sizeof', 'volatile', 'const', 'float',
                'short', 'unsigned', 'char', 'unsigned char', 'signed char', 'short', 'signed short', 'unsigned short',
                'int', 'signed int', 'unsigned int', 'short int', 'signed short int', 'unsigned short int', 'long int',
                'signed long int', 'unsigned long int', 'float', 'double', 'long double', 'return']
    
    length = len(token)
    
    replaced = [0]*length
    toValMap = {}
    k = 1
    for i in range(length):
        fun, isInt, isStr = 1, False, False
        
        #to check a valid identifier
        result = token[i].isidentifier()
        
        #to check not a valid keyword if it is a valid identifier
        flag = token[i] not in keywords 
        
        #to check not a function if not a valid identifier and a keyword
        if i != length - 1:             
            if token[i+1] == '(':
                fun = 0
        
        #to check a valid numeric literal
        try:
            s = float(token[i])
            isInt = True
        except ValueError:
            isInt = False
            
        #to check a valid string literal, and not a format specifier
        #not start with a %, and contains an inverted comma
        #isStr = ((not token[i].startswith('"%')) and token[i].find('"..') != None)
        
        if ((result and flag and fun) or isInt) and (not replaced[i]):
            val_name = 'var'+ '_' + str(k)
            toValMap[val_name] = token[i]
            k = k+1
            j = 0
            while(j<length):
                if (token[j] == toValMap[val_name]):
                    token[j] = val_name
                    replaced[j] = 1
                j = j + 1
            
    return token, toValMap

In [5]:
#get 1000 token from the dataset
def get1000token(token2index, index2token, token2count):
  most_counted_token = dict(sorted(token2count.items(), key=lambda item: item[1], reverse = True))
  most_counted_token =  dict(list(most_counted_token.items())[:1000])

  most_token2index = {"PAD":0, "SOS":1, "EOS":2, "OOV":3}
  most_index2token = {0: "PAD", 1: "SOS", 2: "EOS", 3: "OOV"}

  num_count = 4
  for i in most_counted_token:
    most_token2index[i] = num_count
    most_index2token[num_count] = i
    num_count += 1

  return most_counted_token, most_token2index, most_index2token

In [6]:
#tokenize the data
def tokenize(text):
  buffer = []
  for lines in text:
    lines = eval(lines)
    individual = []
    for x in lines:
      try:
        x = token2index[x]
      except KeyError:
        x = 3
      individual.append(x)
    buffer.append(np.array([1]+individual+[2]))
  return np.asarray(buffer)

In [8]:
#Creating the encoding class 
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')

  def __call__(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [9]:
#Creating the badhanau attention
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def __call__(self, query, values):
    query_with_time_axis = tf.expand_dims(query, 1)
    score = self.V(tf.nn.tanh( self.W1(query_with_time_axis) + self.W2(values)))

    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [10]:
#Creating the decoding class
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)
    self.attention = BahdanauAttention(self.dec_units)

  def __call__(self, x, hidden, enc_output):
    context_vector, attention_weights = self.attention(hidden, enc_output)
    x = self.embedding(x)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
    output, state = self.gru(x)
    output = tf.reshape(output, (-1, output.shape[2]))
    x = self.fc(output)

    return x, state, attention_weights

In [11]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([token2index['SOS']] * BATCH_SIZE, 1)

    for t in range(1, targ.shape[1]):
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
      loss += loss_function(targ[:, t], predictions)
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [12]:
#importing the data
train_data = pd.read_csv('/content/gdrive/MyDrive/train.csv')

In [13]:
#creating the vocabulary on the test data
voc = vocabulary("test")
token2index, token2count, index2token, num_tokens, source_text, target_text, source_normal, target_normal = voc.add_source_line_token(train_data['sourceLineTokens'], train_data['targetLineTokens'])

In [14]:
token2count, token2index, index2token = get1000token(token2index, index2token, token2count)

In [15]:
source_token = tokenize(source_text)
target_token = tokenize(target_text)

  return array(a, dtype, copy=False, order=order)


In [16]:
padded_target_token = tf.keras.preprocessing.sequence.pad_sequences(target_token,padding='post', maxlen = 25)
padded_source_token = tf.keras.preprocessing.sequence.pad_sequences(source_token,padding='post', maxlen = 25)

In [17]:
BUFFER_SIZE = padded_source_token.shape[0]
BATCH_SIZE = 64
steps_per_epoch = BUFFER_SIZE//BATCH_SIZE
embedding_dim = 64
units = 300

vocab_inp_size = len(index2token)
vocab_tar_size = len(index2token)

max_length_targ = 25
max_length_inp = 25

dataset = tf.data.Dataset.from_tensor_slices((padded_source_token, padded_target_token)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [18]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 25]), TensorShape([64, 25]))

In [19]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask
 
  return tf.reduce_mean(loss_)

In [20]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
attention_layer = BahdanauAttention(10)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

In [21]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

In [22]:
EPOCHS = 20
for epoch in range(EPOCHS):
  start = time.time()
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0
  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):

    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss
    if batch % 100 == 0:
      print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}')
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix=checkpoint_prefix)

  print(f'Epoch {epoch+1} Loss {total_loss/steps_per_epoch:.4f}')
  print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

Epoch 1 Batch 0 Loss 2.6137
Epoch 1 Batch 100 Loss 1.5713
Epoch 1 Batch 200 Loss 1.2470
Epoch 1 Loss 1.6573
Time taken for 1 epoch 61.01 sec

Epoch 2 Batch 0 Loss 1.3040
Epoch 2 Batch 100 Loss 1.2987
Epoch 2 Batch 200 Loss 1.2395
Epoch 2 Loss 1.2020
Time taken for 1 epoch 32.05 sec

Epoch 3 Batch 0 Loss 1.1698
Epoch 3 Batch 100 Loss 0.9281
Epoch 3 Batch 200 Loss 0.9406
Epoch 3 Loss 0.9782
Time taken for 1 epoch 32.17 sec

Epoch 4 Batch 0 Loss 0.9212
Epoch 4 Batch 100 Loss 0.8067
Epoch 4 Batch 200 Loss 0.6925
Epoch 4 Loss 0.8440
Time taken for 1 epoch 32.24 sec

Epoch 5 Batch 0 Loss 0.8177
Epoch 5 Batch 100 Loss 0.9377
Epoch 5 Batch 200 Loss 0.6217
Epoch 5 Loss 0.7536
Time taken for 1 epoch 32.38 sec

Epoch 6 Batch 0 Loss 0.5772
Epoch 6 Batch 100 Loss 0.5835
Epoch 6 Batch 200 Loss 0.7430
Epoch 6 Loss 0.6728
Time taken for 1 epoch 32.39 sec

Epoch 7 Batch 0 Loss 0.5537
Epoch 7 Batch 100 Loss 0.7524
Epoch 7 Batch 200 Loss 0.6530
Epoch 7 Loss 0.5815
Time taken for 1 epoch 32.47 sec

Epoch 

In [30]:
def evaluate(sentence):
    token =[]
    sentence = eval(sentence)
    inputs = [0]*len(sentence)
    token =[]

    for i in range(len(sentence)):
        try:
          inputs[i] = token2index[sentence[i]]
        except KeyError:
          inputs[i] = 3

    inputs = [1] + inputs + [2]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([token2index['SOS']], 0)

    for t in range(max_length_targ):
      predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
      predicted_id = tf.argmax(predictions[0]).numpy()
      result += index2token[predicted_id] + ' '
      if index2token[predicted_id] == 'EOS':
        return result

      # the predicted ID is fed back into the model
      dec_input = tf.expand_dims([predicted_id], 0)

    return result

In [None]:
def translate(sentence):
  result = evaluate(sentence)
  print('Predicted translation:', result)

In [None]:
"""val_data = pd.read_csv('/content/gdrive/MyDrive/valid.csv')
for i in range(50):#range(len(val_data.shape[0])):
  print("truth is:", val_data['sourceLineText'][i])
  translate(val_data['sourceLineText'][i])
  print('\n')"""

'val_data = pd.read_csv(\'/content/gdrive/MyDrive/valid.csv\')\nfor i in range(50):#range(len(val_data.shape[0])):\n  print("truth is:", val_data[\'sourceLineText\'][i])\n  translate(val_data[\'sourceLineText\'][i])\n  print(\'\n\')'

In [None]:
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
print(checkpoint_dir)

./training_checkpoints


In [None]:
translate("k = mode ( arr , * ( arr + i ) , n ) ]")

here i am
Predicted translation: k = 'n' ; EOS 


In [None]:
def revert_ids(token, i, source_normal):
    length, j = len(token), 0
    for info in source_normal[i]:
        j = 0
        while j<length:
            if token[j] == info:
                token[j]= source_normal[i][info]
            j = j+1
    return token

In [32]:
def validate():
  val_data = pd.read_csv('/content/gdrive/MyDrive/valid.csv')
  score = []
  for i in range(val_data.shape[0]):
    valid_source_text = val_data['sourceLineTokens'][i]
    valid_target_text = eval(val_data['targetLineTokens'][i])
    pred_text = evaluate(valid_source_text)
    reference = [valid_target_text]
    pred_text_list = pred_text.split()[:-1]
    if len(pred_text_list)>0 and pred_text_list[-1] == 'OOV':
      pred_text_list = pred_text_list[:-1]

    print("the truth is :", valid_target_text)
    print("the predc is :", pred_text_list)
    loc_score = sentence_bleu(reference, pred_text_list, weights=(0.25, 0.25, 0.25, 0.25))
    score.append(loc_score)
  print(sum(score)/len(score))

In [None]:
val_data = pd.read_csv('/content/gdrive/MyDrive/valid.csv')
len(list(val_data['sourceLineText']))

2584

In [None]:
from queue import Queue 
from threading import Thread
def valid():
  val_data = pd.read_csv('/content/gdrive/MyDrive/valid.csv')
  que = Queue()           # Python 3.x
  threads_list = list()
  start = time.time()
  for i in range(val_data.shape[0]):
      print(i)
      t = Thread(target=lambda q, arg1: q.put(evaluate(arg1)), args=(que, val_data['sourceLineText'][i]))
      t.start()
      threads_list.append(t)
  for t in threads_list:
    t.join()

  # Check thread's return value
  while not que.empty():
    result = que.get()
    print(result)
  print(time.time()- start)

In [33]:
validate()

the truth is : ['if', '(', '(', 'factorial', '(', 'x', ')', '>=', 'n1', ')', '&&', '(', 'factorial', '(', 'x', ')', '<=', 'n2', ')', ')', '{']
the predc is : ['if', '(', '(', 'OOV', '(', 'x', ')', '>=', 'n1', ')', '&&', '(', 'OOV', '(', 'x', ')', '<=', 'n1', ')', '&&', '(', 'OOV', '(', 'x']
the truth is : ['scanf', '(', '"%d"', ',', '&', 'a', '[', '0', ']', ')', ';']
the predc is : ['scanf', '(', '"%d"', ',', '&', 'a', '[', 'i', ']', ')', ';']
the truth is : ['a', '=', 'a', '/', '10', ';']
the predc is : ['a', '=', 'a', '/', '10', ';']
the truth is : ['int', 'k', ',', 'c', ',', 'x', ',', 'y', ';']
the predc is : ['int', 'k', ',', 'c', ';']
the truth is : ['ch', '=', 'rot', '(', 'a', '[', 'i', ']', ')', ';']
the predc is : ['ch', '=', 'OOV', '(', 'a', '[', 's', ']', ')', ';']


Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


the truth is : ['str1', '[', 'i', ']', '=', 'c', ';']
the predc is : ['str', '[', 'i', ']', '=', 'c', ';']
the truth is : ['p', '=', 'max', '(', 'p', ',', 'a', '[', 'k', '+', '1', ']', ')', ';']
the predc is : ['p', '=', 'max', '(', 'p', ',', 'a', '[', 'k', '+', '1', ']', ')', ';']
the truth is : ['return', '0', ';']
the predc is : ['return', '0', ';']
the truth is : ['else', '{']
the predc is : ['else', '{']


Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


the truth is : ['printf', '(', '"Reverse of %d is %d"', ',', 'num', ',', 'rev', ')', ';']
the predc is : ['printf', '(', '"Reverse', 'of', '%d', 'is', '%d"', ',', 'ss', ')', ';']
the truth is : ['char', 'sum1', '[', '100', ']', ';']
the predc is : ['char', 'str1', '[', ']', ';']
the truth is : ['scanf', '(', '"%d"', ',', '&', 'a', '[', 'i', ']', ')', ';']
the predc is : ['scanf', '(', '"%d"', ',', '&', 'a', '[', 'i', ']', ')', ';']
the truth is : ['for', '(', 'r', '=', 'N', '+', '1', ';', 'r', '>', 'N', ';', 'r', '++', ')', '{']
the predc is : ['for', '(', 'r', '=', 'N', '+', '1', ';', 'r', '++', ')', '{']
the truth is : ['k', '=', 'mode', '(', 'arr', ',', '*', '(', 'arr', '+', 'i', ')', ',', 'n', ')', ';']
the predc is : ['k', '=', 'mid', '(', 'arr', ',', '*', '(', 'arr', '+', 'i', ')', ',', 'n', ')', ';']
the truth is : ['printf', '(', '"%d"', ',', 'part', '(', '2', ')', ')', ';']
the predc is : ['printf', '(', '"%d"', ',', 'OOV', '(', '2', ')', ')', ';']
the truth is : ['return', '0

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
the predc is : ['if', '(', 'a', '[', 'i', '-', '1', ']', ')', '==', "'\\n'", '&&', 'a', '[', 'i', ']', ')', '==', "'\\n'", '&&', 'a', '[', 'i', ']']
the truth is : ['for', '(', 'k', '=', '0', ';', 'k', '<', 'i', '-', 'j', '-', '1', ';', 'k', '++', ')', '{']
the predc is : ['for', '(', 'k', '=', '0', ';', 'k', '<', 'i', '-', 'j', ')']
the truth is : ['count2', '++', ';']
the predc is : ['count', '++', ';']
the truth is : ['printf', '(', '"%d"', ',', 'mat', '[', '0', ']', '[', '1', ']', ')', ';']
the predc is : ['printf', '(', 'OOV', ')', ';']
the truth is : ['a', '[', 'i', ']', '[', 'j', ']', '=', 'b', '[', 'i', ']', '[', 'j', ']', ';']
the predc is : ['a', '[', 'i', ']', '[', 'j', ']', '=', 'a2', '[', 'i', ']', '[', 'j', ']', ')', ';']
the truth is : ['while', '(', 'temp', '->', 'next', '!=', 'NULL', ')']
the predc is : ['while', '(', 'temp', '->', 'next', '!=', '-', '3', ')']
the truth is : ['if', '(', '(', 'arr', '[', '

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from queue import Queue 
from threading import Thread

result = []
def valid():
    val_data = pd.read_csv('/content/gdrive/MyDrive/valid.csv')
    que = Queue()         
    threads_list = list()
    start = time.time()
    for i in range(val_data.shape[0]):
        t = Thread(target=lambda q, arg1: q.put(evaluate(arg1)), args=(que, val_data['sourceLineText'][i]))
        t.start()
        value = que.get().split()[:-1]
        if len(value) > 0 and value[-1] =='OOV':
            value = value[:-1]
        result.append(str(value))
        threads_list.append(t)
    
    for t in threads_list:
        t.join()
    end = time.time()
    print("total time:", end - start)

In [36]:
saver = tf.train.Saver()
save_path = saver.save(sess, "gdrive/My Drive/TF 01/some method/training_checkpoints")


AttributeError: ignored