In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

Define encoder and decoder layers:

In [None]:
class EncoderLayer(keras.layers.Layer):
    """One encoder block: self-attention, then a two-layer feed-forward network.

    Each sub-layer is followed by dropout, a residual (additive) connection
    and layer normalization.

    Args:
        emb_dim: Output width of the feed-forward network. It is also passed
            to the attention layer as ``key_dim`` (the per-head projection
            size). The residual additions in ``call`` require the last
            dimension of the inputs to be compatible with this width.
        num_heads: Number of attention heads.
        hid_dim: Width of the hidden ReLU layer of the feed-forward network.
        dropout: Dropout rate used after each of the two sub-layers.
    """

    def __init__(self, emb_dim, num_heads, hid_dim, dropout):
        super().__init__()

        # Sub-layer 1: multi-head self-attention.
        self.attention = keras.layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=emb_dim,
        )
        # Dropout randomly zeroes activations, but only while training.
        self.dropout1 = keras.layers.Dropout(dropout)
        self.norm1 = keras.layers.LayerNormalization(epsilon=1e-6)

        # Sub-layer 2: position-wise feed-forward network (hid_dim -> emb_dim).
        self.dense1 = keras.layers.Dense(hid_dim, activation='relu')
        self.dense2 = keras.layers.Dense(emb_dim)
        self.dropout2 = keras.layers.Dropout(dropout)
        self.norm2 = keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, training=True):
        """Run the block on ``inputs`` and return a tensor of the same layout.

        Note that ``training`` defaults to True here, so dropout is active
        unless the caller passes ``training=False``.
        """
        # The sequence attends to itself: it is passed as both query and value.
        attended = self.attention(inputs, inputs)
        attended = self.dropout1(attended, training=training)
        # Residual connection (element-wise sum), then normalization.
        attended = self.norm1(inputs + attended)

        projected = self.dense2(self.dense1(attended))
        projected = self.dropout2(projected, training=training)
        # Second residual connection: this is an addition, not a concatenation.
        return self.norm2(attended + projected)


class DecoderLayer(keras.layers.Layer):
    """One decoder block: self-attention, encoder attention, feed-forward network.

    Each of the three sub-layers is followed by dropout, a residual
    (additive) connection and layer normalization.

    Args:
        emb_dim: Output width of the feed-forward network; also passed to
            both attention layers as ``key_dim`` (the per-head projection size).
        num_heads: Number of attention heads in each attention layer.
        hid_dim: Width of the hidden ReLU layer of the feed-forward network.
        dropout: Dropout rate used after each of the three sub-layers.
    """

    def __init__(self, emb_dim, num_heads, hid_dim, dropout):
        super().__init__()

        # Sub-layer 1: self-attention over the decoder sequence.
        self.attention1 = keras.layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=emb_dim,
        )
        self.dropout1 = keras.layers.Dropout(dropout)
        self.norm1 = keras.layers.LayerNormalization(epsilon=1e-6)

        # Sub-layer 2: attention from the decoder sequence to the encoder outputs.
        self.attention2 = keras.layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=emb_dim,
        )
        self.dropout2 = keras.layers.Dropout(dropout)
        self.norm2 = keras.layers.LayerNormalization(epsilon=1e-6)

        # Sub-layer 3: position-wise feed-forward network (hid_dim -> emb_dim).
        self.dense1 = keras.layers.Dense(hid_dim, activation='relu')
        self.dense2 = keras.layers.Dense(emb_dim)
        self.dropout3 = keras.layers.Dropout(dropout)
        self.norm3 = keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, encoder_outputs, training=True):
        """Run the block on ``inputs`` while attending to ``encoder_outputs``.

        ``training`` defaults to True, so dropout is active unless the caller
        passes ``training=False``.
        """
        # NOTE(review): no attention mask is passed to this self-attention
        # call, so every position can attend to every other position
        # (including later ones). Confirm whether a causal mask is intended.
        self_attended = self.attention1(inputs, inputs)
        self_attended = self.dropout1(self_attended, training=training)
        self_attended = self.norm1(inputs + self_attended)

        # Query comes from the decoder state; value (and key) from the encoder.
        cross_attended = self.attention2(self_attended, encoder_outputs)
        cross_attended = self.dropout2(cross_attended, training=training)
        cross_attended = self.norm2(self_attended + cross_attended)

        projected = self.dense2(self.dense1(cross_attended))
        projected = self.dropout3(projected, training=training)
        return self.norm3(cross_attended + projected)


Define encoder and decoder models:

In [None]:
class Encoder(keras.layers.Layer):
    """Token embedding followed by a stack of ``EncoderLayer`` blocks.

    Args:
        num_layers: Number of ``EncoderLayer`` blocks in the stack.
        emb_dim: Embedding width; forwarded to every block.
        num_heads: Number of attention heads in every block.
        hid_dim: Hidden width of the feed-forward network in every block.
        input_vocab_size: Number of distinct token ids the embedding accepts.
        dropout: Dropout rate for the embeddings and for every block.
    """

    def __init__(self, num_layers, emb_dim, num_heads, hid_dim, input_vocab_size, dropout):
        super().__init__()

        self.emb_dim = emb_dim
        # Lookup table mapping each token id to an emb_dim-wide vector.
        self.embedding = keras.layers.Embedding(input_vocab_size, emb_dim)
        self.dropout = keras.layers.Dropout(dropout)
        self.encoder_layers = [
            EncoderLayer(emb_dim, num_heads, hid_dim, dropout)
            for _ in range(num_layers)
        ]

    def call(self, inputs, training=True):
        """Embed the token ids in ``inputs`` and pass them through every block.

        NOTE(review): no positional information is added to the embeddings
        here - confirm whether that is intentional.
        """
        # Scale the embeddings by sqrt(emb_dim) before applying dropout.
        scale = tf.math.sqrt(tf.cast(self.emb_dim, tf.float32))
        hidden = self.embedding(inputs) * scale
        hidden = self.dropout(hidden, training=training)

        for block in self.encoder_layers:
            hidden = block(hidden, training=training)

        return hidden


class Decoder(keras.layers.Layer):
    """Token embedding followed by a stack of ``DecoderLayer`` blocks.

    Args:
        num_layers: Number of ``DecoderLayer`` blocks in the stack (more
            blocks means a larger, more complex model).
        emb_dim: Embedding width; forwarded to every block.
        num_heads: Number of attention heads in every block.
        hid_dim: Hidden width of the feed-forward network in every block.
        output_vocab_size: Number of distinct token ids the embedding accepts.
        dropout: Dropout rate for the embeddings and for every block.
    """

    def __init__(self, num_layers, emb_dim, num_heads, hid_dim, output_vocab_size, dropout):
        super().__init__()

        self.emb_dim = emb_dim
        self.embedding = keras.layers.Embedding(output_vocab_size, emb_dim)
        self.dropout = keras.layers.Dropout(dropout)
        self.decoder_layers = [
            DecoderLayer(emb_dim, num_heads, hid_dim, dropout)
            for _ in range(num_layers)
        ]

    def call(self, inputs, encoder_outputs, training=True):
        """Embed the token ids in ``inputs`` and run every block against ``encoder_outputs``."""
        # Scale the embeddings by sqrt(emb_dim) before applying dropout.
        scale = tf.math.sqrt(tf.cast(self.emb_dim, tf.float32))
        hidden = self.embedding(inputs) * scale
        hidden = self.dropout(hidden, training=training)

        # Every block sees the same encoder outputs.
        for block in self.decoder_layers:
            hidden = block(hidden, encoder_outputs, training=training)

        return hidden


Define the EncoderDecoder model:

In [None]:
class EncoderDecoder(keras.Model):
    """Encoder-decoder model producing one score per output-vocabulary token per position.

    Args:
        num_layers: Number of blocks in both the encoder and the decoder.
        emb_dim: Embedding width used throughout the model.
        num_heads: Number of attention heads in every attention layer.
        hid_dim: Hidden width of the feed-forward networks.
        input_vocab_size: Vocabulary size of the encoder embedding.
        output_vocab_size: Vocabulary size of the decoder embedding and width
            of the final projection.
        dropout: Dropout rate used throughout the model.
    """

    def __init__(self, num_layers, emb_dim, num_heads, hid_dim, input_vocab_size, output_vocab_size, dropout):
        super(EncoderDecoder, self).__init__()

        self.encoder = Encoder(num_layers, emb_dim, num_heads, hid_dim, input_vocab_size, dropout)
        self.decoder = Decoder(num_layers, emb_dim, num_heads, hid_dim, output_vocab_size, dropout)
        # No activation: the model returns raw scores (logits), not probabilities.
        self.final_dense = keras.layers.Dense(output_vocab_size)

    def call(self, inputs, training=True):
        """Compute per-position scores over the output vocabulary.

        Args:
            inputs: Either a single tensor of token ids, which is fed to both
                the encoder and the decoder (the original behaviour), or a
                pair ``(encoder_inputs, decoder_inputs)`` so the decoder can
                be driven by its own token sequence.
            training: Forwarded to the encoder and decoder; defaults to True,
                so dropout is active unless ``training=False`` is passed.

        Returns:
            The output of the final dense layer applied to the decoder outputs.

        Raises:
            ValueError: If ``inputs`` is a list/tuple that does not contain
                exactly two elements.
        """
        if isinstance(inputs, (list, tuple)):
            if len(inputs) != 2:
                raise ValueError(
                    "EncoderDecoder expects a single tensor or a pair "
                    "(encoder_inputs, decoder_inputs); got %d inputs." % len(inputs)
                )
            encoder_inputs, decoder_inputs = inputs
        else:
            # Backward-compatible path: the same ids drive both halves, so the
            # output sequence length equals the input sequence length.
            encoder_inputs = decoder_inputs = inputs

        encoder_outputs = self.encoder(encoder_inputs, training=training)
        decoder_outputs = self.decoder(decoder_inputs, encoder_outputs, training=training)
        final_outputs = self.final_dense(decoder_outputs)
        return final_outputs





Load the training data from train.json:

In [None]:
import json
import os

# Location of the training file. The original absolute path stays the default;
# set the ASDL_TRAIN_JSON environment variable to use a different location
# without editing this cell.
train_data_path = os.environ.get(
    "ASDL_TRAIN_JSON",
    "/Users/sena/Desktop/sena-personal/sena-repos/ASDL/train_data/train.json",
)

input_data = []
target_data = []

# The file holds one JSON object per line: the 'nl' field is collected as the
# model input and the 'code' field as the target.
# An explicit encoding keeps the result independent of the platform's locale.
with open(train_data_path, 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            # Skip blank lines (e.g. a trailing empty line) instead of
            # letting json.loads fail on them.
            continue
        example = json.loads(line)
        input_data.append(example['nl'])
        target_data.append(example['code'])

# At this point the data is still a plain Python list.
print(type(input_data))

# Convert both lists to NumPy arrays of strings.
input_data = np.asarray(input_data)
target_data = np.asarray(target_data)

print(type(input_data))

# Peek at the first two examples of each array.
print(input_data[:2])
print(target_data[:2])



In [None]:
from transformers import GPT2TokenizerFast

# Load the pretrained "gpt2" tokenizer.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
print(input_data[:1])

# Encode every input sentence into a NumPy array of token ids.
# The result is a plain list because sentences differ in length.
input_words = [
    np.array(tokenizer(text)["input_ids"])
    for text in input_data
]

print(input_words[:1])

In [None]:
# Re-create the pretrained "gpt2" tokenizer (same one used for the inputs).
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
print(target_data[:1])

# Encode every target sentence into a NumPy array of token ids.
# The result is a plain list because sentences differ in length.
target_words = [
    np.array(tokenizer(text)["input_ids"])
    for text in target_data
]

print(target_words[:1])

In [None]:
from transformers import GPT2TokenizerFast
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

# input_words and target_words are lists of variable-length token-id arrays.
# They are deliberately NOT wrapped in np.array(...): recent NumPy versions
# raise on such ragged input, and pad_sequences accepts the lists directly.
print(len(input_words))
print(len(target_words))

# The output vocabulary is the tokenizer's vocabulary.
output_vocab_size = tokenizer.vocab_size
print(output_vocab_size)

# Pad inputs and targets to ONE common length. When the model is called with
# a single tensor it feeds the same token ids to the encoder and the decoder,
# so its output has the input's sequence length; the labels must match that
# length or the loss cannot be computed.
max_len = max(
    max(len(sequence) for sequence in input_words),
    max(len(sequence) for sequence in target_words),
)
input_words = pad_sequences(input_words, maxlen=max_len, padding='post')
target_words = pad_sequences(target_words, maxlen=max_len, padding='post')
# NOTE(review): the padding value 0 is also a valid token id and the loss is
# not masked, so padded positions contribute to training - confirm intent.

print(input_words.shape)
print(target_words.shape)

# Number of classes predicted at every position.
num_classes = output_vocab_size

# The targets are kept as integer token ids. One-hot encoding them would need
# an array of shape (examples, max_len, num_classes), which is far too large
# for a vocabulary of this size; the sparse loss below computes the same
# cross-entropy directly from the integer labels.

# Define model hyperparameters
num_layers = 2
emb_dim = 32
num_heads = 4
hid_dim = 64
input_vocab_size = tokenizer.vocab_size
dropout = 0.1

# Create an instance of the EncoderDecoder model
model = EncoderDecoder(num_layers, emb_dim, num_heads, hid_dim, input_vocab_size, output_vocab_size, dropout)

# Compile the model. The final Dense layer has no softmax, so the model
# outputs logits and the loss has to be told so via from_logits=True.
model.compile(
    optimizer='adam',
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)

# model.summary()

# Train the model on integer-encoded targets.
model.fit(input_words, target_words, epochs=2, batch_size=2)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Create an instance of OneHotEncoder that returns dense integer arrays.
# The `sparse` argument was renamed to `sparse_output` in scikit-learn 1.2 and
# the old name was later removed, so try the new name first and fall back.
try:
    encoder = OneHotEncoder(sparse_output=False, dtype=int)
except TypeError:
    encoder = OneHotEncoder(sparse=False, dtype=int)

# Fit and transform input data: one row per whitespace-separated word.
# NOTE: this reuses the name `input_words`, replacing whatever an earlier
# cell stored under it.
input_words = [word for sentence in input_data for word in sentence.split()]
input_words = np.array(input_words).reshape(-1, 1)  # Column vector, as the encoder expects 2-D input
input_encoded = encoder.fit_transform(input_words)

# Fit and transform target data the same way.
# The same encoder object is re-fitted here, so after this cell `encoder`
# only knows the target vocabulary.
target_words = [word for sentence in target_data for word in sentence.split()]
target_words = np.array(target_words).reshape(-1, 1)  # Column vector, as the encoder expects 2-D input
target_encoded = encoder.fit_transform(target_words)

# Print the one-hot encoded vectors
print("Input Encoded:\n", input_encoded)
print("Target Encoded:\n", target_encoded)
print(input_encoded.shape)

In [None]:
# max_length = 0

# with open(train_data_path, 'r') as file:
#     for line in file:
#         example = json.loads(line)
#         input_sequence = example['nl']
#         sequence_length = len(input_sequence.split())  # Split the sequence into tokens and get the length
#         max_length = max(max_length, sequence_length)

# print("Maximum sequence length:", max_length)


In [None]:
# # Preprocess and encode the target data

# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(target_data)
# encoded_target_data = tokenizer.texts_to_sequences(target_data)
# padded_target_data = pad_sequences(encoded_target_data, maxlen=max_length)



In [None]:
# from tensorflow.keras.preprocessing.text import Tokenizer
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(target_data)  # array(['boolean function ( ) { return isParsed ; }', ...
# # word_index = tokenizer.word_index

# # index = 10
# # word = next((key for key, value in word_index.items() if value == index), None)
# # print("Word:", word)  # building the vocabulary (word index)

# # converting the text data in target_data into sequences of integers based on the vocabulary (word index) created by the tokenizer.
# encoded_target_data = tokenizer.texts_to_sequences(target_data)

# # Let's break down the interpretation of the first sequence [15, 2, 3, 3561] as an example:

# # The number 15 corresponds to the 15th most frequent word in the vocabulary.
# # The number 2 corresponds to the 2nd most frequent word in the vocabulary.
# # The number 3 corresponds to the 3rd most frequent word in the vocabulary.
# # The number 3561 corresponds to the 3561st most frequent word in the vocabulary.
# # Similarly, each sequence in the Encoded Target Data represents the encoded form of a text sample, where each number corresponds to a specific word in the vocabulary.
# # print("Encoded Target Data:", encoded_target_data[:3]) # Encoded Target Data: [[15, 2, 3, 3561], [54, 2, 3, 33346], [6, 2, 1549, 1, 86, 33347, 7, 20, 17, 23, 18624, 5, 33348, 33349, 1, 46, 4, 45, 10, 33350, 7, 5, 24, 18624, 19, 5, 12, 19, 18625, 17, 33351, 1, 10, 19, 226]

# # The line padded_input_data = pad_sequences(encoded_input_data, maxlen=max_length) is 
# # performing padding on the encoded input data sequences. It ensures that all sequences 
# # have the same length by either adding zeros (padding) or truncating them.
# padded_target_data = pad_sequences(encoded_target_data, maxlen=max_length)

In [None]:
# # Preprocess and encode the input data
# import json
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(input_data)
# encoded_input_data = tokenizer.texts_to_sequences(input_data)
# padded_input_data = pad_sequences(encoded_input_data, maxlen=max_length)


In [None]:
from tensorflow.keras.utils import to_categorical
# Define model hyperparameters
num_layers = 2
emb_dim = 32
num_heads = 4
hid_dim = 64
input_vocab_size = 100
#output_vocab_size = 100
dropout = 0.1

# target_encoded is a one-hot matrix (one column per distinct target word),
# so its largest value is always 1 and `np.max(target_encoded) + 1` would
# report a vocabulary of 2. The vocabulary size is the number of columns.
output_vocab_size = target_encoded.shape[1]
# Largest valid class index, kept for code that still refers to it.
max_index = output_vocab_size - 1

# # Convert the target data to one-hot encoded vectors
# num_classes = output_vocab_size
# one_hot_target_data = to_categorical(padded_target_data, num_classes=num_classes)

# # Create an instance of the EncoderDecoder model
# model = EncoderDecoder(num_layers, emb_dim, num_heads, hid_dim, input_vocab_size, output_vocab_size, dropout)

# # Compile the model
# model.compile(optimizer='adam', loss='categorical_crossentropy')

# # Train the model
# model.fit(padded_input_data[:10], one_hot_target_data[:10], epochs=2, batch_size=2)