In [311]:
import os
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import TextVectorization, Input, Embedding, LSTM, Dense, Concatenate, Attention
from tensorflow.keras.models import Model
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Embedding, MultiHeadAttention, Dense, LayerNormalization, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split



In [312]:
# Assuming the CSV has been read into `dataframe`
dataframe = pd.read_csv('pro_corpus.csv')
aave_texts = dataframe['AAVE'].str.lower().tolist()
sae_texts = ["[start] " + text + " [end]" for text in dataframe['SAE'].str.lower().tolist()]

aave_train, aave_test, sae_train, sae_test = train_test_split(
    aave_texts, sae_texts, test_size=0.2, random_state=21)


In [313]:
max_vocab_size = 20000
sequence_length = 30

aave_vectorization = tf.keras.layers.TextVectorization(max_tokens=max_vocab_size, output_sequence_length=sequence_length)
sae_vectorization = tf.keras.layers.TextVectorization(max_tokens=max_vocab_size, output_sequence_length=sequence_length + 1)  # +1 for [start]/[end] tokens

aave_vectorization.adapt(aave_train)
sae_vectorization.adapt(sae_train)


In [314]:
    def get_angles(pos, i, d_model):
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return pos * angle_rates

    def positional_encoding(position, d_model):
        angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                                np.arange(d_model)[np.newaxis, :],
                                d_model)

        # Apply sin to even indices in the array; 2i
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

        # Apply cos to odd indices in the array; 2i+1
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

        pos_encoding = angle_rads[np.newaxis, ...]

        return tf.cast(pos_encoding, dtype=tf.float32)


In [294]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        
        assert d_model % self.num_heads == 0
        
        self.depth = d_model // self.num_heads
        
        self.wq = Dense(d_model)
        self.wk = Dense(d_model)
        self.wv = Dense(d_model)
        
        self.dense = Dense(d_model)
        
    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])
        
    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]
        
        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)
        
        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
        
        # Correctly call the scaled_dot_product_attention function
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
        
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_v, num_heads, depth)

        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))  # (batch_size, seq_len_v, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_v, d_model)
            
        return output, attention_weights


In [315]:
def scaled_dot_product_attention(q, k, v, mask):
    matmul_qk = tf.matmul(q, k, transpose_b=True)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    output = tf.matmul(attention_weights, v)
    return output, attention_weights


In [316]:
def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),
        tf.keras.layers.Dense(d_model)
    ])


In [317]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)

        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, x, training, mask):
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2

In [318]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]
        
        # Adding embedding and position encoding.
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        return x  # (batch_size, input_seq_len, d_model)


In [319]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.layernorm3 = LayerNormalization(epsilon=1e-6)

        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)
        self.dropout3 = Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)

        return out3, attn_weights_block1, attn_weights_block2


In [320]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate=0.1):
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(pe_target, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        # Adding embedding and positional encoding.
        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i, dec_layer in enumerate(self.dec_layers):
            x, block1, block2 = dec_layer(x, enc_output, training, look_ahead_mask, padding_mask)

            # Store attention weights, could be useful for visualization or analysis
            attention_weights[f'decoder_layer{i+1}_block1'] = block1
            attention_weights[f'decoder_layer{i+1}_block2'] = block2

        # x shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights


In [301]:
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        super(Transformer, self).__init__()
        
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)

        self.final_layer = Dense(target_vocab_size)
        
    def call(self, inputs, training):
        print("Inputs dict keys:", inputs.keys())
        inp, tar = inputs['inputs'], inputs['dec_inputs']
        print("inp shape:", inp.shape)
        print("tar shape:", tar.shape)
        # Proceed with masks creation and model operations


        enc_padding_mask, look_ahead_mask, dec_padding_mask = self.create_masks(inp, tar)
        
        enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
        
        dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)
        
        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
        
        return final_output, attention_weights

    def create_masks(self, inp, tar):
        print("inp shape:", inp.shape)  # Debugging line
        print("tar shape:", tar.shape) 
        # Encoder padding mask for masking out the padding tokens in the encoder input
        enc_padding_mask = create_padding_mask(inp)

        # Decoder padding mask for the second multi-head attention mechanism in the decoder
        # This is used to mask the encoder outputs.
        dec_padding_mask = create_padding_mask(inp)

        # Look-ahead mask and decoder target padding mask combined to mask out tokens in the decoder that should not be seen yet
        look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
        dec_target_padding_mask = create_padding_mask(tar)
        combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)  # Combine masks to ensure both conditions are applied

        return enc_padding_mask, combined_mask, dec_padding_mask


In [326]:
import tensorflow as tf
import numpy as np

def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        scaled_attention_logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(tf.cast(self.depth, tf.float32))
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)

        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        output = tf.matmul(attention_weights, v)

        output = tf.transpose(output, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(output, (batch_size, -1, self.d_model))

        output = self.dense(concat_attention)

        return output, attention_weights

# Skipping other necessary components (e.g., EncoderLayer, DecoderLayer, Encoder, Decoder) for brevity

class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        super(Transformer, self).__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs, training):
        inp, tar = inputs['inputs'], inputs['dec_inputs']

        enc_padding_mask, combined_mask, dec_padding_mask = self.create_masks(inp, tar)

        enc_output = self.encoder(inp, training, enc_padding_mask)
        dec_output, attention_weights = self.decoder(tar, enc_output, training, combined_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)

        return final_output

    def create_masks(self, inp, tar):
        enc_padding_mask = create_padding_mask(inp)
        dec_padding_mask = create_padding_mask(inp)

        look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
        dec_target_padding_mask = create_padding_mask(tar)
        combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

        return enc_padding_mask, combined_mask, dec_padding_mask

# Define other necessary components (e.g., Encoder, Decoder) and training process as needed


In [327]:
# Assuming aave_vectorization and sae_vectorization have been adapted on the respective datasets
def make_dataset(aave, sae):
    aave_ds = aave_vectorization(aave)
    sae_ds = sae_vectorization(sae)
    # Decoder inputs use the [:, :-1] slices of sae_ds, and the targets are the [:, 1:] slices
    input_ds = {"inputs": aave_ds, "dec_inputs": sae_ds[:, :-1]}
    target_ds = sae_ds[:, 1:]  # Targets are offset by 1 to predict the next token
    return tf.data.Dataset.from_tensor_slices((input_ds, target_ds)).batch(64).cache().prefetch(tf.data.experimental.AUTOTUNE)

train_ds = make_dataset(aave_train, sae_train)
val_ds = make_dataset(aave_test, sae_test)


In [303]:
# def debug_loss_function(real, pred):
#     print("Shape of real (labels):", real.shape)
#     print("Shape of pred (logits):", pred.shape)
#     loss_ = loss_object(real, pred)
#     return loss_

# # Use the debug loss function for model compilation to inspect shapes
# transformer.compile(optimizer=optimizer, loss=debug_loss_function, metrics=["accuracy"])


In [304]:
def make_dataset(aave, sae):
    aave_ds = aave_vectorization(aave)
    sae_ds = sae_vectorization(sae)
    print("Shape of aave_ds after vectorization:", aave_ds.shape)
    print("Shape of sae_ds after vectorization:", sae_ds.shape)

    # Decoder inputs use the [:, :-1] slices of sae_ds, and the targets are the [:, 1:] slices
    input_ds = {"inputs": aave_ds, "dec_inputs": sae_ds[:, :-1]}
    target_ds = sae_ds[:, 1:]  # Targets are offset by 1 to predict the next token
    print("Shape of decoder inputs:", sae_ds[:, :-1].shape)
    print("Shape of targets:", sae_ds[:, 1:].shape)

    return tf.data.Dataset.from_tensor_slices((input_ds, target_ds)).batch(64).cache().prefetch(tf.data.experimental.AUTOTUNE)


train_ds = make_dataset(aave_train, sae_train)
val_ds = make_dataset(aave_test, sae_test)


Shape of aave_ds after vectorization: (4630, 30)
Shape of sae_ds after vectorization: (4630, 31)
Shape of decoder inputs: (4630, 30)
Shape of targets: (4630, 30)
Shape of aave_ds after vectorization: (1158, 30)
Shape of sae_ds after vectorization: (1158, 31)
Shape of decoder inputs: (1158, 30)
Shape of targets: (1158, 30)


In [305]:
def create_look_ahead_mask(size):
    # size: scalar, size of the mask (seq_len, seq_len)

    # Create a lower triangular matrix filled with ones
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)

    return mask  # (seq_len, seq_len)


In [306]:
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return tf.expand_dims(tf.expand_dims(seq, 1), 1)


In [328]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        arg3 = tf.math.rsqrt(self.d_model)
        return arg3 * tf.math.minimum(arg1, arg2)

learning_rate = CustomSchedule(d_model=512)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_sum(loss_) / tf.reduce_sum(mask)






def accuracy(real, pred):
    accuracies = tf.equal(real, tf.argmax(pred, axis=2))
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    accuracies = tf.math.logical_and(mask, accuracies)
    accuracies = tf.cast(accuracies, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    return tf.reduce_sum(accuracies)/tf.reduce_sum(mask)

# Hyperparameters for the Transformer
num_layers = 4
d_model = 128
num_heads = 8
dff = 512
input_vocab_size = aave_vectorization.vocabulary_size() + 2  # +2 for start/end tokens
target_vocab_size = sae_vectorization.vocabulary_size() + 2  # +2 for start/end tokens
pe_input = max([len(sentence.split()) for sentence in aave_texts])  # or a fixed number like 1000
pe_target = max([len(sentence.split()) for sentence in sae_texts])  # or a fixed number like 1000
dropout_rate = 0.1

# Instantiate the Transformer model
transformer = Transformer(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff,
                          input_vocab_size=input_vocab_size, target_vocab_size=target_vocab_size,
                          pe_input=pe_input, pe_target=pe_target, rate=dropout_rate)


In [308]:
for example_input_batch, example_target_batch in train_ds.take(1):
    print("Input shape:", example_input_batch['inputs'].shape)
    print("Target shape:", example_target_batch.shape)


Input shape: (64, 30)
Target shape: (64, 30)


2024-02-06 20:02:54.027471: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [329]:
transformer.compile(optimizer=optimizer, loss=loss_function, metrics=["accuracy"])


In [330]:
EPOCHS = 1

# Define the checkpoint path and the checkpoint manager.
# This saves checkpoints to disk.
checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# If a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')

transformer.fit(train_ds, epochs=EPOCHS, validation_data=val_ds,
                callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)])


2024-02-06 20:09:07.664558: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2024-02-06 20:09:07.720003: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.




<keras.callbacks.History at 0x7f5a36ae8b80>

In [289]:
@tf.function
def train_step(inp, tar):
    with tf.GradientTape() as tape:
        predictions, _ = transformer([inp, tar[:, :-1]], training=True)
        loss = loss_function(tar[:, 1:], predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    return loss

for epoch in range(EPOCHS):
    for (batch, (inp, tar)) in enumerate(train_ds):
        loss = train_step(inp['inputs'], inp['dec_inputs'])

        if batch % 100 == 0:
            print(f"Epoch {epoch} Batch {batch} Loss {loss.numpy()}")


2024-02-06 20:01:35.083395: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


AttributeError: in user code:

    File "/tmp/ipykernel_33808/2318031034.py", line 4, in train_step  *
        predictions, _ = transformer([inp, tar[:, :-1]], training=True)
    File "/N/soft/sles15/python/gnu/3.10.5/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "/tmp/__autograph_generated_filethr9xh_d.py", line 10, in tf__call
        ag__.ld(print)('Inputs dict keys:', ag__.converted_call(ag__.ld(inputs).keys, (), None, fscope))

    AttributeError: Exception encountered when calling layer "transformer_16" (type Transformer).
    
    in user code:
    
        File "/tmp/ipykernel_33808/2912783195.py", line 11, in call  *
            print("Inputs dict keys:", inputs.keys())
    
        AttributeError: 'list' object has no attribute 'keys'
    
    
    Call arguments received by layer "transformer_16" (type Transformer):
      • inputs=['tf.Tensor(shape=(64, 30), dtype=int64)', 'tf.Tensor(shape=(64, 29), dtype=int64)']
      • training=True


In [141]:
for example_input_batch, example_target_batch in train_ds.take(1):
    print("Input batch keys:", example_input_batch.keys())
    print("Input shape:", example_input_batch['inputs'].shape)
    print("Target shape:", example_target_batch.shape)


Input batch keys: dict_keys(['inputs', 'dec_inputs'])
Input shape: (64, 30)
Target shape: (64, 30)


2024-02-06 16:30:49.310522: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [142]:
test_seq = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0]], dtype=tf.int32)
print("Padding mask:", create_padding_mask(test_seq))
print("Look-ahead mask:", create_look_ahead_mask(size=5))  # Adjust size as needed


Padding mask: tf.Tensor(
[[[[0. 0. 1. 1. 0.]]]


 [[[0. 0. 0. 1. 1.]]]], shape=(2, 1, 1, 5), dtype=float32)
Look-ahead mask: tf.Tensor(
[[0. 1. 1. 1. 1.]
 [0. 0. 1. 1. 1.]
 [0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]], shape=(5, 5), dtype=float32)


In [143]:
def call(self, inputs, training):
    print("Inputs dict keys:", inputs.keys())
    inp, tar = inputs['inputs'], inputs['dec_inputs']
    print("inp shape:", inp.shape)
    print("tar shape:", tar.shape)
    # Proceed with masks creation and model operations


In [22]:
# Data Reading
dataframe = pd.read_csv('pro_corpus.csv')
assert 'AAVE' in dataframe.columns and 'SAE' in dataframe.columns

# Preparing the dataset
aave_texts = dataframe['AAVE'].str.lower().tolist()
sae_texts = dataframe['SAE'].str.lower().tolist()

# Split the data into train and test sets
aave_train, aave_test, sae_train, sae_test = train_test_split(
    aave_texts, sae_texts, test_size=0.2, random_state=21)

# Convert the train and test data into TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices({
    'aave': aave_train,
    'sae': sae_train
})
test_dataset = tf.data.Dataset.from_tensor_slices({
    'aave': aave_test,
    'sae': sae_test
})

BUFFER_SIZE = len(aave_train)  # Use the size of the train dataset

# Adjust batch sizes
train_batch_size = 16
test_batch_size = 4

# Shuffle and batch the train dataset
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(train_batch_size, drop_remainder=True)

# Batch the test dataset
test_dataset = test_dataset.batch(test_batch_size, drop_remainder=True)

# Text Vectorization
aave_vectorization = TextVectorization(output_mode='int', output_sequence_length=30)
sae_vectorization = TextVectorization(output_mode='int', output_sequence_length=30)

aave_texts = train_dataset.map(lambda x: x['aave'])
sae_texts = train_dataset.map(lambda x: x['sae'])

aave_vectorization.adapt(aave_texts)
sae_vectorization.adapt(sae_texts)

aave_vocab_size = len(aave_vectorization.get_vocabulary())
sae_vocab_size = len(sae_vectorization.get_vocabulary())


In [23]:
# Model Parameters
embedding_dim = 256
units = 1024


In [24]:
from tensorflow.keras.layers import MultiHeadAttention

# Building the Enhanced Model
# Encoder
encoder_input = Input(shape=(None,), dtype='int64', name='encoder_input')
encoder_embedding = Embedding(input_dim=aave_vocab_size, output_dim=embedding_dim)(encoder_input)
encoder_lstm = LSTM(units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_lstm2 = LSTM(units, return_sequences=True, return_state=True)
encoder_outputs2, state_h2, state_c2 = encoder_lstm2(encoder_outputs)
encoder_state = [state_h2, state_c2]


# MultiHeadAttention Parameters
num_heads = 8  # Number of attention heads

# MultiHeadAttention Layer
class MultiHeadAttentionLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttentionLayer, self).__init__()
        self.multi_head_attention = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.dense_output = tf.keras.layers.Dense(d_model)

    def call(self, query, value, key, attention_mask=None):
        attention_output = self.multi_head_attention(query=query, value=value, key=key, attention_mask=attention_mask)
        output = self.dense_output(attention_output)
        return output

# Modify the Decoder to include MultiHeadAttention
# Decoder with MultiHeadAttention
decoder_input = Input(shape=(None,), dtype='int64', name='decoder_input')
decoder_embedding = Embedding(input_dim=sae_vocab_size, output_dim=embedding_dim)(decoder_input)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_lstm_output, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_state)
decoder_lstm2 = LSTM(units, return_sequences=True, return_state=True)
decoder_lstm_output2, _, _ = decoder_lstm2(decoder_lstm_output)



# Applying MultiHeadAttention
multi_head_attention = MultiHeadAttentionLayer(d_model=embedding_dim, num_heads=num_heads)
attention_output = multi_head_attention(decoder_lstm_output2, encoder_outputs2, encoder_outputs2)

# Concatenation
decoder_concat_input = Concatenate(axis=-1)([decoder_lstm_output2, attention_output])

# Output Layer
decoder_dense = Dense(sae_vocab_size, activation='softmax')
decoder_output = decoder_dense(decoder_concat_input)

model_M4 = Model([encoder_input, decoder_input], decoder_output)
model_M4.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model_M4.summary()

# The rest of the training and evaluation process remains similar to your original model.


Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 embedding_8 (Embedding)        (None, None, 256)    740096      ['encoder_input[0][0]']          
                                                                                                  
 decoder_input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 lstm_16 (LSTM)                 [(None, None, 1024)  5246976     ['embedding_8[0][0]']            
                                , (None, 1024),                                             

In [26]:
def split_input_target(batch):
    input_text = batch['aave']
    target_text = batch['sae']

    input_data = aave_vectorization(input_text)
    target_data = sae_vectorization(target_text)

    # Ensure all sequences in the batch have the same length
    sequence_length = 30
    input_data = tf.ensure_shape(input_data, [None, sequence_length])
    target_data = tf.ensure_shape(target_data, [None, sequence_length])

    return {'encoder_input': input_data, 'decoder_input_M4': target_data[:, :-1]}, target_data[:, 1:]

# Apply the function to each item in the dataset
train_dataset = train_dataset.map(split_input_target).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.map(split_input_target).prefetch(tf.data.AUTOTUNE)


In [27]:
# Train the model
epochs = 10

# Callbacks for Early Stopping and Model Checkpoint
checkpoint_filepath = '/N/u/saswar/Carbonate/AAVE/'
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=15,
    verbose=1)

# Train the model with validation split and callbacks
history = model_M4.fit(train_dataset, epochs=epochs, validation_data=test_dataset, callbacks=[early_stopping_callback, model_checkpoint_callback])


Epoch 1/10


ValueError: in user code:

    File "/N/soft/sles15/python/gnu/3.10.5/lib/python3.10/site-packages/keras/engine/training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "/N/soft/sles15/python/gnu/3.10.5/lib/python3.10/site-packages/keras/engine/training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/N/soft/sles15/python/gnu/3.10.5/lib/python3.10/site-packages/keras/engine/training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "/N/soft/sles15/python/gnu/3.10.5/lib/python3.10/site-packages/keras/engine/training.py", line 889, in train_step
        y_pred = self(x, training=True)
    File "/N/soft/sles15/python/gnu/3.10.5/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/N/soft/sles15/python/gnu/3.10.5/lib/python3.10/site-packages/keras/engine/input_spec.py", line 183, in assert_input_compatibility
        raise ValueError(f'Missing data for input "{name}". '

    ValueError: Missing data for input "decoder_input". You passed a data dictionary with keys ['encoder_input', 'decoder_input_M4']. Expected the following keys: ['encoder_input', 'decoder_input']
