In [15]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.layers import Input, Embedding, MultiHeadAttention, LayerNormalization, Add, Dense, Dropout, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

## Training

In [16]:
# positional encoding layer - used  to provide positional information to the model by adding position-dependent signals to word embeddings, 
# allowing the model to incorporate the order of words in the input sequence
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, seq_len, embed_dim):
        super(PositionalEncoding, self).__init__()
        self.pos_encoding = self.positional_encoding(seq_len, embed_dim)
    
    def get_angles(self, pos, i, embed_dim):
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(embed_dim))
        return pos * angle_rates
    
    def positional_encoding(self, seq_len, embed_dim):
        angle_rads = self.get_angles(np.arange(seq_len)[:, np.newaxis],
                                     np.arange(embed_dim)[np.newaxis, :], embed_dim)
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        pos_encoding = angle_rads[np.newaxis, ...]
        return tf.cast(pos_encoding, dtype=tf.float32)
    
    def call(self, x):
        return x + self.pos_encoding[:, :tf.shape(x)[1], :]

Encoder-Decoder Model

In [17]:
def build_transformer_model(vocab_size, seq_len, embed_dim=128, num_heads=4, ff_dim=512, num_layers=4, dropout_rate=0.1):
    encoder_inputs = Input(shape=(seq_len,))
    x = Embedding(input_dim=vocab_size, output_dim=embed_dim)(encoder_inputs)
    x = PositionalEncoding(seq_len, embed_dim)(x)
    
    for _ in range(num_layers):
        attention_output = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(x, x)
        attention_output = Dropout(dropout_rate)(attention_output)
        attention_output = LayerNormalization()(Add()([x, attention_output]))
        
        ff_output = Dense(ff_dim, activation='relu')(attention_output)
        ff_output = Dense(embed_dim)(ff_output)
        ff_output = Dropout(dropout_rate)(ff_output)
        x = LayerNormalization()(Add()([attention_output, ff_output]))
    encoder_outputs = x
    
    decoder_inputs = Input(shape=(seq_len,))
    y = Embedding(input_dim=vocab_size, output_dim=embed_dim)(decoder_inputs)
    y = PositionalEncoding(seq_len, embed_dim)(y)
    
    for _ in range(num_layers):
        masked_attention_output = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(y, y)
        masked_attention_output = Dropout(dropout_rate)(masked_attention_output)
        masked_attention_output = LayerNormalization()(Add()([y, masked_attention_output]))
        
        cross_attention_output = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(masked_attention_output, encoder_outputs)
        cross_attention_output = Dropout(dropout_rate)(cross_attention_output)
        y = LayerNormalization()(Add()([masked_attention_output, cross_attention_output]))
        
        ff_output = Dense(ff_dim, activation='relu')(y)
        ff_output = Dense(embed_dim)(ff_output)
        ff_output = Dropout(dropout_rate)(ff_output)
        y = LayerNormalization()(Add()([y, ff_output]))
    
    outputs = TimeDistributed(Dense(vocab_size, activation='softmax'))(y)
    
    model = Model([encoder_inputs, decoder_inputs], outputs)
    return model

Setting Up Data

In [18]:
# pruned + combined data
df = pd.read_csv('/Users/shreyanakum/Downloads/Sophomore Year/Summer 2024/Aristocrat-Model/data_collection/pruned_substitution_cipher_dataset_merged.csv')

ciphertexts = df['ciphertext']
plaintexts = df['plaintext']

In [19]:
# tokenize the sequences
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(ciphertexts + plaintexts)

# convert text to sequences of integers
ciphertext_sequences = tokenizer.texts_to_sequences(ciphertexts)
plaintext_sequences = tokenizer.texts_to_sequences(plaintexts)

# get the vocabulary size
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token

# pad the sequences
max_len = max(len(seq) for seq in ciphertext_sequences + plaintext_sequences)
max_len = min(max_len, 100)

In [20]:
ciphertext_sequences = pad_sequences(ciphertext_sequences, maxlen=max_len, padding='post')
plaintext_sequences = pad_sequences(plaintext_sequences, maxlen=max_len, padding='post')
y_input = pad_sequences(plaintext_sequences[:, :-1], maxlen=max_len, padding='post', value=0)  
y_output = pad_sequences(plaintext_sequences[:, 1:], maxlen=max_len, padding='post', value=0)

Actual Training
- Implemented early stopping for when the validation accuracy plateaus
- Teacher forcing: the decoder is fed the ground-truth plaintext during training

In [21]:
# build and compile the model
model = build_transformer_model(vocab_size, max_len)
model.compile(optimizer=Adam(learning_rate=1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [22]:
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,              # num of epochs with no improvement after which training will be stopped
    restore_best_weights=True
)

tensorboard_callback = TensorBoard(log_dir="./logs")

# teacher forcing and earlyStopping
model.fit(
    [ciphertext_sequences, y_input], 
    y_output, 
    epochs=10,
    batch_size=16, 
    validation_split=0.2, 
    callbacks=[early_stopping, tensorboard_callback]
)

Epoch 1/10
[1m3391/3391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1781s[0m 521ms/step - accuracy: 0.5479 - loss: 1.5851 - val_accuracy: 0.9973 - val_loss: 0.0099
Epoch 2/10
[1m3391/3391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1745s[0m 515ms/step - accuracy: 0.9972 - loss: 0.0104 - val_accuracy: 0.9983 - val_loss: 0.0058
Epoch 3/10
[1m3391/3391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1739s[0m 513ms/step - accuracy: 0.9983 - loss: 0.0063 - val_accuracy: 0.9985 - val_loss: 0.0053
Epoch 4/10
[1m3391/3391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1730s[0m 510ms/step - accuracy: 0.9986 - loss: 0.0051 - val_accuracy: 0.9988 - val_loss: 0.0042
Epoch 5/10
[1m3391/3391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1585s[0m 467ms/step - accuracy: 0.9987 - loss: 0.0043 - val_accuracy: 0.9989 - val_loss: 0.0039
Epoch 6/10
[1m3391/3391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1522s[0m 449ms/step - accuracy: 0.9989 - loss: 0.0040 - val_accuracy: 0.9990 - val

<keras.src.callbacks.history.History at 0x17ac28c90>

## After Training Metrics

In [23]:
# TODO: add post-training evaluation metrics

## Testing with Sample Input

In [24]:
import numpy as np

sample_ciphertext = "LYC EOJZYPA LJC FJUDP LYR JKNTCC FLD GPYFDQ UYPAQTR EOJZYPA JF FLD OTPQTP EJOOJQYGR JPQ FLD NTZJO JOWDNF LJOO."
sample_input_seq = tokenizer.texts_to_sequences([sample_ciphertext])
sample_input_seq = pad_sequences(sample_input_seq, maxlen=max_len, padding='post')

start_token = 0  
decoder_input_seq = np.zeros((1, max_len))
decoder_input_seq[0, 0] = start_token

predicted_text = ""
for i in range(1, max_len):
    predicted_output = model.predict([sample_input_seq, decoder_input_seq])
    next_token = np.argmax(predicted_output[0, i-1, :])
    
    decoder_input_seq[0, i] = next_token
    
    # Debugging: Print token and corresponding character
    print(f"Step {i}: Token={next_token}, Char={tokenizer.sequences_to_texts([[next_token]])[0]}")
    
    predicted_char = tokenizer.sequences_to_texts([[next_token]])[0]
    predicted_text += predicted_char

print("Predicted Text:", predicted_text)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 923ms/step
Step 1: Token=0, Char=
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Step 2: Token=0, Char=
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Step 3: Token=0, Char=
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Step 4: Token=0, Char=
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Step 5: Token=0, Char=
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Step 6: Token=0, Char=
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Step 7: Token=0, Char=
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Step 8: Token=0, Char=
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Step 9: Token=0, Char=
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Step 10: Token=0, Char=
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[