In [23]:
import tensorflow
from tensorflow.keras.layers import Input, Embedding, MultiHeadAttention, LayerNormalization, Dense, Dropout, Add, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
import numpy as np
import pandas as pd


In [24]:
# pruned data
df = pd.read_csv('/Users/shreyanakum/Downloads/aristocrat_model/Aristocrat-Model/data_collection/pruned_substitution_cipher_dataset.csv')

ciphertexts = df['ciphertext']
plaintexts = df['plaintext']

In [25]:
# tokenize the sequences
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(ciphertexts + plaintexts)

# convert text to sequences of integers
ciphertext_sequences = tokenizer.texts_to_sequences(ciphertexts)
plaintext_sequences = tokenizer.texts_to_sequences(plaintexts)

# get the vocabulary size
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token

# pad the sequences
max_len = max(len(seq) for seq in ciphertext_sequences + plaintext_sequences)
max_len = min(max_len, 100)

ciphertext_sequences = pad_sequences(ciphertext_sequences, maxlen=max_len, padding='post')
plaintext_sequences = pad_sequences(plaintext_sequences, maxlen=max_len, padding='post')

In [26]:
# prep labels (shifted by one for decoder training)
y = np.array(plaintext_sequences)
y_input = y[:, :-1]  # Inputs to the decoder (remove last token)
y_output = y[:, 1:]  # Targets for the decoder (remove first token)

# adjust target preparation to align with model output
y_output = pad_sequences(y_output, maxlen=max_len, padding='post', value=0)  # Ensure padding if needed

In [27]:
def build_transformer_model(vocab_size, seq_len, embed_dim=128, num_heads=4, ff_dim=512, num_layers=4, dropout_rate=0.1):
    inputs = Input(shape=(seq_len,))
    x = Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
    
    for _ in range(num_layers):
        attention_output = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(x, x)
        attention_output = Dropout(dropout_rate)(attention_output)
        attention_output = LayerNormalization()(Add()([x, attention_output]))
        
        # feed-forward network
        ff_output = Dense(ff_dim, activation='relu')(attention_output)
        ff_output = Dense(embed_dim)(ff_output)
        ff_output = Dropout(dropout_rate)(ff_output)
        x = LayerNormalization()(Add()([attention_output, ff_output]))
    
    # use TimeDistributed to apply Dense layer to each time step
    outputs = TimeDistributed(Dense(vocab_size, activation='softmax'))(x)
    
    model = Model(inputs=inputs, outputs=outputs)
    return model

model = build_transformer_model(vocab_size, max_len)
model.compile(optimizer=Adam(learning_rate=1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [28]:
from tensorflow.keras.callbacks import TensorBoard
tensorboard_callback = TensorBoard(log_dir="./logs")

# train the model
model.fit(ciphertext_sequences, y_output, epochs=10, batch_size=16, validation_split=0.2)

Epoch 1/10
[1m1834/1834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m453s[0m 243ms/step - accuracy: 0.2774 - loss: 2.5778 - val_accuracy: 0.2861 - val_loss: 2.4824
Epoch 2/10
[1m1834/1834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m445s[0m 243ms/step - accuracy: 0.2854 - loss: 2.4879 - val_accuracy: 0.2872 - val_loss: 2.4727
Epoch 3/10
[1m1834/1834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m443s[0m 242ms/step - accuracy: 0.2866 - loss: 2.4748 - val_accuracy: 0.2861 - val_loss: 2.4700
Epoch 4/10
[1m1834/1834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 241ms/step - accuracy: 0.2864 - loss: 2.4723 - val_accuracy: 0.2870 - val_loss: 2.4680
Epoch 5/10
[1m1834/1834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m447s[0m 244ms/step - accuracy: 0.2867 - loss: 2.4694 - val_accuracy: 0.2872 - val_loss: 2.4687
Epoch 6/10
[1m1834/1834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m448s[0m 244ms/step - accuracy: 0.2866 - loss: 2.4688 - val_accuracy: 0.2873 - val_loss:

<keras.src.callbacks.history.History at 0x16c778d10>

## Testing with Sample Input

In [30]:
# 1. Prepare Sample Input
sample_text = ["UYB FPGY WOJB FX YFOHF O EGYQUYYGXZ OTXUF FPB BEGFY KOEB AHXK FPGY GW OEEHBYY"]
sample_sequences = tokenizer.texts_to_sequences(sample_text)
sample_sequences = pad_sequences(sample_sequences, maxlen=max_len, padding='post')

# 2. Predict Using the Model
predictions = model.predict(sample_sequences)

# 3. Decode Predictions
import numpy as np

# Assuming vocab_size = 37 (adjust based on your setup)
index_to_char = {index: char for char, index in tokenizer.word_index.items()}
predicted_indices = np.argmax(predictions, axis=-1)

# Convert indices to text
predicted_text = ''.join([index_to_char.get(index, '') for index in predicted_indices[0]])
print(predicted_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
   t    t    t  t     t t          t     t   t     t    t    t    t  t       
