In [11]:
import tensorflow
from tensorflow.keras.layers import Input, Embedding, MultiHeadAttention, LayerNormalization, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import TimeDistributed
import numpy as np
import pandas as pd


In [12]:
# pruned data
df = pd.read_csv('/Users/shreyanakum/Downloads/aristocrat_model/Aristocrat-Model/data_collection/pruned_substitution_cipher_dataset.csv')

ciphertexts = df['ciphertext']
plaintexts = df['plaintext']

In [13]:
# tokenize the sequences
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(ciphertexts + plaintexts)

# convert text to sequences of integers
ciphertext_sequences = tokenizer.texts_to_sequences(ciphertexts)
plaintext_sequences = tokenizer.texts_to_sequences(plaintexts)

# get the vocabulary size
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token

# pad the sequences
max_len = max(len(seq) for seq in ciphertext_sequences + plaintext_sequences)
max_len = min(max_len, 100)

ciphertext_sequences = pad_sequences(ciphertext_sequences, maxlen=max_len, padding='post')
plaintext_sequences = pad_sequences(plaintext_sequences, maxlen=max_len, padding='post')

In [14]:
# prep labels (shifted by one for decoder training)
y = np.array(plaintext_sequences)
y_input = y[:, :-1]  # Inputs to the decoder (remove last token)
y_output = y[:, 1:]  # Targets for the decoder (remove first token)

# adjust target preparation to align with model output
y_output = pad_sequences(y_output, maxlen=max_len, padding='post', value=0)  # Ensure padding if needed

In [15]:
def build_transformer_model(vocab_size, seq_len, embed_dim=128, num_heads=4, ff_dim=512):
    inputs = Input(shape=(seq_len,))
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)

    # transformer block
    attention_output = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(embedding_layer, embedding_layer)
    attention_output = LayerNormalization()(attention_output + embedding_layer)
    
    ff_output = Dense(ff_dim, activation='relu')(attention_output)
    ff_output = Dense(embed_dim)(ff_output)
    ff_output = LayerNormalization()(ff_output + attention_output)

    # use TimeDistributed to apply dense to each time step
    outputs = TimeDistributed(Dense(vocab_size, activation='softmax'))(ff_output)

    model = Model(inputs=inputs, outputs=outputs)
    return model

model = build_transformer_model(vocab_size, max_len)  # seq_len = max_len
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [16]:
from tensorflow.keras.callbacks import TensorBoard
tensorboard_callback = TensorBoard(log_dir="./logs")

# train the model
model.fit(ciphertext_sequences, y_output, epochs=10, batch_size=16, validation_split=0.2)

Epoch 1/10
[1m1834/1834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 79ms/step - accuracy: 0.2819 - loss: 2.5331 - val_accuracy: 0.2866 - val_loss: 2.4766
Epoch 2/10
[1m1834/1834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 79ms/step - accuracy: 0.2867 - loss: 2.4739 - val_accuracy: 0.2864 - val_loss: 2.4748
Epoch 3/10
[1m1834/1834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 78ms/step - accuracy: 0.2859 - loss: 2.4730 - val_accuracy: 0.2870 - val_loss: 2.4691
Epoch 4/10
[1m1834/1834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 76ms/step - accuracy: 0.2866 - loss: 2.4689 - val_accuracy: 0.2872 - val_loss: 2.4675
Epoch 5/10
[1m1834/1834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 76ms/step - accuracy: 0.2869 - loss: 2.4679 - val_accuracy: 0.2868 - val_loss: 2.4691
Epoch 6/10
[1m1834/1834[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 77ms/step - accuracy: 0.2871 - loss: 2.4657 - val_accuracy: 0.2868 - val_loss: 2.468

<keras.src.callbacks.history.History at 0x301909750>

## Testing with Sample Input

In [17]:
# 1. Prepare Sample Input
sample_text = ["KONNC, LYR VONICBO AC VUPUQORUL"]
sample_sequences = tokenizer.texts_to_sequences(sample_text)
sample_sequences = pad_sequences(sample_sequences, maxlen=max_len, padding='post')

# 2. Predict Using the Model
predictions = model.predict(sample_sequences)

# 3. Decode Predictions
import numpy as np

# Assuming vocab_size = 37 (adjust based on your setup)
index_to_char = {index: char for char, index in tokenizer.word_index.items()}
predicted_indices = np.argmax(predictions, axis=-1)

# Convert indices to text
predicted_text = ''.join([index_to_char.get(index, '') for index in predicted_indices[0]])
print(predicted_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 512ms/step
      a   a       a  a         
