In [38]:
import tensorflow
from tensorflow.keras.layers import Input, Embedding, MultiHeadAttention, LayerNormalization, Dense, Dropout, Add, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
import numpy as np
import pandas as pd


In [39]:
# pruned data
df = pd.read_csv('/Users/shreyanakum/Downloads/aristocrat_model/Aristocrat-Model/data_collection/pruned_substitution_cipher_dataset_merged.csv')

ciphertexts = df['ciphertext']
plaintexts = df['plaintext']

In [40]:
# tokenize the sequences
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(ciphertexts + plaintexts)

# convert text to sequences of integers
ciphertext_sequences = tokenizer.texts_to_sequences(ciphertexts)
plaintext_sequences = tokenizer.texts_to_sequences(plaintexts)

# get the vocabulary size
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token

# pad the sequences
max_len = max(len(seq) for seq in ciphertext_sequences + plaintext_sequences)
max_len = min(max_len, 100)

ciphertext_sequences = pad_sequences(ciphertext_sequences, maxlen=max_len, padding='post')
plaintext_sequences = pad_sequences(plaintext_sequences, maxlen=max_len, padding='post')

In [41]:
# prep labels (shifted by one for decoder training)
y = np.array(plaintext_sequences)
y_input = y[:, :-1]  # Inputs to the decoder (remove last token)
y_output = y[:, 1:]  # Targets for the decoder (remove first token)

# adjust target preparation to align with model output
y_output = pad_sequences(y_output, maxlen=max_len, padding='post', value=0)  # Ensure padding if needed

In [42]:
def build_transformer_model(vocab_size, seq_len, embed_dim=128, num_heads=4, ff_dim=512, num_layers=4, dropout_rate=0.1):
    inputs = Input(shape=(seq_len,))
    x = Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
    
    for _ in range(num_layers):
        attention_output = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(x, x)
        attention_output = Dropout(dropout_rate)(attention_output)
        attention_output = LayerNormalization()(Add()([x, attention_output]))
        
        # feed-forward network
        ff_output = Dense(ff_dim, activation='relu')(attention_output)
        ff_output = Dense(embed_dim)(ff_output)
        ff_output = Dropout(dropout_rate)(ff_output)
        x = LayerNormalization()(Add()([attention_output, ff_output]))
    
    # use TimeDistributed to apply Dense layer to each time step
    outputs = TimeDistributed(Dense(vocab_size, activation='softmax'))(x)
    
    model = Model(inputs=inputs, outputs=outputs)
    return model

model = build_transformer_model(vocab_size, max_len)
model.compile(optimizer=Adam(learning_rate=1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [43]:
from tensorflow.keras.callbacks import TensorBoard
tensorboard_callback = TensorBoard(log_dir="./logs")
tbCallBack = TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)


# train the model
model.fit(ciphertext_sequences, y_output, epochs=10, batch_size=16, validation_split=0.2)

Epoch 1/10
[1m3391/3391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m881s[0m 258ms/step - accuracy: 0.2792 - loss: 2.5489 - val_accuracy: 0.2872 - val_loss: 2.4712
Epoch 2/10
[1m3391/3391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m856s[0m 252ms/step - accuracy: 0.2875 - loss: 2.4718 - val_accuracy: 0.2873 - val_loss: 2.4673
Epoch 3/10
[1m3391/3391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m839s[0m 247ms/step - accuracy: 0.2878 - loss: 2.4660 - val_accuracy: 0.2876 - val_loss: 2.4640
Epoch 4/10
[1m  89/3391[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m12:51[0m 234ms/step - accuracy: 0.2885 - loss: 2.4559

KeyboardInterrupt: 

## Testing with Sample Input

In [44]:
sample_text = ["UYB FPGY WOJB FX YFOHF O EGYQUYYGXZ OTXUF FPB BEGFY KOEB AHXK FPGY GW OEEHBYY"]
sample_sequences = tokenizer.texts_to_sequences(sample_text)
sample_sequences = pad_sequences(sample_sequences, maxlen=max_len, padding='post')
predictions = model.predict(sample_sequences)

# assuming vocab_size = 37
index_to_char = {index: char for char, index in tokenizer.word_index.items()}
predicted_indices = np.argmax(predictions, axis=-1)

predicted_text = ''.join([index_to_char.get(index, '') for index in predicted_indices[0]])
print(predicted_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 675ms/step
   t    t    t  t     t t          t     t   t     t    t    t    t  t       
