In [9]:
import tensorflow
from tensorflow.keras.layers import Input, Embedding, MultiHeadAttention, LayerNormalization, Dense, Dropout, Add, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
import numpy as np
import pandas as pd


In [10]:
# pruned data
df = pd.read_csv('/Users/shreyanakum/Downloads/Sophomore Year/Summer 2024/Aristocrat-Model/data_collection/pruned_substitution_cipher_dataset_merged.csv')

ciphertexts = df['ciphertext']
plaintexts = df['plaintext']

In [11]:
# tokenize the sequences
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(ciphertexts + plaintexts)

# convert text to sequences of integers
ciphertext_sequences = tokenizer.texts_to_sequences(ciphertexts)
plaintext_sequences = tokenizer.texts_to_sequences(plaintexts)

# get the vocabulary size
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token

# pad the sequences
max_len = max(len(seq) for seq in ciphertext_sequences + plaintext_sequences)
max_len = min(max_len, 100)

ciphertext_sequences = pad_sequences(ciphertext_sequences, maxlen=max_len, padding='post')
plaintext_sequences = pad_sequences(plaintext_sequences, maxlen=max_len, padding='post')

In [12]:
# prep labels (shifted by one for decoder training)
y = np.array(plaintext_sequences)
y_input = y[:, :-1]  # Inputs to the decoder (remove last token)
y_output = y[:, 1:]  # Targets for the decoder (remove first token)

# adjust target preparation to align with model output
y_output = pad_sequences(y_output, maxlen=max_len, padding='post', value=0)  # Ensure padding if needed

In [13]:
def build_transformer_model(vocab_size, seq_len, embed_dim=128, num_heads=4, ff_dim=512, num_layers=4, dropout_rate=0.1):
    inputs = Input(shape=(seq_len,))
    x = Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
    
    for _ in range(num_layers):
        attention_output = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(x, x)
        attention_output = Dropout(dropout_rate)(attention_output)
        attention_output = LayerNormalization()(Add()([x, attention_output]))
        
        # feed-forward network
        ff_output = Dense(ff_dim, activation='relu')(attention_output)
        ff_output = Dense(embed_dim)(ff_output)
        ff_output = Dropout(dropout_rate)(ff_output)
        x = LayerNormalization()(Add()([attention_output, ff_output]))
    
    # use TimeDistributed to apply Dense layer to each time step
    outputs = TimeDistributed(Dense(vocab_size, activation='softmax'))(x)
    
    model = Model(inputs=inputs, outputs=outputs)
    return model

model = build_transformer_model(vocab_size, max_len)
model.compile(optimizer=Adam(learning_rate=1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [14]:
from tensorflow.keras.callbacks import TensorBoard
tensorboard_callback = TensorBoard(log_dir="./logs")
tbCallBack = TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)


# train the model
model.fit(ciphertext_sequences, y_output, epochs=10, batch_size=16, validation_split=0.2)

Epoch 1/10
[1m3391/3391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m803s[0m 235ms/step - accuracy: 0.2803 - loss: 2.5473 - val_accuracy: 0.2870 - val_loss: 2.4717
Epoch 2/10
[1m3391/3391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m788s[0m 232ms/step - accuracy: 0.2876 - loss: 2.4718 - val_accuracy: 0.2874 - val_loss: 2.4664
Epoch 3/10
[1m3391/3391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m707s[0m 209ms/step - accuracy: 0.2889 - loss: 2.4621 - val_accuracy: 0.2876 - val_loss: 2.4643
Epoch 4/10
[1m3391/3391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m704s[0m 208ms/step - accuracy: 0.2872 - loss: 2.4662 - val_accuracy: 0.2876 - val_loss: 2.4626
Epoch 5/10
[1m3391/3391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m686s[0m 202ms/step - accuracy: 0.2874 - loss: 2.4637 - val_accuracy: 0.2876 - val_loss: 2.4613
Epoch 6/10
[1m3391/3391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m679s[0m 200ms/step - accuracy: 0.2872 - loss: 2.4635 - val_accuracy: 0.2877 - val_loss:

<keras.src.callbacks.history.History at 0x30571f610>

## Testing with Sample Input

In [19]:
sample_text = ["OB ISZDPH *GQG EFBE KZE NZUZPJ SQQO ZE EQ EOFNN AKFA BQT YFP'A EKQTA FA AKD YFA VZAKQTA JDAAZPJ F OQTAKITN QI KFZS."]
# Hello world.
sample_sequences = tokenizer.texts_to_sequences(sample_text)
sample_sequences = pad_sequences(sample_sequences, maxlen=max_len, padding='post')
predictions = model.predict(sample_sequences)

# assuming vocab_size = 37
index_to_char = {index: char for char, index in tokenizer.word_index.items()}
predicted_indices = np.argmax(predictions, axis=-1)

predicted_text = '-'.join([index_to_char.get(index, '') for index in predicted_indices[0]])
print(predicted_text)
print('my friend *bob says his living room is so small that you can‘t shout at the cat without getting a mouthful of hair')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
 - - - -t- - - -t- - - - - - -t- - - - -t- - -t- - -t- - - - - -t- - - - -t- - - -t- - - -s- -t- - - - - -t- - -t- - - -t- - - -t- - - - - - - -t- - - - - - - -t- -t- - - - - - - - -t- - -t- - - - -
my friend *bob says his living room is so small that you can‘t shout at the cat without getting a mouthful of hair
