In [28]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import tensorflow as tf
import keras_tuner as kt
import pathlib

from credentials import SAMPLE_DB, FULL_DB
from librarian import SAMPLE_DATA_DIR

import encoders
import db_connect
import helpers

In [29]:
DATA_DIR = SAMPLE_DATA_DIR
CHUNK_SIZE = 4096

ENCRYPTED_FILE_LIMIT = 2 # -1 to disable limit
SPLIT_SEED = 42
SPLIT_TEST_SIZE = 0.25     # 0.25 is default

db = db_connect.DB(SAMPLE_DB)

In [30]:
# Get database IDs for encoders and key types

encoder_ids= {}
key_type_ids = {}

with db.get_session() as session:
    for encoder in encoders.ALL_ENCODER_NAMES:
        id = db.get_encoder_id(session, encoder)
        encoder_ids[encoder] = id

    print(f"Encoder IDs: {encoder_ids}")

    for key_type in encoders.KEY_NAMES:
        id = db.get_key_type_id(session, key_type)
        key_type_ids[key_type] = id

    print(f"Key Type IDs: {key_type_ids}")

Encoder IDs: {'None': 1, 'Simplifier': 2, 'Caesar Cipher': 3, 'Substitution Cipher': 4, 'Enigma Machine': 5}
Key Type IDs: {'Character Offset': 1, 'Character Map': 2, 'Rotor Settings': 3}


In [31]:
# Map source ID to plaintext file (1) details, and source ID to corresponding ciphertext files (1+) details
sid_to_p = {}
sid_to_c = {}

cipher_id = encoder_ids[encoders.ENCODER_CAESAR]
with db.get_session() as session:
    # Get all files encrypted with the cipher we care about
    encrypted_files = db.get_files_by_source_and_encoder(session, -1, cipher_id)

    if len(encrypted_files) > ENCRYPTED_FILE_LIMIT and ENCRYPTED_FILE_LIMIT > 0:
        encrypted_files = encrypted_files[0:ENCRYPTED_FILE_LIMIT]

    for c in encrypted_files:
        sid = c.source_id
    
        if sid not in sid_to_p:
            plaintext_ids = db.get_files_by_source_and_encoder(session, sid, encoder_ids[encoders.ENCODER_SIMPLIFIER])
            if len(plaintext_ids) != 1:
                raise Exception(f"Found {len(plaintext_ids)} plaintexts for source ID {sid}; should be exactly 1")
            sid_to_p[sid] = plaintext_ids[0]

        if sid not in sid_to_c:
            sid_to_c[sid] = []
        sid_to_c[sid].append(c)

len(sid_to_p), len(sid_to_c)

(6, 6)

In [32]:
# Convert a string to a list of lists of numbers, broken up into chunks.
# If the length is not evenly divisible by chunk_size, the final chunk
# will overlap the previous one so the whole string gets converted.
def string_to_bytes(text, chunk_size) -> [[]]:
    if len(text) < chunk_size:
        raise Exception(f"Chunk size ({chunk_size}) must be no greater than text length ({len(text)})")

    chunks = []

    offset = 0
    while offset < len(text):
        if (offset + chunk_size >= len(text)):
            offset = len(text) - chunk_size
        
        # I'm sure there is a more optimal way to do this...
        encoded = text[offset : offset + chunk_size].encode('UTF-8')
        numbered = np.array([b for b in encoded])

        if len(encoded) != chunk_size or len(numbered) != chunk_size:
            raise Exception(f"Conversion chunk size error: {len(text)}, {len(encoded)}, {len(numbered)}, {chunk_size}")
        chunks.append(numbered)

        offset += chunk_size     
        
    return chunks

# Simple test cases
string_to_bytes("ABCDEFG", 1), string_to_bytes("ABCDEFG", 2), string_to_bytes("ABCDEFG", 6), string_to_bytes("ABCDEFG", 7)

([array([65]),
  array([66]),
  array([67]),
  array([68]),
  array([69]),
  array([70]),
  array([71])],
 [array([65, 66]), array([67, 68]), array([69, 70]), array([70, 71])],
 [array([65, 66, 67, 68, 69, 70]), array([66, 67, 68, 69, 70, 71])],
 [array([65, 66, 67, 68, 69, 70, 71])])

In [33]:
# Build up the features (X, the cipher texts as values) and targets (y, the plain texts as values).
# Note the targets can be repeated since a plaintext can be encrypted repeatedly by different keys.
X = []
y = []

for sid in sid_to_p:
    plaintext = helpers.read_text_file(os.path.normpath(sid_to_p[sid].path))
    target_chunks = string_to_bytes(plaintext, CHUNK_SIZE)

    for c in sid_to_c[sid]:
        ciphertext = helpers.read_text_file(os.path.normpath(c.path))
        feature_chunks = string_to_bytes(ciphertext, CHUNK_SIZE)

        if len(target_chunks) != len(feature_chunks):
            raise Exception(f"Chunk count mismatch; {len(target_chunks)} != {len(feature_chunks)}")

        for i in range (len(target_chunks)):
            X.append(feature_chunks[i])
            y.append(target_chunks[i])

X = np.array(X)
y = np.array(y)

X.shape, y.shape, X[0].shape, y[0].shape

((1818, 4096), (1818, 4096), (4096,), (4096,))

In [34]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=SPLIT_TEST_SIZE, random_state=SPLIT_SEED)
print( len(X), len(y), len(X_train), len(X_test), len(y_train), len(y_test) )

1818 1818 1363 455 1363 455


In [35]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
X_train_scaled.shape, X_test_scaled.shape

((1363, 4096), (455, 4096))

In [36]:
in_shape = (CHUNK_SIZE,)
nn = tf.keras.models.Sequential()

# Input layer
nn.add(tf.keras.Input(shape=in_shape))

# Hidden layers
activations = ["tanh", "relu", "selu", "elu", "exponential"]
unit_counts = [CHUNK_SIZE*2]
for u in unit_counts:
    for a in activations:
        nn.add(tf.keras.layers.Dense(units=u, activation=a))

# Output layer
nn.add(tf.keras.layers.Dense(units=CHUNK_SIZE))

# Check the structure of the model
nn.summary()

I0000 00:00:1732074710.742625     419 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1732074710.882879     419 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1732074710.882938     419 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1732074710.885789     419 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1732074710.885831     419 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

In [38]:
# Compile the Sequential model together and customize metrics
nn.compile(loss="mean_squared_error", metrics=["mae"])

# Fit the model to the training data
fit_model = nn.fit(X_train_scaled, y_train, epochs=3)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/3


I0000 00:00:1732074713.846390     773 service.cc:146] XLA service 0x7f27e8017c20 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732074713.846441     773 service.cc:154]   StreamExecutor device (0): NVIDIA RTX 2000 Ada Generation Laptop GPU, Compute Capability 8.9
2024-11-19 19:51:53.905462: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-11-19 19:51:54.029461: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 90101






















[1m 3/43[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 62ms/step - loss: nan - mae: nan                                                   

I0000 00:00:1732074721.557613     773 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m42/43[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 55ms/step - loss: nan - mae: nan





















[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 357ms/step - loss: nan - mae: nan
Epoch 2/3
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step - loss: nan - mae: nan
Epoch 3/3
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 56ms/step - loss: nan - mae: nan
15/15 - 6s - 379ms/step - loss: nan - mae: nan
Loss: nan, Accuracy: nan
