In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import tensorflow as tf
import keras_tuner as kt
import pathlib
import random
import pandas as pd
import sys

from credentials import CONNECTION_INFO
from constants import *

import encoders
import db_connect
import helpers

2024-11-30 15:41:13.021688: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-30 15:41:13.029864: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-30 15:41:13.040191: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-30 15:41:13.043297: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-30 15:41:13.050702: I tensorflow/core/platform/cpu_feature_guar

## Config
This notebook has a lot of options to adjust, most of which are controlled here.

In [2]:
# --- What is being decoded, and how the input is cut up ---
ENCODER = encoders.ENCODER_CAESAR
CHUNK_SIZE = 512  # Length of each chunk handed to the network
PROCESSING_UNITS = CHUNK_SIZE // 4

EXTRA_CHECKS = True  # Whether to run some (potentially slow) debug checks

# Exactly one of these two is True: predict the plaintext, or predict the key
INFER_TEXT = False
INFER_KEY = not INFER_TEXT

USE_CUSTOM_METRICS = True
USE_CUSTOM_OUTPUT_ACTIVATION = True

# --- Metric, loss, optimizer and output width for the selected mode ---
MAIN_ACCURACY_METRIC = "mae"  # Identical in both modes
if INFER_TEXT:
    LOSS_METRIC = "mean_squared_error"
    OPTIMIZER = "sgd"
    OUTPUT_SIZE = CHUNK_SIZE  # One output per input position
else:
    LOSS_METRIC = "mae"
    OPTIMIZER = "adamax"

    if ENCODER == encoders.ENCODER_CAESAR:
        OUTPUT_SIZE = 1  # The key is a single number
    elif ENCODER == encoders.ENCODER_SUBST:
        OUTPUT_SIZE = len(encoders.CHARSET)
    else:
        raise Exception(f"Unsupported encoder {ENCODER}")

# --- Data volume ---
ENCRYPTED_FILE_LIMIT = -1  # -1 to disable limit

# Start from BASE_TRAIN_PCT (0.75 is the default); the counts are then capped
# at the MAX_* values below.
BASE_TRAIN_PCT = 0.75
MAX_TRAIN_COUNT = 1000  # -1 to disable
MAX_TEST_COUNT = 1000   # -1 to disable
SPLIT_SEED = 42

# --- Model persistence ---
LOAD_BEST_MODEL = False  # If False, a new model will be created from scratch
SAVE_BEST_MODEL = True
BEST_PATH = './saved_models/best.keras'

# --- Tuner vs. hard-coded network ---
TUNE_NETWORK = True
TUNE_QUICKLY = False  # Set True to sanity check the model builder
BUILD_NETWORK = not TUNE_NETWORK
TRAIN_MODEL = BUILD_NETWORK

TUNER_DIRECTORY = "tuner_projects"
TUNER_PROJECT_NAME = "KT"

# --- Training ---
EPOCHS = 5
# Keras' default batch size is 32. Going higher speeds things up a LOT, but may
# cause memory problems, so the batch shrinks as the chunk size grows.
BATCH_SIZE = max(32, round(256 * (512/CHUNK_SIZE)))
CHUNK_SIZE, PROCESSING_UNITS, BATCH_SIZE

(512, 128, 256)

# Data Retrieval and Structuring

In [3]:
# Look up the database IDs for every known encoder and key type.
# Dict comprehensions are used so that no loop variable (previously one named
# `id`, which shadowed the builtin for the rest of the kernel session) leaks
# into the notebook namespace.

db = db_connect.DB(CONNECTION_INFO)
with db.get_session() as session:
    # Encoder name -> database ID
    encoder_ids = {
        encoder_name: db.get_encoder_id(session, encoder_name)
        for encoder_name in encoders.ALL_ENCODER_NAMES
    }
    print(f"Encoder IDs: {encoder_ids}")

    # Key type name -> database ID
    key_type_ids = {
        key_type_name: db.get_key_type_id(session, key_type_name)
        for key_type_name in encoders.KEY_NAMES
    }
    print(f"Key Type IDs: {key_type_ids}")

Encoder IDs: {'None': 1, 'Simplifier': 2, 'Caesar Cipher': 3, 'Substitution Cipher': 4}
Key Type IDs: {'Character Offset': 1, 'Character Map': 2}


In [4]:
# Index the file records by source ID:
#   sid_to_p: source ID -> its single plaintext file record
#   sid_to_c: source ID -> list of its ciphertext file records (one or more)
sid_to_p = {}
sid_to_c = {}

cipher_id = encoder_ids[ENCODER]
with db.get_session() as session:
    # Every file encrypted with the cipher under study
    encrypted_files = db.get_files_by_source_and_encoder(session, -1, cipher_id)

    # Optionally work with a random subset only
    if 0 < ENCRYPTED_FILE_LIMIT < len(encrypted_files):
        print(f"Found {len(encrypted_files)} encrypted files")
        encrypted_files = random.sample(encrypted_files, ENCRYPTED_FILE_LIMIT)
    print(f"Using {len(encrypted_files)} encrypted files")

    for cipher_file in encrypted_files:
        source_id = cipher_file.source_id

        # First time this source is seen: fetch its (unique) plaintext record
        if source_id not in sid_to_p:
            plaintext_files = db.get_files_by_source_and_encoder(
                session, source_id, encoder_ids[encoders.ENCODER_SIMPLIFIER]
            )
            if len(plaintext_files) != 1:
                raise Exception(f"Found {len(plaintext_files)} plaintexts for source ID {source_id}; should be exactly 1")
            sid_to_p[source_id] = plaintext_files[0]

        sid_to_c.setdefault(source_id, []).append(cipher_file)

len(sid_to_p), len(sid_to_c)

Using 114 encrypted files


(19, 19)

In [5]:
# Assemble the model inputs and targets, one row per chunk:
#   X - ciphertext chunks, expressed as offsets
#   y - the matching plaintext chunk as offsets (INFER_TEXT) or the key (INFER_KEY)
# Targets are not necessarily unique: every chunk of a file gets that file's key.
X = []
y = []

with db.get_session() as session:
    for sid, plain_file in sid_to_p.items():
        if INFER_TEXT:
            plaintext = encoders.string_to_offsets(helpers.read_text_file(plain_file.path))
            target_chunks = helpers.chunkify(plaintext, CHUNK_SIZE)

        for cipher_file in sid_to_c[sid]:
            ciphertext = encoders.string_to_offsets(helpers.read_text_file(cipher_file.path))
            feature_chunks = helpers.chunkify(ciphertext, CHUNK_SIZE)

            if INFER_KEY:
                if ENCODER == encoders.ENCODER_CAESAR:
                    # The stored key value is used directly as a single float target
                    key_value = float(db.get_key_by_id(session, cipher_file.key_id).value)

                    if EXTRA_CHECKS:
                        # Randomly sample files and confirm that the key from the
                        # database really decodes the ciphertext to the plaintext
                        CHECK_CHANCE = 0.1
                        if random.random() < CHECK_CHANCE:
                            plaintext = encoders.string_to_offsets(helpers.read_text_file(plain_file.path))
                            plaintext_str = encoders.offsets_to_string(plaintext)
                            ciphertext_str = encoders.offsets_to_string(ciphertext)
                            decoded_str = encoders.decode_caesar(ciphertext_str, int(key_value))
                            if decoded_str != plaintext_str:
                                print(decoded_str == plaintext_str)
                                print(decoded_str[0:128], plaintext_str[0:128])
                                raise Exception("Decode error")

                elif ENCODER == encoders.ENCODER_SUBST:
                    # The stored key is a string; it becomes a float array of offsets
                    key_str = db.get_key_by_id(session, cipher_file.key_id).value
                    key_value_ints = encoders.string_to_offsets(key_str)
                    key_value = np.array(key_value_ints).astype(float)

                else:
                    raise Exception(f"Unsupported encoder {ENCODER}")

            # One sample per ciphertext chunk
            for i, feature_chunk in enumerate(feature_chunks):
                X.append(np.array(feature_chunk).astype(float))

                if INFER_TEXT:
                    y.append(np.array(target_chunks[i]).astype(float))

                if INFER_KEY:
                    y.append(key_value)

X = np.array(X)
y = np.array(y)

X.shape, y.shape, sys.getsizeof(X), sys.getsizeof(y)

((69396, 512), (69396,), 284246144, 555280)

In [6]:
# Choose the train and test sample counts, then split.
# Training starts at BASE_TRAIN_PCT of all samples and testing takes the rest;
# each count is then capped at its MAX_* setting (a cap of -1 disables it).
train_count = int(round(len(y) * BASE_TRAIN_PCT))
if -1 < MAX_TRAIN_COUNT < train_count:
    print(f"Train count would be {train_count}")
    train_count = int(MAX_TRAIN_COUNT)
print(f"Train count is {train_count}")

test_count = len(y) - train_count
if -1 < MAX_TEST_COUNT < test_count:
    print(f"Test count would be {test_count}")
    test_count = int(MAX_TEST_COUNT)
print(f"Test count is {test_count}")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=train_count,
    test_size=test_count,
    random_state=SPLIT_SEED,
)
print( "Initial counts: ", len(X), len(y), len(X_train), len(X_test), len(y_train), len(y_test) )

# The full, unsplit arrays are large and no longer needed; release them
del X, y

Train count would be 52047
Train count is 1000
Test count would be 68396
Test count is 1000
Initial counts:  69396 69396 1000 1000 1000 1000


In [7]:
# Standardize the features. The scaler is fitted on the training samples only
# and then applied to both the training and the test samples.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (X_scaler.transform(samples) for samples in (X_train, X_test))
X_train_scaled.shape, X_test_scaled.shape

((1000, 512), (1000, 512))

# TensorFlow Callbacks
Custom output-layer activation function, custom loss and accuracy functions, and the checkpoint callback that saves the best model.

In [8]:
# Empty Keras' global custom-object registry before importing the project's
# custom activation, loss and metric functions.
# NOTE(review): presumably tf_helpers registers these objects when imported, which
# would explain clearing the registry first — confirm against tf_helpers.
tf.keras.utils.get_custom_objects().clear()

# modulo_output: custom output activation; modulo_distance_loss: custom loss;
# modulo_distance_accuracy / modulo_rounded_accuracy: custom metrics;
# initialize_save_best: builds the checkpoint callback used when training.
from tf_helpers import modulo_output, modulo_distance_loss, modulo_distance_accuracy, modulo_rounded_accuracy, initialize_save_best

# Hyperband Tuning

In [9]:
import model_tuner

# Search budget. The aim is to get a decent idea of good parameters, not to
# create a final model.
MAX_EPOCHS_PER_MODEL = 30 # Hyperband max_epochs. Behaves oddly below 3.
HYPERBAND_ITERATIONS = 2  # "Number of times to iterate over the full Hyperband algorithm"
EXECUTIONS_PER_TRIAL = 2  # Each execution trains the trial's model from scratch
SEARCH_FIT_EPOCHS = 30    # Passed to tuner.search() as `epochs`.
                          # NOTE(review): Hyperband appears to choose the per-trial epoch count
                          # itself (see 'tuner/epochs' in the reported hyperparameters), so this
                          # value may be overridden - confirm in the Keras Tuner docs.
OVERWRITE = True          # True discards any existing tuner project with the same name and
                          # starts fresh. Set to False to reload and resume an interrupted search.

input_shape = (CHUNK_SIZE,1)
mr_t = model_tuner.ModelTuner(input_shape, OUTPUT_SIZE, CHUNK_SIZE, BATCH_SIZE)

# All-encompassing optimization parameter choices. Do not try to use all of them at once...
# (Kept as a reference menu; the narrowed-down lists below are the ones that take effect.)
mr_t.CHOICES_PROCESSING_UNITS = [1, CHUNK_SIZE // 16, CHUNK_SIZE // 4, CHUNK_SIZE, CHUNK_SIZE * 2]
mr_t.CHOICES_ACTIVATIONS = ["elu", "gelu", "hard_sigmoid", "hard_silu", "hard_swish", "leaky_relu", "linear", "log_softmax", "mish",
        "relu", "relu6", "selu", "sigmoid", "silu", "softmax", "softplus", "softsign", "swish", "tanh"]
mr_t.CHOICES_FANCY_TOPO = ["GRU", "RNN", "LSTM", "GRU-RNN", "GRU-LSTM", "GRU-RNN-LSTM"]
mr_t.CHOICES_USE_OUTPUT_LIMITER = [True, False] # Prefers True
mr_t.CHOICES_OPTIMIZER = ["adamax", "sgd", "RMSProp"]

# Narrow down the choices as needed.
mr_t.CHOICES_PROCESSING_UNITS = [1, CHUNK_SIZE // 4, CHUNK_SIZE]
mr_t.CHOICES_ACTIVATIONS = ["relu", "tanh", "sigmoid"]
mr_t.CHOICES_FANCY_TOPO = ["LSTM"]
mr_t.CHOICES_USE_OUTPUT_LIMITER = [True]
mr_t.CHOICES_OPTIMIZER = ["adamax"]

# Minimal budget, only meant to verify that the model builder works at all
if TUNE_QUICKLY:
    MAX_EPOCHS_PER_MODEL = 3
    HYPERBAND_ITERATIONS = 1
    EXECUTIONS_PER_TRIAL = 1
    SEARCH_FIT_EPOCHS = 4

# Metric names for which a smaller value is better. Used to pick the tuner's
# optimization direction when the built-in (non-custom) metric is the objective.
LOWER_IS_BETTER_METRICS = {
    "loss",
    "mae", "mean_absolute_error",
    "mse", "mean_squared_error",
    "mape", "mean_absolute_percentage_error",
    "msle", "mean_squared_logarithmic_error",
}

def create_model(hp):
    """Model-building hook for Keras Tuner: delegates to the ModelTuner instance.

    Args:
        hp: the hyperparameter object supplied by the tuner for this trial.

    Returns:
        Whatever ``mr_t.CreateModel(hp)`` returns (the model for this trial).
    """
    return mr_t.CreateModel(hp)

# Run the kerastuner search for best hyperparameters
if TUNE_NETWORK:
    if USE_CUSTOM_METRICS:
        objective = kt.Objective("val_modulo_distance_accuracy", direction="max")
    else:
        # An error metric such as "mae" has to be minimized; maximizing it would
        # make the tuner select the worst model.
        direction = "min" if MAIN_ACCURACY_METRIC in LOWER_IS_BETTER_METRICS else "max"
        objective = kt.Objective(f"val_{MAIN_ACCURACY_METRIC}", direction=direction)

    tuner = kt.Hyperband(
        create_model,
        objective=objective,
        max_epochs=MAX_EPOCHS_PER_MODEL,
        hyperband_iterations=HYPERBAND_ITERATIONS,
        executions_per_trial=EXECUTIONS_PER_TRIAL,
        overwrite=OVERWRITE,
        directory=TUNER_DIRECTORY,
        project_name=TUNER_PROJECT_NAME)
    tuner.search(X_train_scaled, y_train, epochs=SEARCH_FIT_EPOCHS, batch_size=BATCH_SIZE, validation_data=(X_test_scaled,y_test))

    best_hyper = tuner.get_best_hyperparameters(1)[0]
    print(f"Best Hyper Values: {best_hyper.values}")

    # Evaluate the best model found; the first result is the loss, the rest are the metrics
    nn = tuner.get_best_models(1)[0]
    eval_results = nn.evaluate(X_test_scaled, y_test, verbose=2, batch_size=BATCH_SIZE )
    print(f"Best Model Loss: {eval_results[0]}, Accuracy: {eval_results[1:]}")

    # Make sure the target directory exists before saving the tuned model
    tuned_model_path = pathlib.Path("./saved_models/tuned.keras")
    tuned_model_path.parent.mkdir(parents=True, exist_ok=True)
    nn.save(str(tuned_model_path))

Trial 23 Complete [00h 00m 13s]
val_modulo_distance_accuracy: 0.44579723477363586

Best val_modulo_distance_accuracy So Far: 0.5711937844753265
Total elapsed time: 00h 05m 16s
Best Hyper Values: {'Processing_Units': 512, 'Fancy_Topology': 'LSTM', 'Output_Limiter': 1, 'Optimizer': 'adamax', 'Chunk_Size': 512, 'Batch_Size': 256, 'Activation_A': 'sigmoid', 'Recurrent_Activation_A': 'sigmoid', 'tuner/epochs': 2, 'tuner/initial_epoch': 0, 'tuner/bracket': 3, 'tuner/round': 0}


  saveable.load_own_variables(weights_store.get(inner_path))


4/4 - 2s - 378ms/step - loss: 2.1257 - modulo_distance_accuracy: 0.5744 - modulo_rounded_accuracy: 0.1574
Best Model Loss: 2.12571120262146, Accuracy: [0.5743776559829712, 0.15742860734462738]


# Model Reload / Creation

In [10]:
if BUILD_NETWORK:
    print("Building new model")
    nn = tf.keras.models.Sequential()

    # Each branch lists its layers in order; they are added to the model afterwards
    NEW_ONE = True
    if NEW_ONE:
        input_shape = (CHUNK_SIZE,1)

        activation_A = "tanh"
        recurrent_activation_A = "sigmoid"
        # The "B" pair is defined but not used by any layer below
        activation_B = "tanh"
        recurrent_activation_B = "sigmoid"

        layer_stack = [
            tf.keras.Input(shape=input_shape),
            tf.keras.layers.GRU(
                PROCESSING_UNITS,
                return_sequences=True,
                activation=activation_A,
                recurrent_activation=recurrent_activation_A,
            ),
            tf.keras.layers.SimpleRNN(PROCESSING_UNITS, return_sequences=True),
            tf.keras.layers.LSTM(
                PROCESSING_UNITS,
                activation=activation_A,
                recurrent_activation=recurrent_activation_A,
            ),
            tf.keras.layers.Dense(OUTPUT_SIZE, activation=modulo_output, name="Output_Limiter"),
        ]

    else:
        # This was looking promising for a while...
        layer_stack = [
            # Input layer
            tf.keras.layers.Embedding(input_dim=CHUNK_SIZE, output_dim=PROCESSING_UNITS, name="Embedding_Input"),

            # This LSTM layer seems to do most of the real work
            tf.keras.layers.LSTM(PROCESSING_UNITS, name="LSTM"),

            # Sigmoid layer produces an output between 0 and 1
            tf.keras.layers.Dense(units=PROCESSING_UNITS, activation="sigmoid", name="Sigmoid"),

            # Rescale that 0-1 value from Sigmoid to the correct output range
            tf.keras.layers.Rescaling(scale=OUTPUT_MAX, offset=0, name="Rescaler"),

            # Modulo division to enforce the output range limit.
            # This looks redundant next to Sigmoid + Rescaling, but the Tuner results suggest
            # that including both mechanisms produces better results. The reason is unknown.
            tf.keras.layers.Dense(OUTPUT_SIZE, activation=modulo_output, name="Output_Limiter"),
        ]

    for layer in layer_stack:
        nn.add(layer)

    # Check the structure of the model
    nn.summary()

# Model Training

In [11]:
# Reset the "best" score.
# If you're manually training iteratively, comment this out to preserve the best-ness:
model_checkpoint_callback = initialize_save_best(BEST_PATH)

# It can be helpful to load the best, train some more, and try to improve it:
if LOAD_BEST_MODEL:
    # pathlib is used here because `os` is not imported by this notebook
    if pathlib.Path(BEST_PATH).exists():
        print(f"Loading model from {BEST_PATH}")
        # The saved model refers to the custom loss/metric/activation functions by name
        nn = tf.keras.models.load_model(BEST_PATH,
            custom_objects={
                'modulo_distance_loss': modulo_distance_loss,
                'modulo_distance_accuracy': modulo_distance_accuracy,
                'modulo_rounded_accuracy': modulo_rounded_accuracy,
                'modulo_output': modulo_output
        })
    else:
        # Say so explicitly instead of silently carrying on with whatever model exists
        print(f"No saved model found at {BEST_PATH}; keeping the current model")

# Train the model
if TRAIN_MODEL:
    # Decide what metrics to use
    if USE_CUSTOM_METRICS:
        loss = modulo_distance_loss
        metrics = [modulo_distance_accuracy, modulo_rounded_accuracy]
    else:
        loss = LOSS_METRIC
        metrics = [MAIN_ACCURACY_METRIC]

    # summary() prints the table itself and returns None, so it is not wrapped in print()
    nn.summary()
    print("Training model")

    if SAVE_BEST_MODEL:
        callbacks = [model_checkpoint_callback]
    else:
        callbacks = None

    # Compile the Sequential model together and customize metrics
    nn.compile(loss=loss, optimizer=OPTIMIZER, metrics=metrics)

    # Fit the model to the training data
    fit_model = nn.fit(X_train_scaled, y_train, epochs=EPOCHS, callbacks=callbacks, batch_size=BATCH_SIZE)

nn.summary()

In [14]:
# Evaluate the model on the held-out test samples

if USE_CUSTOM_METRICS and INFER_KEY:
    print("Evaluating with model.predict() ...")
    raw_pred = nn.predict(X_test_scaled, batch_size=BATCH_SIZE)

    # Disabled: calling the custom loss and accuracy functions directly on the
    # full prediction array runs out of memory for some reason...
    if False:
        y_pred = tf.constant(np.array(raw_pred).astype(np.float64))
        loss = modulo_distance_loss(y_test, y_pred)
        accuracy_distance = modulo_distance_accuracy(y_test, y_pred)
        accuracy_rounded = modulo_rounded_accuracy(y_test, y_pred)
        print(f"Loss: {loss:0.6}, Accuracy (Distance): {accuracy_distance:0.6}, Accuracy (Rounded): {accuracy_rounded:0.6}")

    # Summary statistics of the predicted keys (INFER_KEY is known to hold here)
    prediction_frame = pd.DataFrame(raw_pred)
    print("Inferred key distribution:\n", prediction_frame.describe())

print("Evaluating with model.evaluate() ...")
eval_results = nn.evaluate(X_test_scaled, y_test, verbose=2, batch_size=BATCH_SIZE)
# First entry is the loss; the remaining entries are the compiled metrics
test_loss, *test_metrics = eval_results
print(f"With X_test_scaled, Loss: {test_loss}, Accuracy: {test_metrics}")

Evaluating with model.predict() ...
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
Inferred key distribution:
                  0
count  1000.000000
mean      3.162483
std       0.005919
min       3.152110
25%       3.157394
50%       3.161571
75%       3.167225
max       3.175535
Evaluating with model.evaluate() ...
4/4 - 0s - 44ms/step - loss: 2.1257 - modulo_distance_accuracy: 0.5744 - modulo_rounded_accuracy: 0.1574
With X_test_scaled, Loss: 2.12571120262146, Accuracy: [0.5743776559829712, 0.15742860734462738]


# Model Usefulness Spot-Check

In [15]:
# These only really work for the Caesar Cipher

def decode_chunks_with_model(chunks: list[list], model, scaler, input_already_scaled = True) -> list[list]:
    """Run ``model.predict`` on ``chunks``, scaling them first when required.

    Args:
        chunks: the input chunks to feed to the model.
        model: object providing ``predict``.
        scaler: object providing ``transform``; only used when
            ``input_already_scaled`` is false.
        input_already_scaled: when true, ``chunks`` are passed to the model
            unchanged; otherwise they go through ``scaler.transform`` first.

    Returns:
        Whatever ``model.predict`` returns for the (possibly scaled) chunks.
    """
    model_input = chunks if input_already_scaled else scaler.transform(chunks)
    return model.predict(model_input)

def decode_text_with_model(ciphertext: str, model, scaler) -> str:
    """Decode ``ciphertext`` by letting the model predict the plaintext offsets.

    The text is converted to offsets, split into chunks of ``CHUNK_SIZE``,
    scaled and run through the model; every predicted value is rounded to the
    nearest integer offset and the offsets are converted back into a string.

    Args:
        ciphertext: the text to decode.
        model: object providing ``predict``.
        scaler: object providing ``transform``; applied to the chunks before prediction.

    Returns:
        The string built from the rounded predicted offsets.
    """
    offsets = encoders.string_to_offsets(ciphertext)
    chunked_offsets = helpers.chunkify(offsets, CHUNK_SIZE)
    predictions = decode_chunks_with_model(chunked_offsets, model, scaler, input_already_scaled = False)
    nearest_offsets = np.rint(predictions.flatten()).astype(int)
    return encoders.offsets_to_string(nearest_offsets)

def infer_key_with_model(ciphertext: str, model, scaler) -> int:
    """Infer a single integer key for ``ciphertext`` from per-chunk predictions.

    The text is split into chunks of ``CHUNK_SIZE``, scaled and run through
    the model; the median of all predictions, rounded to an integer, is the key.

    Args:
        ciphertext: the text whose key should be inferred.
        model: object providing ``predict``.
        scaler: object providing ``transform``; applied to the chunks before prediction.

    Returns:
        The rounded median of the model's predictions.
    """
    # NOTE(review): the training features are built with encoders.string_to_offsets +
    # helpers.chunkify, while this uses helpers.string_to_bytes - confirm that both
    # produce the same encoding, otherwise the model sees unfamiliar input here.
    scaled_chunks = scaler.transform(helpers.string_to_bytes(ciphertext, CHUNK_SIZE))
    median_prediction = np.median(model.predict(scaled_chunks))
    return int(round(median_prediction))

# Spot-check the model on the beginning of a single ciphertext file.
CHUNKS_TO_CHECK = 2 if INFER_TEXT else 20

# First ciphertext file of the first source ID in the map
cipher_file_db = next(iter(sid_to_c.values()))[0]
ciphertext_path = cipher_file_db.path
ciphertext = helpers.read_text_file(ciphertext_path)
ciphertext = ciphertext[0:CHUNK_SIZE * CHUNKS_TO_CHECK]

if INFER_TEXT:
    print("Decoded   : ", decode_text_with_model(ciphertext, nn, X_scaler))
if INFER_KEY:
    with db.get_session() as session:
        correct_key = db.get_key_by_id(session, cipher_file_db.key_id).value
    print("Correct Key: ", correct_key)

    inferred_key = infer_key_with_model(ciphertext, nn, X_scaler)
    print("Inferred Key: ", inferred_key)

    # Per-chunk predictions. The model is run once and the result is reused for
    # both the raw listing and the summary statistics. X_scaler is used
    # throughout, as in the calls above.
    chunks = helpers.string_to_bytes(ciphertext, CHUNK_SIZE)
    chunk_predictions = nn.predict(X_scaler.transform(chunks))
    print(chunk_predictions)
    print(pd.DataFrame(chunk_predictions).describe())

Correct Key:  2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
Inferred Key:  4
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[[3.8891263]
 [3.8707552]
 [4.032809 ]
 [3.8695912]
 [3.9241219]
 [3.9508877]
 [3.7954497]
 [3.6289568]
 [3.9202857]
 [3.9562907]
 [3.8863006]
 [3.984178 ]
 [4.0592303]
 [3.8775334]
 [4.0644007]
 [3.691277 ]
 [3.9602046]
 [3.9429388]
 [4.0237765]
 [3.929731 ]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
               0
count  20.000000
mean    3.912892
std     0.110681
min     3.628957
25%     3.875839
50%     3.926926
75%     3.966198
max     4.064401
