In [24]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import tensorflow as tf
import keras_tuner as kt
import pathlib
import random

from credentials import SAMPLE_DB, FULL_DB
from librarian import SAMPLE_DATA_DIR

import encoders
import db_connect
import helpers

In [34]:
DATA_DIR = SAMPLE_DATA_DIR
CHUNK_SIZE = 128
LAYER_UNITS = max(1, CHUNK_SIZE // 10)

INFER_TEXT = True
INFER_KEY = not INFER_TEXT

if INFER_TEXT:
    MAIN_ACCURACY_METRIC = "mae"
    GOOD_ACCURACY_DIRECTION = "min" # Some accuracy metrics go up for better results, some go down
    LOSS_METRIC = "mean_squared_error"
    OUTPUT_SIZE = CHUNK_SIZE
    OPTIMIZER = "sgd"
else:
    MAIN_ACCURACY_METRIC = "mae"
    GOOD_ACCURACY_DIRECTION = "min" # Some accuracy metrics go up for better results, some go down
    LOSS_METRIC = "mae"
    OUTPUT_SIZE = 1 # actually depends on cipher
    OPTIMIZER = "adamax"

ENCRYPTED_FILE_LIMIT = 10 # -1 to disable limit

BASE_TRAIN_PCT = 0.75   # Start here. If it exceed the max count, reduce it. Note 0.75 is the default.
MAX_TRAIN_COUNT = 100000 # -1 to disable
SPLIT_SEED = 42

LOAD_BEST_MODEL = False # If False, a new model will be created from scratch
SAVE_BEST_MODEL = False
TRAIN_MODEL = True
EPOCHS = 5

db = db_connect.DB(SAMPLE_DB)

In [26]:
# Get database IDs for encoders and key types

encoder_ids= {}
key_type_ids = {}

with db.get_session() as session:
    for encoder in encoders.ALL_ENCODER_NAMES:
        id = db.get_encoder_id(session, encoder)
        encoder_ids[encoder] = id

    print(f"Encoder IDs: {encoder_ids}")

    for key_type in encoders.KEY_NAMES:
        id = db.get_key_type_id(session, key_type)
        key_type_ids[key_type] = id

    print(f"Key Type IDs: {key_type_ids}")

Encoder IDs: {'None': 1, 'Simplifier': 2, 'Caesar Cipher': 3, 'Substitution Cipher': 4, 'Enigma Machine': 5}
Key Type IDs: {'Character Offset': 1, 'Character Map': 2, 'Rotor Settings': 3}


In [27]:
# Map source ID to plaintext file (1) details, and source ID to corresponding ciphertext files (1+) details
sid_to_p = {}
sid_to_c = {}

cipher_id = encoder_ids[encoders.ENCODER_CAESAR]
with db.get_session() as session:
    # Get all files encrypted with the cipher we care about
    encrypted_files = db.get_files_by_source_and_encoder(session, -1, cipher_id)

    if len(encrypted_files) > ENCRYPTED_FILE_LIMIT and ENCRYPTED_FILE_LIMIT > 0:
        encrypted_files = random.sample(encrypted_files, ENCRYPTED_FILE_LIMIT)

    for c in encrypted_files:
        sid = c.source_id
    
        if sid not in sid_to_p:
            plaintext_ids = db.get_files_by_source_and_encoder(session, sid, encoder_ids[encoders.ENCODER_SIMPLIFIER])
            if len(plaintext_ids) != 1:
                raise Exception(f"Found {len(plaintext_ids)} plaintexts for source ID {sid}; should be exactly 1")
            sid_to_p[sid] = plaintext_ids[0]

        if sid not in sid_to_c:
            sid_to_c[sid] = []
        sid_to_c[sid].append(c)

len(sid_to_p), len(sid_to_c)

(5, 5)

In [28]:
# Build up the features (X, the cipher texts as values) and targets (y, either the plain texts as values OR the key).
# Note targets are not necessarily unique.
X = []
y = []

with db.get_session() as session:
    for sid in sid_to_p:
        if INFER_TEXT:
            plaintext = helpers.read_text_file(sid_to_p[sid].path)
            target_chunks = helpers.string_to_bytes(plaintext, CHUNK_SIZE)    
    
        for c in sid_to_c[sid]:
            ciphertext = helpers.read_text_file(c.path)
            feature_chunks = helpers.string_to_bytes(ciphertext, CHUNK_SIZE)

            if INFER_KEY:
                key_value = float(db.get_key_by_id(session, c.key_id).value)
    
            for i in range (len(feature_chunks)):
                X.append(feature_chunks[i])

                if INFER_TEXT:
                    y.append(target_chunks[i])                

                if INFER_KEY:
                    y.append(key_value)

X = np.array(X)
y = np.array(y)

X.shape, y.shape, X[0].shape, y[0].shape, X[0], y[0]

((34472, 128),
 (34472, 128),
 (128,),
 (128,),
 array([58., 58., 71., 73., 70., 47., 76., 46., 60., 47., 39., 44., 80.,
        39., 47., 60., 34., 76., 73., 10., 47., 60., 73., 40., 39., 44.,
        73., 10., 34., 69., 39., 78., 10., 67., 46., 70., 79., 39., 34.,
        69., 47., 39., 75., 32., 60., 39., 70., 69., 67., 10., 69., 60.,
        58., 47., 10., 74., 75., 73., 10., 44., 76., 75., 60., 47., 39.,
        71., 73., 70., 70., 62., 73., 60., 34., 47., 10., 69., 63., 39.,
        75., 60., 34., 68., 39., 34., 75., 39., 32., 75., 75., 71., 38.,
        43., 43., 78., 78., 78., 41., 71., 63., 47., 71., 41., 69., 60.,
        75., 39., 45., 75., 32., 10., 74., 58., 62., 10., 67., 60., 39.,
        78., 34., 74., 39., 71., 73., 70., 47., 76., 46., 60.]),
 array([10., 10., 80., 82., 79., 68., 85., 67., 69., 68., 32., 66., 89.,
        32., 68., 69., 65., 85., 82., 73., 68., 69., 82., 44., 32., 66.,
        82., 73., 65., 78., 32., 87., 73., 76., 67., 79., 88., 32., 65.,
        78.

In [29]:
# Split the preprocessed data into a training and testing dataset
train_count = int(round(len(y) * BASE_TRAIN_PCT))
print(f"Train count would be {train_count}")
if train_count > MAX_TRAIN_COUNT and MAX_TRAIN_COUNT > -1:
    train_count = int(MAX_TRAIN_COUNT)
print(f"Train count is {train_count}")

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_count, random_state=SPLIT_SEED)
print( len(X), len(y), len(X_train), len(X_test), len(y_train), len(y_test) )

Train count would be 25854
Train count is 25854
34472 34472 25854 8618 25854 8618


In [30]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
X_train_scaled.shape, X_test_scaled.shape

((25854, 128), (8618, 128))

In [38]:
if LOAD_BEST_MODEL:
    print(f"Loading model from {BEST_PATH}")
    nn = tf.keras.models.load_model(BEST_PATH)
else:
    print("Building new model")
    in_shape = (CHUNK_SIZE,)
    nn = tf.keras.models.Sequential()

    try_RNN = True
    if try_RNN:
        nn.add(tf.keras.layers.Embedding(input_dim=CHUNK_SIZE, output_dim=OUTPUT_SIZE))
        nn.add(tf.keras.layers.LSTM(CHUNK_SIZE))
        nn.add(tf.keras.layers.Dense(units=CHUNK_SIZE, activation="relu"))
        nn.add(tf.keras.layers.Dense(OUTPUT_SIZE))

    if not try_RNN:
        # Input layer
        nn.add(tf.keras.Input(shape=in_shape))
        
        # Hidden layers
        activations = ["tanh", "relu", "elu", "exponential", "gelu", "mish", "relu6", "tanh", "selu"]
        unit_counts = [LAYER_UNITS]
        for u in unit_counts:
            for a in activations:
                nn.add(tf.keras.layers.Dense(units=u, activation=a))
        
        # Output layer
        nn.add(tf.keras.layers.Dense(units=OUTPUT_SIZE))
    
# Check the structure of the model
nn.summary()

Building new model


In [None]:
%%time

# Set up training checkpoint to save after each epoch, if it is a new best model:
BEST_PATH = './best.keras'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=BEST_PATH,
    monitor=MAIN_ACCURACY_METRIC,
    mode=GOOD_ACCURACY_DIRECTION,
    save_best_only=True,
    save_weights_only=False,
    verbose=1)

if TRAIN_MODEL:
    print(f"Training model")
    
    # Compile the Sequential model together and customize metrics
    nn.compile(loss=LOSS_METRIC, optimizer=OPTIMIZER, metrics=[MAIN_ACCURACY_METRIC])
    
    # Fit the model to the training data
    callbacks = None
    if SAVE_BEST_MODEL:
        callbacks = [model_checkpoint_callback]
    fit_model = nn.fit(X_train_scaled, y_train, epochs=EPOCHS, callbacks=callbacks)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Training model
Epoch 1/5
[1m808/808[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - loss: 954.7554 - mae: 23.9296
Epoch 2/5
[1m808/808[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 413.4129 - mae: 16.0348
Epoch 3/5
[1m808/808[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - loss: 403.2758 - mae: 15.7707
Epoch 4/5
[1m797/808[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - loss: 396.4476 - mae: 15.6169

In [33]:
def decode_chunks_with_model(chunks: list[list], model, scaler, input_already_scaled = True) -> list[list]:
    if input_already_scaled:
        return model.predict(chunks)
    else:
        return model.predict(scaler.transform(chunks))

def decode_text_with_model(ciphertext: str, model, scaler) -> str:
    chunks = helpers.string_to_bytes(ciphertext, CHUNK_SIZE)
    decoded_chunks = decode_chunks_with_model(chunks, model, scaler, input_already_scaled = False)
    return helpers.bytes_to_string(decoded_chunks)

def infer_key_with_model(ciphertext: str, model, scaler) -> int:
    chunks = helpers.string_to_bytes(ciphertext, CHUNK_SIZE)
    key = int(round(model.predict(scaler.transform(chunks))[0][0]))
    return key

cipher_file_db = sid_to_c[list(sid_to_c.keys())[0]][0]
ciphertext_path = cipher_file_db.path
ciphertext = helpers.read_text_file(ciphertext_path)
ciphertext = ciphertext[0:CHUNK_SIZE*2]
    
if INFER_TEXT:    
    print("Decoded   : ", decode_text_with_model(ciphertext, nn, X_scaler))
if INFER_KEY:
    with db.get_session() as session:
        correct_key = int(db.get_key_by_id(session, cipher_file_db.key_id).value)
    print("Correct Key: ", correct_key)
    
    inferred_key = infer_key_with_model(ciphertext, nn, X_scaler)
    print("Inferred Key: ", inferred_key)

nn.summary()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
Decoded   :  CBBBBBACACCBACACBBBBABBCCCABBBCBBBBCACACBCDCABCBBCABBBBDCABCCACCBAADBBACABCCBADBBCBBCBACCABACCACBCBBBDBBCBBCD@CBCBCBACDBBDDCAACCCBBBBBACACCBACACBBBBABBCCCABBBCBBBBCACACBCDCABCBBCABBBBDCABCCACCBAADBBACABCCBADBBCBBCBACCABACCACBCBBBDBBCBBCD@CBCBCBACDBBDDCAACC
