In [53]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import tensorflow as tf
import keras_tuner as kt
import pathlib
import random
import pandas as pd

from credentials import SAMPLE_DB, FULL_DB
from librarian import SAMPLE_DATA_DIR

import encoders
import db_connect
import helpers

## Config
This notebook has a lot of options to adjust, most of which are controlled here.

In [54]:
DATA_DIR = SAMPLE_DATA_DIR
ENCODER = encoders.ENCODER_CAESAR
CHUNK_SIZE = 512
PROCESSING_UNITS = CHUNK_SIZE // 4
LAYER_UNITS = max(1, CHUNK_SIZE // 10)

INFER_TEXT = False
INFER_KEY = not INFER_TEXT

USE_CUSTOM_LOSS = True
USE_CUSTOM_OUTPUT_ACTIVATION = True

OUTPUT_MIN = 0
OUTPUT_MAX = len(encoders.CHARSET)-1
CUSTOM_LOSS_MODULO = OUTPUT_MAX+1

if INFER_TEXT:
    MAIN_ACCURACY_METRIC = "mae"
    LOSS_METRIC = "mean_squared_error"
    OUTPUT_SIZE = CHUNK_SIZE
    OPTIMIZER = "sgd"
else:
    MAIN_ACCURACY_METRIC = "mae"
    LOSS_METRIC = "mae"
    OPTIMIZER = "adamax"    

    if ENCODER == encoders.ENCODER_CAESAR:
        OUTPUT_SIZE = 1
    elif ENCODER == encoders.ENCODER_SUBST:
        OUTPUT_SIZE = len(encoders.CHARSET)
    else:
        raise Exception(f"Unsupported encoder {ENCODER}")

ENCRYPTED_FILE_LIMIT = 100 # -1 to disable limit

BASE_TRAIN_PCT = 0.75   # Start here. If train or test count would exceed the max, reduce it. Note 0.75 is the default.
MAX_TRAIN_COUNT = 100000 # -1 to disable (not recommended, things crash after about 100K)
MAX_TEST_COUNT =  100000 # -1 to disable (not recommended, things crash after about 100K)
SPLIT_SEED = 42

LOAD_BEST_MODEL = False # If False, a new model will be created from scratch
SAVE_BEST_MODEL = True
BEST_PATH = './saved_models/best.keras'

# Whether to run the tuner or the hard-coded network build code
TUNE_NETWORK = False
BUILD_NETWORK = not TUNE_NETWORK
TRAIN_MODEL = BUILD_NETWORK

TUNER_DIRECTORY = "tuner_projects"
TUNER_PROJECT_NAME = "KT"

EPOCHS = 50
BATCH_SIZE = 256 # Default is 32 -- going higher speeds things up a LOT, but may cause memory problems

# Data Retrieval and Structuring

In [55]:
# Get database IDs for encoders and key types

encoder_ids= {}
key_type_ids = {}

db = db_connect.DB(SAMPLE_DB)
with db.get_session() as session:
    for encoder in encoders.ALL_ENCODER_NAMES:
        id = db.get_encoder_id(session, encoder)
        encoder_ids[encoder] = id

    print(f"Encoder IDs: {encoder_ids}")

    for key_type in encoders.KEY_NAMES:
        id = db.get_key_type_id(session, key_type)
        key_type_ids[key_type] = id

    print(f"Key Type IDs: {key_type_ids}")

Encoder IDs: {'None': 1, 'Simplifier': 2, 'Caesar Cipher': 3, 'Substitution Cipher': 4, 'Enigma Machine': 5}
Key Type IDs: {'Character Offset': 1, 'Character Map': 2, 'Rotor Settings': 3}


In [56]:
# Map source ID to plaintext file (1) details, and source ID to corresponding ciphertext files (1+) details
sid_to_p = {}
sid_to_c = {}

cipher_id = encoder_ids[ENCODER]
with db.get_session() as session:
    # Get all files encrypted with the cipher we care about
    encrypted_files = db.get_files_by_source_and_encoder(session, -1, cipher_id)

    if len(encrypted_files) > ENCRYPTED_FILE_LIMIT and ENCRYPTED_FILE_LIMIT > 0:
        print(f"Found {len(encrypted_files)} encrypted files")
        encrypted_files = random.sample(encrypted_files, ENCRYPTED_FILE_LIMIT)
    print(f"Using {len(encrypted_files)} encrypted files")

    for c in encrypted_files:
        sid = c.source_id
    
        if sid not in sid_to_p:
            plaintext_ids = db.get_files_by_source_and_encoder(session, sid, encoder_ids[encoders.ENCODER_SIMPLIFIER])
            if len(plaintext_ids) != 1:
                raise Exception(f"Found {len(plaintext_ids)} plaintexts for source ID {sid}; should be exactly 1")
            sid_to_p[sid] = plaintext_ids[0]

        if sid not in sid_to_c:
            sid_to_c[sid] = []
        sid_to_c[sid].append(c)

len(sid_to_p), len(sid_to_c)

Found 720 encrypted files
Using 100 encrypted files


(18, 18)

In [57]:
# Build up the features (X, the cipher texts as offsets) and targets (y, either the plain texts as offsets OR the key).
# Note targets are not necessarily unique.
X = []
y = []

with db.get_session() as session:
    for sid in sid_to_p:
        if INFER_TEXT:
            plaintext = encoders.string_to_offsets(helpers.read_text_file(sid_to_p[sid].path))
            target_chunks = helpers.chunkify(plaintext, CHUNK_SIZE)    
    
        for c in sid_to_c[sid]:
            ciphertext = encoders.string_to_offsets(helpers.read_text_file(c.path))
            feature_chunks = helpers.chunkify(ciphertext, CHUNK_SIZE)

            if INFER_KEY:                
                if ENCODER == encoders.ENCODER_CAESAR:
                    key_value = float(db.get_key_by_id(session, c.key_id).value)
                elif ENCODER == encoders.ENCODER_SUBST:
                    raise Exception(f"Not yet implemented key for {ENCODER}")
                else:
                    raise Exception(f"Unsupported encoder {ENCODER}")
        
            for i in range (len(feature_chunks)):
                X.append(np.array(feature_chunks[i]).astype(float))

                if INFER_TEXT:
                    y.append(np.array(target_chunks[i]).astype(float))

                if INFER_KEY:
                    y.append(key_value)

X = np.array(X)
y = np.array(y)

X.shape, y.shape

((81215, 512), (81215,))

In [58]:
# Split the preprocessed data into a training and testing dataset
train_count = int(round(len(y) * BASE_TRAIN_PCT))
if train_count > MAX_TRAIN_COUNT and MAX_TRAIN_COUNT > -1:
    print(f"Train count would be {train_count}")
    train_count = int(MAX_TRAIN_COUNT)
print(f"Train count is {train_count}")

test_count = len(y) - train_count
if test_count > MAX_TEST_COUNT and MAX_TEST_COUNT > -1:
    print(f"Test count would be {test_count}")
    test_count = int(MAX_TEST_COUNT)
print(f"Test count is {test_count}")

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_count, test_size=test_count, random_state=SPLIT_SEED)
print( "Initial counts: ", len(X), len(y), len(X_train), len(X_test), len(y_train), len(y_test) )

Train count is 60911
Test count is 20304
Initial counts:  81215 81215 60911 20304 60911 20304


In [59]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
X_train_scaled.shape, X_test_scaled.shape

((60911, 512), (20304, 512))

# Tensorflow Callbacks
Custom layer activation function, loss and accuracy functions, and model save checkpoint.

In [60]:
# Custom output activation, forcing output to be within range -- but not rounding it off
# Possibly not needed, sigmoid + rescaler might work just as well
def modulo_output(x):
    return tf.math.mod(x, OUTPUT_MAX)

# Custom loss function, adapted from code generated by Copilot
def modulo_distance_loss(y_true, y_pred):
    """ Custom loss function to compute the modulo distance. 
    Args: 
        y_true: True values (ground truth). 
        y_pred: Predicted values. 
        modulo: The modulo value to apply -- hard coded.
    Returns: 
        The computed loss. 
    """ 
    # Compute the raw difference
    diff = tf.abs(y_true - y_pred)
    # Apply modulo operation to handle wrap-around cases
    mod_diff = tf.math.mod(diff, CUSTOM_LOSS_MODULO)
    # Ensure the distance is within the range [0, CUSTOM_LOSS_MODULO/2]
    loss = tf.minimum(mod_diff, CUSTOM_LOSS_MODULO - mod_diff) 
    return tf.reduce_mean(loss)

# Custom accuracy function, counterpart to the loss function above.
# Returns accuracy as 1 - (average percent distance from correct value)
def modulo_distance_accuracy(y_true, y_pred):
    diff = tf.abs(y_true - y_pred)
    mod_diff = tf.math.mod(diff, CUSTOM_LOSS_MODULO)
    loss = tf.minimum(mod_diff, CUSTOM_LOSS_MODULO - mod_diff)

    good_part = tf.math.subtract(CUSTOM_LOSS_MODULO / 2, loss)
    accuracy = tf.math.divide(good_part, CUSTOM_LOSS_MODULO / 2)

    return tf.reduce_mean(accuracy)

# Custom accuracy, percent of correct values after rounding and doing modulo division
def modulo_rounded_accuracy(y_true, y_pred):
    # y_true SHOULD all be round, in-bounds numbers but just in case...
    true_rounded = tf.math.round(y_true)
    true_mod = tf.math.mod(true_rounded, CUSTOM_LOSS_MODULO)

    # y_pred came straight from the model, so it needs to be rounded and mod'ed
    pred_rounded = tf.math.round(y_pred)
    pred_mod = tf.math.mod(pred_rounded, CUSTOM_LOSS_MODULO)

    # Count matches, as a percentage by averaging all the 0's and 1's
    matches_bool = tf.math.equal(true_mod, pred_mod)
    matches_float = tf.cast(matches_bool, tf.float64)
    return tf.reduce_mean(matches_float)

In [61]:
#!!! Scratch for testing my loss and accuracy functions
t_true = [[1.0, 2.0, 3.0, CUSTOM_LOSS_MODULO*5]]*2
t_pred = [[0.4, 1.5, 3.5, CUSTOM_LOSS_MODULO + 0.4]]*2

t_true_ts = tf.constant(np.array(t_true).astype(float))
t_pred_ts = tf.constant(np.array(t_pred).astype(float))
loss = modulo_distance_loss(t_true_ts, t_pred_ts)
accD = modulo_distance_accuracy(t_true_ts, t_pred_ts)
accR = modulo_rounded_accuracy(t_true_ts, t_pred_ts)
print("true:", t_true_ts)
print("pred:", t_pred_ts)
print("rond:", tf.math.round(t_pred_ts))
print("diff", abs(t_true_ts - t_pred_ts))
print("loss:", loss)
print("accD:", accD)
print("accR:", accR)


true: tf.Tensor(
[[  1.   2.   3. 310.]
 [  1.   2.   3. 310.]], shape=(2, 4), dtype=float64)
pred: tf.Tensor(
[[ 0.4  1.5  3.5 62.4]
 [ 0.4  1.5  3.5 62.4]], shape=(2, 4), dtype=float64)
rond: tf.Tensor(
[[ 0.  2.  4. 62.]
 [ 0.  2.  4. 62.]], shape=(2, 4), dtype=float64)
diff tf.Tensor(
[[  0.6   0.5   0.5 247.6]
 [  0.6   0.5   0.5 247.6]], shape=(2, 4), dtype=float64)
loss: tf.Tensor(0.5000000000000014, shape=(), dtype=float64)
accD: tf.Tensor(0.9838709677419354, shape=(), dtype=float64)
accR: tf.Tensor(0.5, shape=(), dtype=float64)


# Hyperband Tuning

In [62]:
GO_FAST = False

MAX_EPOCHS_PER_MODEL = 15 # Meant to get a decent idea of parameter, not create a final model. Behaves oddly below 3.
HYPERBAND_ITERATIONS = 1  # "Number of times to iterate over the full Hyperband algorithm"
EXECUTIONS_PER_TRIAL = 1  # Training from scratch
SEARCH_FIT_EPOCHS = 10    # Epochs for each attempt to do a fit, I think. Not sure how this relates to MAX_EPOCHS_PER_MODEL.
OVERWRITE = True         # I'm hoping to be able to interrupt a run and resume it later

CHOICES_PROCESSING_UNITS = [1, CHUNK_SIZE // 16, CHUNK_SIZE // 4, CHUNK_SIZE, CHUNK_SIZE * 2] # Prefers 128 (CHUNK_SIZE // 4)
CHOICES_FANCY_TOPO = ["NONE", "GRU", "RNN", "LSTM", "GRU-RNN", "GRU-LSTM", "GRU-RNN-LSTM"]     # LSTM seems to win
CHOICES_USE_SIGMOID = [True, False] # Prefers True
CHOICES_SIGMOID_SIZE_TO_OUTPUT = [True, False] # Prefers False
CHOICES_USE_SCALER = [True, False] # Only relevant when using Sigmoid -- prefers True
CHOICES_USE_OUTPUT_LIMITER = [True, False] # Prefers True
CHOICES_OPTIMIZER = ["adamax", "sgd", "RMSProp"] # Prefers adamax

#!!!
CHOICES_PROCESSING_UNITS = [1, CHUNK_SIZE // 16, CHUNK_SIZE // 4, CHUNK_SIZE, CHUNK_SIZE * 2]
CHOICES_FANCY_TOPO = ["LSTM"]
CHOICES_USE_SIGMOID = [True]
CHOICES_SIGMOID_SIZE_TO_OUTPUT = [False]
CHOICES_USE_SCALER = [True]
CHOICES_USE_OUTPUT_LIMITER = [True]
CHOICES_OPTIMIZER = ["adamax", "sgd", "RMSProp"]

if GO_FAST:
    MAX_EPOCHS_PER_MODEL = 3
    SEARCH_FIT_EPOCHS = 4
    CHOICES_FANCY_TOPO = ["NONE", "RNN"]
    CHOICES_PROCESSING_UNITS = [1, 8]
    CHOICES_OPTIMIZER = ["adamax"]

# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    processing_units = hp.Choice("Processing_Units", CHOICES_PROCESSING_UNITS)
    fancy_topo = hp.Choice("Fancy_Topology", CHOICES_FANCY_TOPO)
    use_sigmoid = hp.Choice("Sigmoid", CHOICES_USE_SIGMOID)    
    use_output_limiter = hp.Choice("Output_Limiter", CHOICES_USE_OUTPUT_LIMITER)
    optimizer = hp.Choice("Optimizer", CHOICES_OPTIMIZER)

    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Embedding(input_dim=CHUNK_SIZE, output_dim=processing_units, name="Embedding_Input"))

    if fancy_topo == "NONE":
        pass
    elif fancy_topo == "GRU":
        model.add(tf.keras.layers.GRU(PROCESSING_UNITS))
    elif fancy_topo == "RNN":
        model.add(tf.keras.layers.SimpleRNN(PROCESSING_UNITS))
    elif fancy_topo == "LSTM":
        model.add(tf.keras.layers.LSTM(PROCESSING_UNITS))
    elif fancy_topo == "GRU-RNN":
        model.add(tf.keras.layers.GRU(PROCESSING_UNITS, return_sequences=True))
        model.add(tf.keras.layers.SimpleRNN(PROCESSING_UNITS))
    elif fancy_topo == "GRU-LSTM":
        model.add(tf.keras.layers.GRU(PROCESSING_UNITS, return_sequences=True))
        model.add(tf.keras.layers.LSTM(PROCESSING_UNITS))
    elif fancy_topo == "GRU-RNN-LSTM":
        model.add(tf.keras.layers.GRU(PROCESSING_UNITS, return_sequences=True))
        model.add(tf.keras.layers.SimpleRNN(PROCESSING_UNITS, return_sequences=True))
        model.add(tf.keras.layers.LSTM(PROCESSING_UNITS))
    else:
        raise Exception(f"Bad choice {fancy_topo}")


    if use_sigmoid:
        # The sigmoid layer can be sized like a processing unit or for output,
        # but that only matters if those values are different
        if OUTPUT_SIZE != processing_units:
            # There are two possibilities, so allow checking both
            sigmoid_size_to_output = hp.Choice("Sigmoid_Size_To_Output", CHOICES_SIGMOID_SIZE_TO_OUTPUT)
            sigmoid_units = OUTPUT_SIZE if sigmoid_size_to_output else processing_units
        else:
            # The two values are the same, so just use that value
            sigmoid_units = OUTPUT_SIZE
    
        model.add(tf.keras.layers.Dense(units=processing_units, activation="sigmoid", name="Sigmoid"))
        use_scaler = hp.Choice("Scaler", CHOICES_USE_SCALER)
        if use_scaler:
            model.add(tf.keras.layers.Rescaling(scale=OUTPUT_MAX, offset=0, name="Rescaler")) # Input is 0-1
    
    if use_output_limiter:
        model.add(tf.keras.layers.Dense(OUTPUT_SIZE, activation=modulo_output, name="Output_Limiter"))
    else:
        model.add(tf.keras.layers.Dense(OUTPUT_SIZE, name="Linear_Output"))

    # Compile the model
    if USE_CUSTOM_LOSS:
        loss = modulo_distance_loss
        metrics = [modulo_distance_accuracy, modulo_rounded_accuracy]
    else:
        loss = LOSS_METRIC
        metrics = [MAIN_ACCURACY_METRIC]
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    
    return model


# Run the kerastuner search for best hyperparameters
if TUNE_NETWORK:

    if USE_CUSTOM_LOSS:
        objective = kt.Objective("val_loss", direction="min")
    else:
        objective = kt.Objective(f"val_{MAIN_ACCURACY_METRIC}", direction="max")

    tuner = kt.Hyperband(
        create_model,
        objective=objective,
        max_epochs=MAX_EPOCHS_PER_MODEL,
        hyperband_iterations=HYPERBAND_ITERATIONS,
        executions_per_trial=EXECUTIONS_PER_TRIAL,
        overwrite=OVERWRITE,
        directory=TUNER_DIRECTORY,
        project_name=TUNER_PROJECT_NAME)
    tuner.search(X_train_scaled, y_train, epochs=SEARCH_FIT_EPOCHS, batch_size=BATCH_SIZE, validation_data=(X_test_scaled,y_test))
    
    best_hyper = tuner.get_best_hyperparameters(1)[0]
    print(f"Best Hyper Values: {best_hyper.values}")
    
    nn = tuner.get_best_models(1)[0]
    eval_results = nn.evaluate(X_test_scaled, y_test, verbose=2, batch_size=BATCH_SIZE )
    print(f"Best Model Loss: {eval_results[0]}, Accuracy: {eval_results[1:]}")

    nn.save("./saved_models/tuned.keras")


# Model Reload /Creation

In [63]:
if LOAD_BEST_MODEL:
    print(f"Loading model from {BEST_PATH}")
    nn = tf.keras.models.load_model(BEST_PATH)
elif BUILD_NETWORK:
    print("Building new model")
    nn = tf.keras.models.Sequential()

    nn.add(tf.keras.layers.Embedding(input_dim=CHUNK_SIZE, output_dim=PROCESSING_UNITS, name="Embedding_Input"))
    nn.add(tf.keras.layers.LSTM(PROCESSING_UNITS, name="LSTM"))
    nn.add(tf.keras.layers.Dense(units=PROCESSING_UNITS, activation="sigmoid", name="Sigmoid"))
    nn.add(tf.keras.layers.Rescaling(scale=OUTPUT_MAX, offset=0, name="Rescaler")) # Input is 0-1
    nn.add(tf.keras.layers.Dense(OUTPUT_SIZE, activation=modulo_output, name="Output_Limiter"))

else:
    print("Nothing to do here. Hopefully you got a model somewhere above...")
        
# Check the structure of the model
nn.summary()

Building new model


# Model Training

In [64]:
%%time

if TRAIN_MODEL:
    # Training checkpoint to save after each epoch, if it is a new best model:
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=BEST_PATH,
        monitor="loss",
        mode="min",
        save_best_only=True,
        save_weights_only=False,
        verbose=1)

    print(f"Training model")

    if USE_CUSTOM_LOSS:
        loss = modulo_distance_loss
        metrics = [modulo_distance_accuracy, modulo_rounded_accuracy]
    else:
        loss = LOSS_METRIC
        metrics = [MAIN_ACCURACY_METRIC]
    
    if SAVE_BEST_MODEL:
        callbacks = [model_checkpoint_callback]
    else:
        callbacks = None
    
    # Compile the Sequential model together and customize metrics
    nn.compile(loss=loss, optimizer=OPTIMIZER, metrics=metrics)
    
    # Fit the model to the training data
    fit_model = nn.fit(X_train_scaled, y_train, epochs=EPOCHS, callbacks=callbacks, batch_size=BATCH_SIZE)

nn.summary()

Training model
Epoch 1/50
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 6.5491 - modulo_distance_accuracy: 0.5426 - modulo_rounded_accuracy: 0.0190
Epoch 1: loss improved from inf to 4.02030, saving model to ./saved_models/best.keras
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 47ms/step - loss: 6.5385 - modulo_distance_accuracy: 0.5425 - modulo_rounded_accuracy: 0.0190
Epoch 2/50
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 1.9590 - modulo_distance_accuracy: 0.5218 - modulo_rounded_accuracy: 0.0194
Epoch 2: loss improved from 4.02030 to 1.89493, saving model to ./saved_models/best.keras
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 47ms/step - loss: 1.9587 - modulo_distance_accuracy: 0.5218 - modulo_rounded_accuracy: 0.0194
Epoch 3/50
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 1.8013 - modulo_distance_accuracy: 0.5217 - modulo

CPU times: user 6min 9s, sys: 1min 33s, total: 7min 42s
Wall time: 9min 53s


In [67]:
# Evaluate the model using the test data

#!!! My custom metric functions are running out of memory for some reason...
if False:    
    print("Evaluating with model.predict() ...")
    raw_pred = nn.predict(X_test_scaled, batch_size=BATCH_SIZE)
    print("A...")
    y_pred = tf.constant(np.array(raw_pred).astype(np.float64))
    print("B...")
    loss = modulo_distance_loss(y_test, y_pred)
    loss = modulo_distance_loss(tf.constant(y_test), tf.constant(y_pred))
    print("C...")
    accuracy_distance = modulo_distance_accuracy(y_test, y_pred)
    print("D...")
    accuracy_rounded = modulo_rounded_accuracy(y_test, y_pred)
    print("E...")
    print(f"Loss: {loss:0.6}, Accuracy (Distance): {accuracy_distance:0.6}, Accuracy (Rounded): {accuracy_rounded:0.6}")

    pred_pd = pd.DataFrame(y_pred.numpy())
    print(pred_pd.describe())

print("Evaluating with model.evaluate() ...")
eval_results = nn.evaluate(X_test_scaled, y_test, verbose=2, batch_size=BATCH_SIZE)
print(f"Loss: {eval_results[0]}, Accuracy: {eval_results[1:]}")

Evaluating with model.evaluate() ...
80/80 - 2s - 19ms/step - loss: 1.2489 - modulo_distance_accuracy: 0.5229 - modulo_rounded_accuracy: 0.0253
Loss: 1.2489447593688965, Accuracy: [0.5228797197341919, 0.025341644883155823]


# Model Usefulness Spot-Check

In [68]:
def decode_chunks_with_model(chunks: list[list], model, scaler, input_already_scaled = True) -> list[list]:
    if input_already_scaled:
        return model.predict(chunks)
    else:
        return model.predict(scaler.transform(chunks))

def decode_text_with_model(ciphertext: str, model, scaler) -> str:
    offset_chunks = helpers.chunkify(encoders.string_to_offsets(ciphertext), CHUNK_SIZE)
    decoded_chunks = decode_chunks_with_model(offset_chunks, model, scaler, input_already_scaled = False)
    rounded = np.rint(decoded_chunks.flatten()).astype(int)
    return encoders.offsets_to_string(rounded)

def infer_key_with_model(ciphertext: str, model, scaler) -> int:
    chunks = helpers.string_to_bytes(ciphertext, CHUNK_SIZE)
    keys = model.predict(scaler.transform(chunks))
    key = int(round(np.median(keys)))
    return key

if INFER_TEXT:    
    CHUNKS_TO_CHECK = 2
else:
    CHUNKS_TO_CHECK = 20

cipher_file_db = sid_to_c[list(sid_to_c.keys())[0]][0]
ciphertext_path = cipher_file_db.path
ciphertext = helpers.read_text_file(ciphertext_path)
ciphertext = ciphertext[0:CHUNK_SIZE * CHUNKS_TO_CHECK]
    
if INFER_TEXT:    
    print("Decoded   : ", decode_text_with_model(ciphertext, nn, X_scaler))
if INFER_KEY:
    with db.get_session() as session:
        correct_key = int(db.get_key_by_id(session, cipher_file_db.key_id).value)
    print("Correct Key: ", correct_key)
    
    inferred_key = infer_key_with_model(ciphertext, nn, X_scaler)
    print("Inferred Key: ", inferred_key)

chunks = helpers.string_to_bytes(ciphertext, CHUNK_SIZE)
print(nn.predict(scaler.transform(chunks)))

Correct Key:  21
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Inferred Key:  42
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[[45.159256]
 [42.632664]
 [42.680344]
 [42.487907]
 [42.456577]
 [41.162918]
 [41.328995]
 [42.096886]
 [41.072803]
 [40.993828]
 [41.54249 ]
 [42.19113 ]
 [42.31713 ]
 [41.716343]
 [42.775177]
 [40.85258 ]
 [45.07912 ]
 [41.935387]
 [41.788406]
 [41.155766]]
