In [26]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import tensorflow as tf
import keras_tuner as kt
import pathlib
import random
import pandas as pd
import sys

from credentials import CONNECTION_INFO
from constants import *

import encoders
import db_connect
import helpers

# Callbacks for use with TensorFlow
from tf_helpers import modulo_output, modulo_distance_loss, modulo_distance_accuracy, modulo_rounded_accuracy, initialize_save_best

## Config
This notebook has a lot of options to adjust, most of which are controlled here.

In [27]:
# --- Cipher and sample geometry ----------------------------------------------
ENCODER = encoders.ENCODER_CAESAR
CHUNK_SIZE = 512
PROCESSING_UNITS = CHUNK_SIZE // 16  # Unit count handed to the LSTM layer; previously CHUNK_SIZE // 4

# Whether to run some (potentially slow) debug checks
EXTRA_CHECKS = True

# --- What the network predicts -----------------------------------------------
# Text inference isn't really working: it is unclear how to combine the
# time-distributed results. Key inference is always the opposite setting.
INFER_TEXT = False
INFER_KEY = not INFER_TEXT

MAIN_ACCURACY_METRIC = "mae"  # Identical in both modes
if INFER_KEY:
    LOSS_METRIC, OPTIMIZER = "mae", "adamax"

    # One output for a Caesar key, one output per charset entry for a substitution key
    if ENCODER == encoders.ENCODER_CAESAR:
        OUTPUT_SIZE = 1
    elif ENCODER == encoders.ENCODER_SUBST:
        OUTPUT_SIZE = len(encoders.CHARSET)
    else:
        raise Exception(f"Unsupported encoder {ENCODER}")
else:
    LOSS_METRIC, OPTIMIZER = "mean_squared_error", "sgd"
    OUTPUT_SIZE = CHUNK_SIZE

# --- Data volume and split -----------------------------------------------------
ENCRYPTED_FILE_LIMIT = -1  # -1 to disable limit

# Start from this fraction; if the train or test count would exceed its maximum it is
# reduced. Note 0.75 is the default.
BASE_TRAIN_PCT = 0.75
MAX_TRAIN_COUNT = -1  # -1 to disable; some setups start running out of memory around 100K
MAX_TEST_COUNT = -1   # -1 to disable
SPLIT_SEED = 42

# --- Model persistence -----------------------------------------------------------
LOAD_BEST_MODEL = False  # If False, a new model will be created from scratch
SAVE_BEST_MODEL = True
BEST_PATH = './saved_models/best.keras'

# --- Tuner vs. hard-coded network --------------------------------------------------
TUNE_NETWORK = False
TUNE_QUICKLY = False  # Set True to sanity check the model builder
BUILD_NETWORK = not TUNE_NETWORK
TRAIN_MODEL = not (TUNE_NETWORK or LOAD_BEST_MODEL)

TUNER_DIRECTORY = "tuner_projects"
TUNER_PROJECT_NAME = "KT"

# --- Training ------------------------------------------------------------------------
EPOCHS = 50
# Keras' default batch size is 32 -- going higher speeds things up a LOT, but may cause
# memory problems. Scales inversely with CHUNK_SIZE and never drops below 32.
BATCH_SIZE = int(max(32, round(256 * (512/CHUNK_SIZE))))
SCALE = True

(CHUNK_SIZE, PROCESSING_UNITS, BATCH_SIZE, OUTPUT_SIZE)

(512, 32, 256, 1)

# Data Retrieval and Structuring

In [28]:
db = db_connect.DB(CONNECTION_INFO)

with db.get_session() as session:
    # Database IDs for the encoders and key types
    encoder_ids, key_type_id = db.get_id_maps(session)

    # sid_to_p: source ID -> details of its (single) plaintext file
    # sid_to_c: source ID -> details of the corresponding ciphertext file(s)
    sid_to_p, sid_to_c = db.get_source_maps(
        session,
        ENCRYPTED_FILE_LIMIT,
        encoder_ids[ENCODER],
        test_only=False,
    )

    # Features (X, the cipher texts as offsets) plus both candidate targets:
    # the keys and the plain texts as offsets.
    X, y_keys, y_texts = db.get_features_and_targets(
        session, sid_to_p, sid_to_c, ENCODER, CHUNK_SIZE
    )

X = np.array(X)

# Pick the target that matches the configured inference mode
if INFER_KEY:
    y = np.array(y_keys)
if INFER_TEXT:
    y = np.array(y_texts)

(
    len(sid_to_p),
    len(sid_to_c),
    X.shape,
    y.shape,
    sys.getsizeof(X),
    sys.getsizeof(y),
)

(35, 35, (95958, 512), (95958,), 393044096, 767776)

In [29]:
# Debugging...

# Get ALL the texts in one big string each, for debugging (substring searches below).
# The pieces are gathered and joined once instead of growing the strings with repeated
# `+=`, which can re-copy the accumulated text on every append.
# Order is unchanged: sources in sid_to_p order, and for the ciphertexts each source's
# files in sid_to_c[sid] order.
all_plaintexts = "".join(
    helpers.read_text_file(sid_to_p[sid].path)
    for sid in sid_to_p
)
all_ciphertexts = "".join(
    helpers.read_text_file(c.path)
    for sid in sid_to_p
    for c in sid_to_c[sid]
)

# Make sure specified text occurs somewhere in the texts.
# These raise exceptions if not found.
def check_in_plaintext(to_check: str):
    """Raise an Exception unless `to_check` occurs somewhere in `all_plaintexts`."""
    found = to_check in all_plaintexts
    if found:
        return
    raise Exception(f"Plaintext not found: {to_check}")

def check_in_ciphertext(to_check: str):
    """Raise an Exception unless `to_check` occurs somewhere in `all_ciphertexts`."""
    found = to_check in all_ciphertexts
    if found:
        return
    raise Exception(f"Ciphertext not found: {to_check}")

if EXTRA_CHECKS:
    # Spot-check about 1% of the samples: both the plaintext target and the ciphertext
    # feature of a randomly chosen row must occur in the texts read from disk.
    checks = round(len(X) * 0.01)
    print(f"Checking {checks} strings")
    last_row = len(X) - 1
    for _ in range(checks):
        i = random.randint(0, last_row)
        plain_chunk = encoders.offsets_to_string(y_texts[i].astype(int))
        check_in_plaintext(plain_chunk)
        cipher_chunk = encoders.offsets_to_string(X[i].astype(int))
        check_in_ciphertext(cipher_chunk)

(len(all_plaintexts), len(all_ciphertexts))

Checking 960 strings


(8180147, 49080882)

In [30]:
# Split the preprocessed data into a training and a testing dataset.
# Note the "test_only" files were excluded above; they will be used for later validation.

def _cap_count(label: str, count: int, maximum: int) -> int:
    """Report `count` and return it, reduced to `maximum` when a limit is set (> -1) and exceeded."""
    if maximum > -1 and count > maximum:
        print(f"{label} count would be {count}")
        count = int(maximum)
    print(f"{label} count is {count}")
    return count

train_count = _cap_count("Train", int(round(len(y) * BASE_TRAIN_PCT)), MAX_TRAIN_COUNT)
test_count = _cap_count("Test", len(y) - train_count, MAX_TEST_COUNT)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=train_count,
    test_size=test_count,
    random_state=SPLIT_SEED,
)

if EXTRA_CHECKS:
    # Spot-check both splits: randomly chosen rows must still be real ciphertext
    checks = max(10, round(min(len(X_train), len(X_test)) * 0.01))
    print(f"Checking {checks} strings")
    for _ in range(checks):
        for split in (X_train, X_test):
            i = random.randint(0, len(split) - 1)
            check_in_ciphertext(encoders.offsets_to_string(split[i].astype(int)))
else:
    # The pre-split data sets are no longer needed, and take up a lot of memory,
    # so get rid of them
    del X, y, y_keys, y_texts

Train count is 71968
Test count is 23990
Checking 240 strings


In [31]:
if SCALE:
    # Fit a StandardScaler on the training features only, then apply it to both splits.
    # `scaler` and `X_scaler` are both kept because later cells use both names.
    scaler = StandardScaler()
    X_scaler = scaler.fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    if EXTRA_CHECKS:
        # Round-trip check: un-scaling a random row must give back real ciphertext
        checks = max(10, round(min(len(X_train), len(X_test)) * 0.01))
        print(f"Checking {checks} strings")
        for _ in range(checks):
            for scaled_split in (X_train_scaled, X_test_scaled):
                i = random.randint(0, len(scaled_split) - 1)
                scaled = scaled_split[i]
                unscaled = X_scaler.inverse_transform([scaled]).round().astype(int)[0]
                check_in_ciphertext(encoders.offsets_to_string(unscaled))
else:
    # Leave the inputs unscaled
    X_train_scaled, X_test_scaled = X_train, X_test

# Show the shapes plus the first few values of one row from each split
to_show = min(16, CHUNK_SIZE)
(
    X_train_scaled.shape,
    X_test_scaled.shape,
    X_train_scaled[0][0:to_show],
    X_test_scaled[0][0:to_show],
)

Checking 240 strings


((71968, 512),
 (23990, 512),
 array([ 0.14185034, -0.21097266,  0.84917754,  0.49584046, -1.27538753,
        -0.21097319, -1.26778865,  1.20328198,  0.13332543, -0.91000704,
        -0.56356898, -1.27079797, -0.21048943,  0.49693745, -1.61955243,
        -0.91552221]),
 array([ 0.494275  ,  1.20021122,  0.14344914,  0.49584046, -0.92249056,
        -1.27108807,  0.495873  , -1.61867667, -1.27803955, -0.20391186,
         0.49319444, -0.56454613,  1.20133317,  0.49693745,  0.85075048,
         1.20181321]))

In [32]:
# Reshape the data as required for the model:
#   features -> (samples, CHUNK_SIZE, 1), targets -> (samples, OUTPUT_SIZE, 1)
# NOTE(review): the model-building cell declares its Input as (1, CHUNK_SIZE), which is
# the transpose of the feature layout produced here -- confirm which one is intended.

print(f"Original shapes: {X_train.shape}, {X_test.shape}, {y_train.shape}, {y_test.shape}")

feature_shape = (-1, CHUNK_SIZE, 1)
target_shape = (-1, OUTPUT_SIZE, 1)

X_train, X_train_scaled = X_train.reshape(feature_shape), X_train_scaled.reshape(feature_shape)
X_test, X_test_scaled = X_test.reshape(feature_shape), X_test_scaled.reshape(feature_shape)
y_train, y_test = y_train.reshape(target_shape), y_test.reshape(target_shape)

print(f"Final    shapes: {X_train.shape}, {X_train_scaled.shape}, {X_test.shape}, {X_test_scaled.shape}, {y_train.shape}, {y_test.shape}")

Original shapes: (71968, 512), (23990, 512), (71968,), (23990,)
Final    shapes: (71968, 512, 1), (71968, 512, 1), (23990, 512, 1), (23990, 512, 1), (71968, 1, 1), (23990, 1, 1)


# Hyperband Tuning

In [33]:
import model_tuner

# Search budget. TUNE_QUICKLY swaps in a tiny budget to sanity check the model builder.
if TUNE_QUICKLY:
    MAX_EPOCHS_PER_MODEL = 3
    HYPERBAND_ITERATIONS = 1
    EXECUTIONS_PER_TRIAL = 1
    SEARCH_FIT_EPOCHS = 4
else:
    # Meant to get a decent idea of the parameters, not to create a final model.
    # Behaves oddly below 3.
    MAX_EPOCHS_PER_MODEL = 20
    # "Number of times to iterate over the full Hyperband algorithm"
    HYPERBAND_ITERATIONS = 2
    # Each execution trains from scratch
    EXECUTIONS_PER_TRIAL = 2
    # Epochs for each attempt to do a fit (relationship to MAX_EPOCHS_PER_MODEL is unclear)
    SEARCH_FIT_EPOCHS = 20

# The hope is to be able to interrupt a run and resume it later.
# NOTE(review): KerasTuner documents overwrite=True as discarding an existing project;
# resuming would need False -- confirm the intent.
OVERWRITE = True

input_shape = (None, 1, CHUNK_SIZE)
mr_t = model_tuner.ModelTuner(input_shape, OUTPUT_SIZE, CHUNK_SIZE, BATCH_SIZE)

# All-encompassing optimization parameter choices, kept for reference.
# Do not try to use all of them at once...
mr_t.CHOICES_PROCESSING_UNITS = [1, CHUNK_SIZE // 16, CHUNK_SIZE // 4, CHUNK_SIZE, CHUNK_SIZE * 2]
mr_t.CHOICES_ACTIVATIONS = [
    "elu", "gelu", "hard_sigmoid", "hard_silu", "hard_swish", "leaky_relu", "linear",
    "log_softmax", "mish", "relu", "relu6", "selu", "sigmoid", "silu", "softmax",
    "softplus", "softsign", "swish", "tanh",
]
mr_t.CHOICES_FANCY_TOPO = ["GRU", "RNN", "LSTM", "GRU-RNN", "GRU-LSTM", "GRU-RNN-LSTM"]
mr_t.CHOICES_USE_OUTPUT_LIMITER = [True, False]  # Prefers True
mr_t.CHOICES_OPTIMIZER = ["adamax", "sgd", "RMSProp"]

# Narrow down the choices as needed; these assignments are the ones that take effect.
mr_t.CHOICES_PROCESSING_UNITS = [
    1, 2, CHUNK_SIZE//16, CHUNK_SIZE//4, CHUNK_SIZE//2, CHUNK_SIZE, CHUNK_SIZE*2,
]
mr_t.CHOICES_ACTIVATIONS = ["tanh", "sigmoid"]
mr_t.CHOICES_FANCY_TOPO = ["LSTM"]
mr_t.CHOICES_USE_OUTPUT_LIMITER = [True]
mr_t.CHOICES_OPTIMIZER = ["adamax"]

# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    """Model-builder callback for the tuner: delegates to `mr_t.CreateModel(hp)`."""
    built_model = mr_t.CreateModel(hp)
    return built_model

# Run the kerastuner search for best hyperparameters
if TUNE_NETWORK:
    if USE_CUSTOM_METRICS:
        # Accuracy-style custom metric: higher is better
        objective = kt.Objective("modulo_distance_accuracy", direction="max")
    else:
        # Error-style metrics such as "mae" get smaller as the model improves, so they
        # must be minimised; maximising them would make the tuner pick the worst trial.
        # Anything not recognised as an error metric is still treated as higher-is-better.
        lower_is_better_metrics = {
            "loss", "mae", "mse", "mape", "msle", "rmse",
            "mean_absolute_error", "mean_squared_error",
            "mean_absolute_percentage_error", "mean_squared_logarithmic_error",
            "root_mean_squared_error",
        }
        metric_name = f"{MAIN_ACCURACY_METRIC}"
        direction = "min" if metric_name.lower() in lower_is_better_metrics else "max"
        objective = kt.Objective(metric_name, direction=direction)

    tuner = kt.Hyperband(
        create_model,
        objective=objective,
        max_epochs=MAX_EPOCHS_PER_MODEL,
        hyperband_iterations=HYPERBAND_ITERATIONS,
        executions_per_trial=EXECUTIONS_PER_TRIAL,
        overwrite=OVERWRITE,
        directory=TUNER_DIRECTORY,
        project_name=TUNER_PROJECT_NAME)
    tuner.search(X_train_scaled, y_train, epochs=SEARCH_FIT_EPOCHS, batch_size=BATCH_SIZE)

    # Report the winning hyperparameters
    best_hyper = tuner.get_best_hyperparameters(1)[0]
    print(f"Best Hyper Values: {best_hyper.values}")

    # Evaluate the best model on the held-out test split and keep a copy of it
    nn = tuner.get_best_models(1)[0]
    eval_results = nn.evaluate(X_test_scaled, y_test, verbose=2, batch_size=BATCH_SIZE)
    print(f"Best Model Loss: {eval_results[0]}, Accuracy: {eval_results[1:]}")

    nn.save("./saved_models/tuned.keras")

# Model Reload / Creation

In [34]:
if BUILD_NETWORK:
    print("Building new model")
    nn = tf.keras.models.Sequential()

    # Each sample is declared as 1 timestep of CHUNK_SIZE features (batch dimension dropped).
    # NOTE(review): the reshape cell lays the features out as (samples, CHUNK_SIZE, 1),
    # the transpose of this -- confirm which layout is intended.
    input_shape = (None, 1, CHUNK_SIZE)
    nn.add(tf.keras.Input(shape=input_shape[1:], name="Input_Layer"))

    # Single LSTM layer; return_sequences=True keeps the time axis in its output
    activation_A = "tanh"
    recurrent_activation_A = "sigmoid"
    nn.add(tf.keras.layers.LSTM(
        PROCESSING_UNITS, return_sequences=True, activation=activation_A, recurrent_activation=recurrent_activation_A,
        name=f"A_LSTM_{activation_A}_{recurrent_activation_A}"))

    # Output layer using the project's custom modulo_output activation
    nn.add(tf.keras.layers.Dense(units = OUTPUT_SIZE, activation=modulo_output, name='Modulo_Layer'))

# Check the structure of the model.
# summary() prints the table itself and returns None, so it is not wrapped in print().
print(f"Input shape: {nn.input_shape}, Output shape: {nn.output_shape}")
nn.summary()

Building new model
Input shape: (None, 1, 512), Output shape: (None, 1, 1)


None


# Model Training

In [None]:
# Reset the "best" score and set up a callback to save the model as it improves during training.
# If you're manually training iteratively, comment this out to preserve the best-ness:
model_checkpoint_callback = initialize_save_best(BEST_PATH)

# It can be helpful to load the best, train some more, and try to improve it:
if LOAD_BEST_MODEL:
    # pathlib is already imported by the notebook; `os` is not.
    if pathlib.Path(BEST_PATH).exists():
        print(f"Loading model from {BEST_PATH}")
        # The custom loss/metric/activation functions must be supplied for deserialization
        nn = tf.keras.models.load_model(BEST_PATH,
            custom_objects={
                'modulo_distance_loss': modulo_distance_loss,
                'modulo_distance_accuracy': modulo_distance_accuracy,
                'modulo_rounded_accuracy': modulo_rounded_accuracy,
                'modulo_output': modulo_output
        })
    else:
        # Make the fallback visible instead of silently carrying on
        print(f"No saved model found at {BEST_PATH}; continuing with the model already in memory")

# Train the model
if TRAIN_MODEL:
    # Decide what metrics to use
    if USE_CUSTOM_METRICS:
        loss = modulo_distance_loss
        metrics = [modulo_distance_accuracy, modulo_rounded_accuracy]
    else:
        loss = LOSS_METRIC
        metrics = [MAIN_ACCURACY_METRIC]

    # summary() prints the table itself and returns None, so it is not wrapped in print()
    nn.summary()
    print("Training model")

    # Only checkpoint the best model when asked to
    callbacks = [model_checkpoint_callback] if SAVE_BEST_MODEL else None

    # Compile the Sequential model together and customize metrics
    nn.compile(loss=loss, optimizer=OPTIMIZER, metrics=metrics)

    # Fit the model to the training data
    fit_model = nn.fit(X_train_scaled, y_train, epochs=EPOCHS, callbacks=callbacks, batch_size=BATCH_SIZE)

print(f"Input shape: {nn.input_shape}, Output shape: {nn.output_shape}")
nn.summary()

None
Training model
Epoch 1/50
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 35ms/step - loss: 3.1342 - mae: 3.1342
Epoch 2/50
[1m  3/282[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m10s[0m 37ms/step - loss: 2.0141 - mae: 2.0141

  self._save_model(epoch=epoch, batch=None, logs=logs)


[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 38ms/step - loss: 1.7428 - mae: 1.7428
Epoch 3/50
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 37ms/step - loss: 1.3499 - mae: 1.3499
Epoch 4/50
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 29ms/step - loss: 1.1644 - mae: 1.1644
Epoch 5/50
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 33ms/step - loss: 1.0479 - mae: 1.0479
Epoch 6/50
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 33ms/step - loss: 0.9906 - mae: 0.9906
Epoch 7/50
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 31ms/step - loss: 0.9484 - mae: 0.9484
Epoch 8/50
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 34ms/step - loss: 0.8917 - mae: 0.8917
Epoch 9/50
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 35ms/step - loss: 0.8200 - mae: 0.8200
Epoch 10/50
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3

In [None]:
# Evaluate the model using the test data.
# Predicting the whole test set can take a lot of memory, so the evaluation runs on a
# leading slice of at most MAX_TEST_SUBSET rows (a non-positive maximum disables the cap).
TEST_SET_SIZE = X_test_scaled.shape[0]
MAX_TEST_SUBSET = 1000
if MAX_TEST_SUBSET > 0:
    TEST_SUBSET_SIZE = min(TEST_SET_SIZE, MAX_TEST_SUBSET)
else:
    TEST_SUBSET_SIZE = TEST_SET_SIZE
X_test_scaled_subset = X_test_scaled[:TEST_SUBSET_SIZE, :, :]
y_test_subset = y_test[:TEST_SUBSET_SIZE, :, :]

# Sometimes, for troubleshooting, the training set is used instead; it should produce
# more accurate predictions. Its subset is capped the same way.
TRAIN_SET_SIZE = X_train_scaled.shape[0]
MAX_TRAIN_SUBSET = MAX_TEST_SUBSET
if MAX_TRAIN_SUBSET > 0:
    TRAIN_SUBSET_SIZE = min(TRAIN_SET_SIZE, MAX_TRAIN_SUBSET)
else:
    TRAIN_SUBSET_SIZE = TRAIN_SET_SIZE
X_train_scaled_subset = X_train_scaled[:TRAIN_SUBSET_SIZE, :, :]
y_train_subset = y_train[:TRAIN_SUBSET_SIZE, :, :]

# NOTE(review): `input` shadows the Python builtin of the same name for the rest of the
# kernel session; the name is kept because the following code reads it.
use_training_data = False
if not use_training_data:
    print("Using test data as input")
    input, expected = X_test_scaled_subset, y_test_subset
else:
    print("Using training data as input; results are not valid for accuracy but may be informative about function")
    input, expected = X_train_scaled_subset, y_train_subset

# Key-inference diagnostics: first sanity-check the expected keys against the data,
# then probe the raw model output at a few positions.
# Still trying to figure out where to actually get the keys...
if INFER_KEY:
    # Check key accuracy... for test data, the key seems right but the texts don't quite line up.
    # Neither plain nor ciphertexts quite match what's in the files.
    # And at least in the test data, the key isn't right for the file
    #
    # For each of 100 randomly chosen samples: undo the scaling, turn the offsets back into
    # a string, decode it with the expected key, and require the result to occur in the
    # plaintexts (check_in_plaintext raises otherwise).
    # NOTE(review): X_scaler is only assigned when SCALE is True, and decode_caesar is
    # called whatever ENCODER is set to -- confirm both are acceptable here.
    print(f"Input: {input.shape}, Output: {expected.shape}")
    checks_to_do = 100
    for r in range(checks_to_do):
        which_one = random.randint(0, input.shape[0]-1)
        # First (and only) channel of the chosen sample
        c_offsets_scaled = input[which_one,:,0]
        # inverse_transform takes a batch, so wrap the sample in a list and unwrap row 0
        c_offsets = X_scaler.inverse_transform([c_offsets_scaled])[0,:].round().astype(int)
        c_str = encoders.offsets_to_string(c_offsets)
        # The expected key is read from position [sample, 0, 0] and rounded to an int
        k_float = expected[which_one, 0, 0]
        k = int(round(k_float))
        p = encoders.decode_caesar(c_str, k)
        check_in_plaintext(p)

    # Look for a good key in the output: score the prediction taken at the first, middle
    # and last position along axis 1 of the model output.
    # NOTE(review): indexing up to CHUNK_SIZE-1 assumes the model emits one prediction per
    # input position; the model-building cell reports an output shape of (None, 1, 1), in
    # which case the non-zero offsets are out of range -- confirm.
    raw_predicted = nn.predict(input, batch_size=BATCH_SIZE)
    offsets = [0, CHUNK_SIZE//2, CHUNK_SIZE-1]
    for offset in offsets:
        predicted = raw_predicted[:, offset, 0].astype(np.float64)
        loss = modulo_distance_loss(expected, predicted)
        accuracy_distance = modulo_distance_accuracy(expected, predicted)
        accuracy_rounded = modulo_rounded_accuracy(expected, predicted)
        print(f"Offset: {offset:06}, Loss: {loss:0.6f}, Accuracy (Distance): {accuracy_distance:0.6f}, Accuracy (Rounded): {accuracy_rounded:0.6f}")
    

# Report model quality two ways: manually from predict() (custom metrics only) and via
# evaluate(). The `if True:` wrapper is an always-on switch for the whole section.
if True:
    if USE_CUSTOM_METRICS:
        print("Evaluating with model.predict() ...")    
        raw_pred = nn.predict(input, batch_size=BATCH_SIZE)
        
        print(f"raw_pred: {raw_pred.shape}")
        if INFER_KEY:
            # Take the prediction at the last position along axis 1 as the key.
            # Alternative that was tried: average together all the predictions.
            #!!! Not confident here
            #y_pred = np.array([np.mean(raw_pred[x,:,:]) for x in range(raw_pred.shape[0])]).astype(np.float64)
            # NOTE(review): index CHUNK_SIZE-1 assumes raw_pred has CHUNK_SIZE entries on
            # axis 1; the model-building cell reports output shape (None, 1, 1) -- confirm.
            y_pred = raw_pred[:, CHUNK_SIZE-1, 0].astype(np.float64)
        else:
            # This is probably wrong...
            y_pred = raw_pred[:,1,:]
        print(f"y_pred: {y_pred.shape}")
    
        # My custom loss and accuracy functions are running out of memory for decoded texts for some reason...
        # so they are only applied when inferring keys.
        if INFER_KEY:
            loss = modulo_distance_loss(expected, y_pred)
            accuracy_distance = modulo_distance_accuracy(expected, y_pred)
            accuracy_rounded = modulo_rounded_accuracy(expected, y_pred)
            print(f"Loss: {loss:0.6}, Accuracy (Distance): {accuracy_distance:0.6}, Accuracy (Rounded): {accuracy_rounded:0.6}")
    
        # Compare the distribution of inferred keys against the true keys
        if INFER_KEY:
            print(f"y_pred: {y_pred.shape}, expected: {expected.shape}")
            pred_pd = pd.DataFrame(y_pred)
            true_pd = pd.DataFrame(expected[:,0,0])
            print("Inferred key distribution:\n", pred_pd.describe())
            print("Inferred key value counts:\n", pred_pd[0].round().value_counts())
            print("True key distribution    :\n", true_pd.describe())
            print("True key value counts    :\n", true_pd[0].round().value_counts())
    
    # I don't think evaluate() is handling the plethora of keys output by the model. These metrics are probably quite wrong:
    # NOTE(review): this always evaluates the test subset, even when use_training_data
    # selected the training subset for `input` / `expected` above -- confirm that is intended.
    print("Evaluating with model.evaluate() ...")
    eval_results = nn.evaluate(X_test_scaled_subset, y_test_subset, verbose=2, batch_size=BATCH_SIZE)
    print(f"PROBABLY INCORRECT: Loss: {eval_results[0]}, Accuracy: {eval_results[1:]}")

# Model Usefulness Spot-Check

In [None]:
# These functions assume Caesar cipher, and need to be updated after shape change

def decode_chunks_with_model(chunks: list[list], model, scaler, input_already_scaled = True) -> list[list]:
    """Run `model.predict` on `chunks`, scaling them with `scaler` first unless told they already are.

    Args:
        chunks: The samples to feed to the model.
        model: Object exposing `predict(...)`.
        scaler: Object exposing `transform(...)`; only used when `input_already_scaled` is False.
        input_already_scaled: True when `chunks` can be passed to the model as-is.

    Returns:
        Whatever `model.predict` returns for the (possibly scaled) chunks.
    """
    model_input = chunks if input_already_scaled else scaler.transform(chunks)
    return model.predict(model_input)

def decode_text_with_model(ciphertext: str, model, scaler) -> str:
    """Decode `ciphertext` by letting the model predict plaintext offsets.

    The text is converted to offsets, split into CHUNK_SIZE-long chunks, scaled and
    predicted via `decode_chunks_with_model`; the predictions are flattened, rounded to
    the nearest integer and converted back into a string.
    """
    cipher_offsets = encoders.string_to_offsets(ciphertext)
    chunked_offsets = helpers.chunkify(cipher_offsets, CHUNK_SIZE)
    predictions = decode_chunks_with_model(chunked_offsets, model, scaler, input_already_scaled=False)
    nearest_offsets = np.rint(predictions.flatten()).astype(int)
    return encoders.offsets_to_string(nearest_offsets)

def infer_key_with_model(ciphertext: str, model, scaler) -> float:
    """Estimate the key for `ciphertext` as the mean of all model predictions.

    The text is split into CHUNK_SIZE-long chunks with `helpers.string_to_bytes`, scaled,
    reshaped to (-1, CHUNK_SIZE, 1) and passed to `model.predict`.

    Args:
        ciphertext: The encrypted text to analyse.
        model: Object exposing `predict(...)`.
        scaler: Fitted scaler exposing `transform(...)`.

    Returns:
        The mean over every predicted value, as a plain Python float. It is not rounded,
        so callers that need an integer key must round it themselves.
    """
    chunks = helpers.string_to_bytes(ciphertext, CHUNK_SIZE)
    scaled_chunks = scaler.transform(chunks)
    model_input = np.array(scaled_chunks).reshape((-1, CHUNK_SIZE, 1))
    keys = model.predict(model_input)
    # np.mean yields a NumPy scalar; convert it so the value matches the annotation
    return float(np.mean(keys))

# Text inference is spot-checked on fewer chunks than key inference
CHUNKS_TO_CHECK = 2 if INFER_TEXT else 20

# Use the start of the first ciphertext file of the first source as the sample
first_source_id = next(iter(sid_to_c))
cipher_file_db = sid_to_c[first_source_id][0]
ciphertext_path = cipher_file_db.path
ciphertext = helpers.read_text_file(ciphertext_path)
ciphertext = ciphertext[0:CHUNK_SIZE * CHUNKS_TO_CHECK]

if INFER_TEXT:
    print("Decoded   : ", decode_text_with_model(ciphertext, nn, X_scaler))
if INFER_KEY:
    # Look up the key this file was really encrypted with
    with db.get_session() as session:
        correct_key = db.get_key_by_id(session, cipher_file_db.key_id).value
    print("Correct Key: ", correct_key)

    inferred_key = infer_key_with_model(ciphertext, nn, X_scaler)
    print("Inferred Key: ", inferred_key)

    # Show the spread of the per-chunk predictions behind that single number.
    # X_scaler is used throughout this cell; the scaling cell also binds the same fitted
    # object to `scaler`.
    # NOTE(review): the training features are described as offsets, while this path uses
    # helpers.string_to_bytes -- confirm both produce the same encoding.
    chunks = helpers.string_to_bytes(ciphertext, CHUNK_SIZE)
    scaled_chunks = X_scaler.transform(chunks)
    pred = nn.predict(scaled_chunks.reshape((-1, CHUNK_SIZE, 1)))

    pred_df = pd.DataFrame(pred.flatten())
    print(pred_df.describe(), pred_df[0].round().value_counts())
