In [39]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
import keras_tuner as kt

from credentials import SAMPLE_DB, FULL_DB
from librarian import SAMPLE_DATA_DIR

import encoders
import db_connect
import helpers

In [40]:
# --- Data source (sample-sized directory and database) ---
DATA_DIR = SAMPLE_DATA_DIR
db = db_connect.DB(SAMPLE_DB)

# --- Chunking: characters per chunk; also the model's input and output width ---
CHUNK_SIZE = 4096

# --- Train/test split ---
SPLIT_TEST_SIZE = 0.25     # 0.25 is default
SPLIT_SEED = 42

In [41]:
# Get database IDs for encoders and key types, keyed by their names.
#
# Dict comprehensions are used instead of explicit loops so that no temporary
# called `id` is bound at kernel scope; doing so would shadow the builtin id()
# for every later cell in the notebook.
with db.get_session() as session:
    encoder_ids = {
        name: db.get_encoder_id(session, name)
        for name in encoders.ALL_ENCODER_NAMES
    }
    print(f"Encoder IDs: {encoder_ids}")

    key_type_ids = {
        name: db.get_key_type_id(session, name)
        for name in encoders.KEY_NAMES
    }
    print(f"Key Type IDs: {key_type_ids}")

Encoder IDs: {'None': 1, 'Simplifier': 2, 'Caesar Cipher': 3, 'Substitution Cipher': 4, 'Enigma Machine': 5}
Key Type IDs: {'Character Offset': 1, 'Character Map': 2, 'Rotor Settings': 3}


In [42]:
# Two lookups keyed by source ID:
#   sid_to_p -> the single plaintext file record for that source
#   sid_to_c -> list of every ciphertext file record for that source
sid_to_p = {}
sid_to_c = {}

cipher_id = encoder_ids[encoders.ENCODER_CAESAR]
with db.get_session() as session:
    # Every file encrypted with the cipher under study.
    # NOTE(review): -1 is passed as the source ID; presumably "any source" -- confirm in db_connect.
    encrypted_files = db.get_files_by_source_and_encoder(session, -1, cipher_id)

    for c in encrypted_files:
        sid = c.source_id

        # First ciphertext seen for this source: resolve its plaintext once.
        if sid not in sid_to_p:
            plaintext_ids = db.get_files_by_source_and_encoder(session, sid, encoder_ids[encoders.ENCODER_SIMPLIFIER])
            if len(plaintext_ids) != 1:
                raise Exception(f"Found {len(plaintext_ids)} plaintexts for source ID {sid}; should be exactly 1")
            sid_to_p[sid] = plaintext_ids[0]

        # Group ciphertexts under their source, creating the list on first use.
        sid_to_c.setdefault(sid, []).append(c)

len(sid_to_p), len(sid_to_c)

(7, 7)

In [43]:
def string_to_bytes(text: str, chunk_size: int) -> list:
    """Convert a string into fixed-size chunks of UTF-8 byte values.

    The text is cut into consecutive slices of ``chunk_size`` characters. If
    the length is not evenly divisible by ``chunk_size``, the final chunk is
    shifted back so that it ends exactly at the end of the text; it therefore
    overlaps the previous chunk, and the whole string is still converted.

    Args:
        text: The string to convert.
        chunk_size: Number of characters (and bytes) per chunk; must be >= 1.

    Returns:
        A list of chunks, each a list of ``chunk_size`` ints in 0..255.

    Raises:
        ValueError: If ``chunk_size`` is less than 1, if ``text`` is shorter
            than ``chunk_size``, or if a slice does not encode to exactly
            ``chunk_size`` bytes (which happens when it contains characters
            that need more than one byte in UTF-8).
    """
    # A chunk size of 0 would never advance through the text (endless loop).
    if chunk_size < 1:
        raise ValueError(f"Chunk size ({chunk_size}) must be at least 1")
    if len(text) < chunk_size:
        raise ValueError(f"Chunk size ({chunk_size}) must be no greater than text length ({len(text)})")

    # Latest start position that still leaves room for a full chunk.
    last_start = len(text) - chunk_size

    chunks = []
    for start in range(0, len(text), chunk_size):
        # Pull the final chunk back so it is full-sized (may overlap the previous one).
        start = min(start, last_start)

        encoded = text[start : start + chunk_size].encode('UTF-8')
        numbered = list(encoded)

        # Slicing counts characters but encoding counts bytes; they only agree for 1-byte characters.
        if len(encoded) != chunk_size:
            raise ValueError(f"Conversion chunk size error: {len(text)}, {len(encoded)}, {len(numbered)}, {chunk_size}")
        chunks.append(numbered)

    return chunks

# Smoke test: sizes 1 and 7 divide the text evenly; sizes 2 and 6 force an overlapping final chunk
tuple(string_to_bytes("ABCDEFG", size) for size in (1, 2, 6, 7))

([[65], [66], [67], [68], [69], [70], [71]],
 [[65, 66], [67, 68], [69, 70], [70, 71]],
 [[65, 66, 67, 68, 69, 70], [66, 67, 68, 69, 70, 71]],
 [[65, 66, 67, 68, 69, 70, 71]])

In [44]:
# Build up the features (X, the cipher texts as byte values) and targets (y, the plain texts as byte values).
# Note the targets can be repeated since a plaintext can be encrypted repeatedly by different keys.
X = []
y = []

for sid, plain_file in sid_to_p.items():
    plaintext = helpers.read_text_file(plain_file.path)
    target_chunks = string_to_bytes(plaintext, CHUNK_SIZE)

    for c in sid_to_c[sid]:
        ciphertext = helpers.read_text_file(c.path)
        # Features must come from the ciphertext; chunking the plaintext here
        # would make every feature row identical to its target row.
        feature_chunks = string_to_bytes(ciphertext, CHUNK_SIZE)

        # Rows are paired by position, so both sides need the same chunk count.
        if len(target_chunks) != len(feature_chunks):
            raise Exception(f"Chunk count mismatch; {len(target_chunks)} != {len(feature_chunks)}")

        X.extend(feature_chunks)
        y.extend(target_chunks)


In [45]:
# Row counts for X and y, then the width of the first feature row and first target row
tuple(len(seq) for seq in (X, y, X[0], y[0]))

(2772, 2772, 4096, 4096)

In [46]:
# Split the preprocessed data into a training and testing dataset.
# The targets are the `y` list built alongside X; SPLIT_SEED fixes the shuffle
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=SPLIT_TEST_SIZE, random_state=SPLIT_SEED)
print( len(X), len(y), len(X_train), len(X_test), len(y_train), len(y_test) )

2772 2772 2079 693 2079 693


In [47]:
# Fit a StandardScaler on the training features only; fit() returns the fitted
# scaler itself, so `X_scaler` and `scaler` refer to the same object.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both splits
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

X_train_scaled.shape, X_test_scaled.shape

((2079, 4096), (693, 4096))

In [51]:
in_shape = (CHUNK_SIZE,)

# Hidden-layer grid: one Dense layer is added per (unit count, activation) pair.
# NOTE(review): with a single 1-unit layer every output is driven by one hidden
# value -- looks like a placeholder configuration; confirm before tuning.
unit_counts = [1]
activations = ["relu"]

nn = tf.keras.models.Sequential()

# Input: one value per chunk position
nn.add(tf.keras.Input(shape=in_shape))

# Hidden layers, in the order: for each unit count, for each activation
hidden_specs = [(units, activation) for units in unit_counts for activation in activations]
for units, activation in hidden_specs:
    nn.add(tf.keras.layers.Dense(units=units, activation=activation))

# Output: one value per chunk position, no activation
nn.add(tf.keras.layers.Dense(units=CHUNK_SIZE))

# Show the resulting architecture
nn.summary()

In [None]:
# Compile the Sequential model: mean squared error as the loss, mean absolute
# error (MAE) as the tracked metric.
nn.compile(loss="mean_squared_error", optimizer="adam", metrics=["mae"])

# The targets are nested Python lists; convert them to tensors so Keras gets
# an array-like input of shape (rows, CHUNK_SIZE), matching the scaled features.
y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.float32)
y_test_tensor = tf.convert_to_tensor(y_test, dtype=tf.float32)

# Fit the model to the training data
fit_model = nn.fit(X_train_scaled, y_train_tensor, epochs=3)

# Evaluate the model using the test data. With metrics=["mae"] the second
# returned value is the mean absolute error, not an accuracy.
model_loss, model_mae = nn.evaluate(X_test_scaled, y_test_tensor, verbose=2)
print(f"Loss: {model_loss}, MAE: {model_mae}")