### Character level RNN to decrypt text encoded with Caesar cipher

-> Preprocess data: Tokenization chars, padding sequences, mapping text to integers
-> Model design
-> predict

In [1]:
#Data loading
import os

def load_data(path):
    input_file = os.path.join(path)
    with open(input_file,'r') as f:
        data=f.read()
    
    return data.split('\n')



In [2]:
codes = load_data('datasets/cipher.txt')
plain_text = load_data('datasets/plaintext.txt')

In [4]:
print(codes[:3])
print(plain_text[:3])

['YMJ QNRJ NX MJW QJFXY QNPJI KWZNY , GZY YMJ GFSFSF NX RD QJFXY QNPJI .', 'MJ XFB F TQI DJQQTB YWZHP .', 'NSINF NX WFNSD IZWNSL OZSJ , FSI NY NX XTRJYNRJX BFWR NS STAJRGJW .']
['THE LIME IS HER LEAST LIKED FRUIT , BUT THE BANANA IS MY LEAST LIKED .', 'HE SAW A OLD YELLOW TRUCK .', 'INDIA IS RAINY DURING JUNE , AND IT IS SOMETIMES WARM IN NOVEMBER .']


In [8]:
#Data processing

from tensorflow.keras.preprocessing.text import Tokenizer

def tokenizer(x):
    x_tk = Tokenizer(char_level=True)
    x_tk.fit_on_texts(x)
    return x_tk.texts_to_sequences(x), x_tk

In [13]:
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']

text_tokenized, text_tokenizer = tokenizer(text_sentences)


print('word indexes: ' + str(text_tokenizer.word_index))


for sample_i, (sent,token_sent) in enumerate(zip(text_sentences,text_tokenized)):
    print('\n')
    print('Sequence {} in x'.format(sample_i+1))
    print(' Input: {}'.format(sent))
    print('Output: {}'.format(token_sent))

word indexes: {' ': 1, 'e': 2, 'o': 3, 't': 4, 'i': 5, 's': 6, 'h': 7, 'r': 8, 'y': 9, 'u': 10, 'c': 11, 'n': 12, 'a': 13, 'p': 14, '.': 15, 'q': 16, 'k': 17, 'b': 18, 'w': 19, 'f': 20, 'x': 21, 'j': 22, 'm': 23, 'v': 24, 'l': 25, 'z': 26, 'd': 27, 'g': 28, ',': 29}


Sequence 1 in x
 Input: The quick brown fox jumps over the lazy dog .
Output: [4, 7, 2, 1, 16, 10, 5, 11, 17, 1, 18, 8, 3, 19, 12, 1, 20, 3, 21, 1, 22, 10, 23, 14, 6, 1, 3, 24, 2, 8, 1, 4, 7, 2, 1, 25, 13, 26, 9, 1, 27, 3, 28, 1, 15]


Sequence 2 in x
 Input: By Jove , my quick study of lexicography won a prize .
Output: [18, 9, 1, 22, 3, 24, 2, 1, 29, 1, 23, 9, 1, 16, 10, 5, 11, 17, 1, 6, 4, 10, 27, 9, 1, 3, 20, 1, 25, 2, 21, 5, 11, 3, 28, 8, 13, 14, 7, 9, 1, 19, 3, 12, 1, 13, 1, 14, 8, 5, 26, 2, 1, 15]


Sequence 3 in x
 Input: This is a short sentence .
Output: [4, 7, 5, 6, 1, 5, 6, 1, 13, 1, 6, 7, 3, 8, 4, 1, 6, 2, 12, 4, 2, 12, 11, 2, 1, 15]


In [14]:
#pad the data (post padding)
from tensorflow.keras.preprocessing.sequence import pad_sequences

def pad(x,length=None):
    if length is None:
        length=max([len(sentence) for sentence in x])
    return pad_sequences(x,maxlen=length,padding='post')



In [16]:
import numpy as np
test_pad = pad(text_tokenized)

for sample_i, (token_sent,pad_sent) in enumerate(zip(text_tokenized,test_pad)):
    print('\n')
    print('Sequence {} in x'.format(sample_i+1))
    print(' Input: {}'.format(np.array(token_sent)))
    print(' Output: {}'.format(pad_sent))



Sequence 1 in x
 Input: [ 4  7  2  1 16 10  5 11 17  1 18  8  3 19 12  1 20  3 21  1 22 10 23 14
  6  1  3 24  2  8  1  4  7  2  1 25 13 26  9  1 27  3 28  1 15]
 Output: [ 4  7  2  1 16 10  5 11 17  1 18  8  3 19 12  1 20  3 21  1 22 10 23 14
  6  1  3 24  2  8  1  4  7  2  1 25 13 26  9  1 27  3 28  1 15  0  0  0
  0  0  0  0  0  0]


Sequence 2 in x
 Input: [18  9  1 22  3 24  2  1 29  1 23  9  1 16 10  5 11 17  1  6  4 10 27  9
  1  3 20  1 25  2 21  5 11  3 28  8 13 14  7  9  1 19  3 12  1 13  1 14
  8  5 26  2  1 15]
 Output: [18  9  1 22  3 24  2  1 29  1 23  9  1 16 10  5 11 17  1  6  4 10 27  9
  1  3 20  1 25  2 21  5 11  3 28  8 13 14  7  9  1 19  3 12  1 13  1 14
  8  5 26  2  1 15]


Sequence 3 in x
 Input: [ 4  7  5  6  1  5  6  1 13  1  6  7  3  8  4  1  6  2 12  4  2 12 11  2
  1 15]
 Output: [ 4  7  5  6  1  5  6  1 13  1  6  7  3  8  4  1  6  2 12  4  2 12 11  2
  1 15  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0]


In [17]:
#preprocess pipeline

def preprocess(x,y):
    preprocess_x,x_tk = tokenizer(x)
    preprocess_y,y_tk = tokenizer(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)
    print('Original Y: ', preprocess_y.shape)

    #keras's sparse_categorical_crossentropy function requires the labels to be in 3D
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape,1)
    print('Reshaped Y: ',preprocess_y.shape)
    return preprocess_x,preprocess_y,x_tk,y_tk

In [18]:
preproc_code_sentences, preproc_plaintext_sentences, code_tokenizer, plaintext_tokenizer = preprocess(codes, plain_text)



Original Y:  (10001, 101)
Reshaped Y:  (10001, 101, 1)


In [19]:
print(preproc_code_sentences[0])
print(preproc_plaintext_sentences[0])

[ 5 14  3  1 10  2 13  3  1  2  4  1 14  3  6  1 10  3  8  4  5  1 10  2
 25  3 11  1 20  6  9  2  5  1 18  1 17  9  5  1  5 14  3  1 17  8  7  8
  7  8  1  2  4  1 13 15  1 10  3  8  4  5  1 10  2 25  3 11  1 19  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0]
[[ 5]
 [14]
 [ 3]
 [ 1]
 [10]
 [ 2]
 [13]
 [ 3]
 [ 1]
 [ 2]
 [ 4]
 [ 1]
 [14]
 [ 3]
 [ 6]
 [ 1]
 [10]
 [ 3]
 [ 8]
 [ 4]
 [ 5]
 [ 1]
 [10]
 [ 2]
 [25]
 [ 3]
 [11]
 [ 1]
 [20]
 [ 6]
 [ 9]
 [ 2]
 [ 5]
 [ 1]
 [18]
 [ 1]
 [17]
 [ 9]
 [ 5]
 [ 1]
 [ 5]
 [14]
 [ 3]
 [ 1]
 [17]
 [ 8]
 [ 7]
 [ 8]
 [ 7]
 [ 8]
 [ 1]
 [ 2]
 [ 4]
 [ 1]
 [13]
 [15]
 [ 1]
 [10]
 [ 3]
 [ 8]
 [ 4]
 [ 5]
 [ 1]
 [10]
 [ 2]
 [25]
 [ 3]
 [11]
 [ 1]
 [19]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]]


In [27]:
preproc_code_sentences.shape

(10001, 101)

In [29]:
#reshaping input data for RNN

tmp_x = pad(preproc_code_sentences, preproc_plaintext_sentences.shape[1])
tmp_x = tmp_x.reshape((-1,preproc_plaintext_sentences.shape[-2],1))

In [21]:
#model creation

from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

'''TimeDistributed: in Keras, when you build a sequential model, 
the last dimension is often related to a time dimension. Remember that RNN takes an input 
at one time step. This wrapper allows you to apply a layer to every temporal slice of an input,
 i.e., the TimeDistributed() method applies the same dense layer to every time step during 
 GRU/LSTM cell unrolling to make the dense nodes of the layer identical at each time step.'''

 

'TimeDistributed: in Keras, when you build a sequential model, \nthe last dimension is often related to a time dimension. Remember that RNN takes an input \nat one time step. This wrapper allows you to apply a layer to every temporal slice of an input,\n i.e., the TimeDistributed() method applies the same dense layer to every time step during \n GRU/LSTM cell unrolling to make the dense nodes of the layer identical at each time step.'

In [22]:
def simple_model(input_shape,output_sequence_length,code_vocab_size,plaintext_vocab_size):

    learning_rate = 1e-3

    input_seq = Input(input_shape[1:])
    rnn = GRU(64, return_sequences=True)(input_seq)
    logits = TimeDistributed(Dense(plaintext_vocab_size))(rnn)

    model = Model(input_seq, Activation('softmax')(logits))
    model.compile(loss=sparse_categorical_crossentropy, 
                    optimizer=Adam(learning_rate=learning_rate),
                    metrics = ['accuracy'])
    
    return model

In [30]:
simple_rnn_model = simple_model(
    tmp_x.shape,
    preproc_plaintext_sentences.shape[1],
    len(code_tokenizer.word_index)+1,
    len(plaintext_tokenizer.word_index)+1
)

2026-02-02 08:56:37.332218: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2026-02-02 08:56:37.332415: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2026-02-02 08:56:37.332489: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.92 GB
2026-02-02 08:56:37.332604: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2026-02-02 08:56:37.333114: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [31]:
simple_rnn_model.fit(tmp_x,preproc_plaintext_sentences,batch_size=64,epochs=6,validation_split=0.2)



Epoch 1/6


2026-02-02 08:57:55.436731: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 84ms/step - accuracy: 0.4648 - loss: 2.0239 - val_accuracy: 0.5727 - val_loss: 1.3698
Epoch 2/6
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 72ms/step - accuracy: 0.6612 - loss: 1.1419 - val_accuracy: 0.7149 - val_loss: 0.9621
Epoch 3/6
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 72ms/step - accuracy: 0.7639 - loss: 0.8282 - val_accuracy: 0.8186 - val_loss: 0.7179
Epoch 4/6
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 71ms/step - accuracy: 0.8469 - loss: 0.6228 - val_accuracy: 0.8732 - val_loss: 0.5402
Epoch 5/6
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 72ms/step - accuracy: 0.8933 - loss: 0.4676 - val_accuracy: 0.9143 - val_loss: 0.4068
Epoch 6/6
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 71ms/step - accuracy: 0.9254 - loss: 0.3532 - val_accuracy: 0.9340 - val_loss: 0.3109


<keras.src.callbacks.history.History at 0x34a6b7440>

In [33]:
#test the model

def logits_to_text(logits,tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits,1) if index_to_words[prediction]!= '<PAD>'])





In [None]:
for i in range(5):
    print('Original Sentence - {}'.format(plain_text[i]))
    rnn_output = simple_rnn_model.predict(tmp_x[i:i+1])[0] #input padded and reshaped version
    print('Predicted Sentence - {}'.format(logits_to_text(rnn_output,
                plaintext_tokenizer).upper()))
    print('*****************************************************************************')

Original Sentence - THE LIME IS HER LEAST LIKED FRUIT , BUT THE BANANA IS MY LEAST LIKED .
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
Predicted Sentence - C H E   L I M E   I S   M E R   L E A S T   L I F E D   F R U I T   ,   B U T   T H E   B A N A N A   I S   M O   L E A S T   L I F E D   .
*****************************************************************************
Original Sentence - HE SAW A OLD YELLOW TRUCK .
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Predicted Sentence - C E   S A P   A   D U L   M E L L D C   T R U C W   .
*****************************************************************************
Original Sentence - INDIA IS RAINY DURING JUNE , AND IT IS SOMETIMES WARM IN NOVEMBER .
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Predicted Sentence - T H O I A   I S   T U I N Y   D U R I N G   J U N E   ,   A N D   I T   I S   S O M E T I M E S   J A R Y   I N   N O V E M B E R   .
**********

In [37]:
#Test on new data

x = ['N QNPJ LWFUJX FSI RFSLTJX .']
# Original Sentence - "I LIKE GRAPES AND MANGOES ."
print("Original Sentence - I LIKE GRAPES AND MANGOES .")

x = code_tokenizer.texts_to_sequences(x)
x = pad(x,preproc_plaintext_sentences.shape[1])
x = x.reshape((-1,preproc_plaintext_sentences.shape[-2], 1))

print("Predicted Sentence - ", logits_to_text(simple_rnn_model.predict(x[:1])[0],
            plaintext_tokenizer).upper())

Original Sentence - I LIKE GRAPES AND MANGOES .
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Predicted Sentence -  T E C E W E   M R A C E S   N N D   D A N G Y E S   ,
