The task is to construct a recurrent neural network that decrypts text encrypted with the [Caesar cipher](https://en.wikipedia.org/wiki/Caesar_cipher).

In [None]:
import math
from random import shuffle
import time

import torch

In [None]:
# Number of sentences per mini-batch handed to the DataLoaders.
BATCH_SIZE = 10
# Length, in characters, of every text chunk read from the source file.
STRING_SIZE = 60
# Number of full passes over the training set.
NUM_EPOCHS = 20
# Learning rate passed to the SGD optimizer.
LEARNING_RATE = 0.05
# Text file used to build both the alphabet and the datasets.
FILE_NAME = "/kaggle/input/frank-herbert-dune/Dune.txt"
# Torch device on which the tensors and the model are placed.
DEVICE = "cpu"
# Shift added to every character index to obtain the encrypted index.
CAESAR_OFFSET = 2

Because the Caesar cipher operates on an alphabet, we first need to build such an alphabet from the symbols that occur in the text.

As the source for both the alphabet and the training dataset, we use the text of Frank Herbert's novel *Dune*, translated into Russian.

In [None]:
class Alphabet(object):
    """Ordered set of the distinct characters found in a text.

    The characters are kept in ``letters`` in order of first appearance,
    so a character's position in that string is its numeric code.
    """

    def __init__(self):
        # Starts empty; filled by ``load_from_file``.
        self.letters = ""

    def __len__(self):
        """Return the number of characters in the alphabet."""
        return len(self.letters)

    def __contains__(self, item):
        """Return True when *item* occurs in the alphabet string."""
        return item in self.letters

    def __getitem__(self, item):
        """Translate between characters and their indices.

        * ``str`` key -> index of the character, or ``-1`` if it is absent.
        * ``int`` key -> character at that index, wrapped modulo the
          alphabet length.
        * any other key type -> ``None``.
        """
        if isinstance(item, str):
            return self.letters.find(item)
        if isinstance(item, int):
            return self.letters[item % len(self.letters)]
        return None

    def __str__(self):
        spaced = " ".join(self.letters)
        return f"Alphabet is:\n {spaced}\n {len(self)} chars"

    def load_from_file(self, file_path):
        """Add every not-yet-known character of the file to the alphabet.

        The file is consumed in pieces of ``STRING_SIZE`` characters.
        Returns ``self`` so the call can be chained after the constructor.
        """
        with open(file_path) as source:
            # iter() with a sentinel stops at the first empty read (EOF).
            for chunk in iter(lambda: source.read(STRING_SIZE), ""):
                for symbol in chunk:
                    if symbol in self.letters:
                        continue
                    self.letters += symbol
        return self


# Build the alphabet from every distinct character of the source text
# and print it together with its size.
ALPHABET = Alphabet().load_from_file(FILE_NAME)
print(ALPHABET)

Create the training, test and validation datasets.

In [None]:
class SentenceDataset(torch.utils.data.Dataset):
    """Dataset of (encrypted, plain) index sequences for the Caesar cipher.

    ``y`` holds the alphabet index of every character of every line, and
    ``x`` holds the same indices shifted by the cipher offset. The shifted
    indices are NOT wrapped around the alphabet, so they can reach
    ``len(alphabet) - 1 + offset``.

    Args:
        raw_data: sequence of equal-length strings (one sample per string).
        alphabet: object mapping a character to its integer index via
            ``alphabet[ch]``.
        offset: cipher shift; defaults to the module-level ``CAESAR_OFFSET``.
        device: torch device for the tensors; defaults to the module-level
            ``DEVICE``.
    """

    def __init__(self, raw_data, alphabet, offset=None, device=None):
        super().__init__()
        # Resolve defaults lazily so the module constants are only needed
        # when the caller does not supply explicit values.
        if offset is None:
            offset = CAESAR_OFFSET
        if device is None:
            device = DEVICE
        self._len = len(raw_data)
        self.y = torch.tensor(
            [[alphabet[ch] for ch in line] for line in raw_data]
        ).to(device)
        # Vectorised shift: same values as adding the offset element by
        # element, without looping over tensor elements in Python.
        self.x = self.y + offset

    def __len__(self):
        """Return the number of samples."""
        return self._len

    def __getitem__(self, idx):
        """Return the ``(encrypted, plain)`` index tensors of sample *idx*."""
        return self.x[idx], self.y[idx]

In [None]:
def get_text_array(file_path, step):
    """Split the text of a file into consecutive chunks of *step* characters.

    Only full-length chunks are returned: a shorter trailing chunk is
    dropped so that every element has exactly *step* characters and the
    result can be turned into a rectangular tensor.

    Args:
        file_path: path of the text file to read.
        step: chunk length in characters; must be positive.

    Returns:
        List of strings, each exactly *step* characters long. Empty when
        the file holds fewer than *step* characters.

    Raises:
        ValueError: if *step* is not positive.
    """
    if step <= 0:
        raise ValueError(f"step must be positive, got {step}")
    text_array = []
    with open(file_path) as file:
        while True:
            # Honour the caller-supplied chunk length.
            text = file.read(step)
            if not text:
                break
            text_array.append(text)
    # Only the last chunk can be short; discard it in that case. The guard
    # also keeps an empty file from raising IndexError.
    if text_array and len(text_array[-1]) < step:
        text_array.pop()
    return text_array

In [None]:
# Read the text as fixed-length chunks and shuffle them before splitting.
# NOTE: the shuffle is unseeded, so the split differs between runs.
raw_data = get_text_array(FILE_NAME, STRING_SIZE)
shuffle(raw_data)

# First 10% of the chunks -> validation set.
_10_percent = math.ceil(len(raw_data) * 0.1)
val_data = raw_data[:_10_percent]
raw_data = raw_data[_10_percent:]

# 20% of the remainder -> test set, the rest -> training set.
_20_percent = math.ceil(len(raw_data) * 0.2)
test_data = raw_data[:_20_percent]
train_data = raw_data[_20_percent:]

# Validation tensors: plain indices and their shifted (encrypted) version.
# The shift is applied to the whole tensor at once rather than by looping
# over its elements in Python; the resulting values are the same.
Y_val = torch.tensor([[ALPHABET[ch] for ch in line] for line in val_data])
X_val = Y_val + CAESAR_OFFSET

# Both loaders drop the last incomplete batch, so every batch has exactly
# BATCH_SIZE samples.
train_dl = torch.utils.data.DataLoader(
    SentenceDataset(
        train_data, ALPHABET
    ),
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True
)
test_dl = torch.utils.data.DataLoader(
    SentenceDataset(
        test_data, ALPHABET
    ),
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True
)

Our network is a fairly simple model: an embedding layer, followed by an RNN layer and an output linear layer.

Note that both the number of input indices accepted by the embedding layer and the number of outputs of the linear layer must equal the alphabet size plus the offset by which we shift the alphabet to encrypt the text.

In [None]:
class RNNModel(torch.nn.Module):
    """Embedding -> RNN -> linear model that scores every symbol position.

    Args:
        vocab_size: number of distinct input indices and of output classes.
            Defaults to ``len(ALPHABET) + CAESAR_OFFSET`` because encrypted
            indices are shifted without being wrapped around the alphabet.
        embed_dim: size of the embedding vectors (default 32).
        hidden_dim: size of the RNN hidden state (default 128).
    """

    def __init__(self, vocab_size=None, embed_dim=32, hidden_dim=128):
        super().__init__()
        # Resolve the default lazily so the module constants are only
        # needed when no explicit vocabulary size is given.
        if vocab_size is None:
            vocab_size = len(ALPHABET) + CAESAR_OFFSET
        self.embed = torch.nn.Embedding(vocab_size, embed_dim)
        self.rnn = torch.nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.linear = torch.nn.Linear(hidden_dim, vocab_size)

    def forward(self, sentence, state=None):
        """Return per-position class scores for a batch of index sequences.

        Args:
            sentence: integer tensor of symbol indices; the RNN is built
                with ``batch_first=True``, so a batch is laid out as
                (batch, sequence).
            state: optional initial hidden state handed to the RNN. When
                ``None`` the RNN uses its default (zero) initial state.

        Returns:
            Tensor of scores with one row of ``vocab_size`` values per
            input position.
        """
        embed = self.embed(sentence)
        # Pass the caller's initial state through to the RNN.
        o, _ = self.rnn(embed, state)
        return self.linear(o)

In [None]:
# Instantiate the network on the configured device.
model = RNNModel().to(DEVICE)
# Cross-entropy over the per-symbol class scores.
loss = torch.nn.CrossEntropyLoss().to(DEVICE)
# Plain SGD over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

In [None]:
def _run_epoch(data_loader, training):
    """Run one pass over *data_loader*; return (summed loss, mean accuracy).

    Args:
        data_loader: iterable yielding ``(x, y)`` batches of index tensors.
        training: when True the model is put in train mode and the
            optimizer is stepped after every batch; when False the model is
            put in eval mode and gradient tracking is disabled.

    Returns:
        Tuple ``(loss_sum, mean_acc)``: the sum of the batch losses and the
        batch-averaged per-symbol accuracy (0.0 when the loader is empty).
    """
    num_classes = len(ALPHABET) + CAESAR_OFFSET
    loss_sum, acc_sum, num_batches = .0, .0, 0
    model.train(training)
    # No autograd graph is needed when the weights are not being updated.
    with torch.set_grad_enabled(training):
        for x_in, y_in in data_loader:
            # Flatten targets so each symbol is one classification sample.
            targets = y_in.reshape(-1)
            if training:
                optimizer.zero_grad()
            # Call the module itself (not .forward) so hooks are honoured.
            out = model(x_in).reshape(-1, num_classes)
            l = loss(out, targets)
            if training:
                l.backward()
                optimizer.step()
            loss_sum += l.item()
            hits = out.argmax(dim=1) == targets
            acc_sum += hits.sum().item() / hits.shape[0]
            num_batches += 1
    # max(..., 1) avoids a ZeroDivisionError when the loader is empty.
    return loss_sum, acc_sum / max(num_batches, 1)


for epoch in range(NUM_EPOCHS):
    start_epoch_time = time.time()
    # The reported loss is the SUM of the batch losses over the epoch,
    # while the accuracy is averaged over the batches.
    train_loss, train_acc = _run_epoch(train_dl, training=True)
    print(
        f"Epoch: {epoch}, loss: {train_loss:.4f}, acc: "
        f"{train_acc:.4f}",
        end=" | "
    )
    test_loss, test_acc = _run_epoch(test_dl, training=False)
    print(
        f"test loss: {test_loss:.4f}, test acc: {test_acc:.4f} | "
        f"{time.time() - start_epoch_time:.2f} sec."
    )

In [None]:
# Index of the validation sentence to display, clamped so a validation set
# with fewer than 257 sentences does not raise an IndexError.
idx = min(256, len(Y_val) - 1)
model.eval()
# Pure inference: skip building the autograd graph for the whole set.
with torch.no_grad():
    val_results = model(X_val.to(DEVICE)).argmax(dim=2)
# Per-symbol accuracy over every position of every validation sentence.
val_acc = (val_results == Y_val.to(DEVICE)).flatten()
val_acc = (val_acc.sum() / val_acc.shape[0]).item()
# Map predicted / true indices back to characters for the chosen sentence.
out_sentence = "".join([ALPHABET[i.item()] for i in val_results[idx]])
true_sentence = "".join([ALPHABET[i.item()] for i in Y_val[idx]])
print(f"Validation accuracy is : {val_acc:.4f}")
print("-" * 20)
print(f"Validation sentence is: \"{out_sentence}\"")
print("-" * 20)
print(f"True sentence is:       \"{true_sentence}\"")

Because the Caesar cipher is a fairly primitive way to encrypt text, our model learns quickly and shows excellent accuracy on the test data.

In [None]:
sentence = """Барбадос – островное государство в восточной части Карибского 
моря, входящее в Британское Содружество наций. Столицу, город-порт Бриджтаун, 
отличает колониальная архитектура. Одна из достопримечательностей – синагога, 
построенная в 1654 году."""
# Character -> index; a character missing from the alphabet maps to -1.
sentence_idx = [ALPHABET[i] for i in sentence]
# Encrypt by shifting every index by the cipher offset.
encrypted_sentence_idx = [i + CAESAR_OFFSET for i in sentence_idx]
# Integer lookups wrap around the alphabet, giving the cipher text to show.
encrypted_sentence = "".join([ALPHABET[i] for i in encrypted_sentence_idx])
model.eval()
# Pure inference: no autograd graph is needed.
with torch.no_grad():
    result = model(
        torch.tensor([encrypted_sentence_idx]).to(DEVICE)
    ).argmax(dim=2)
deencrypted_sentence = "".join([ALPHABET[i.item()] for i in result.flatten()])
print(f"Encrypted sentence is : {encrypted_sentence}")
print("-" * 20)
print(deencrypted_sentence)

However, if we take text containing characters that were not present in the training sample, we can get decryption errors.