# Get English Data

## Text preprocessing functions

In [None]:
# helper functions for extracting and preprocessing Project Gutenberg text
import requests
import re
import glob
import os
import pandas as pd

# function for text extraction
def extraction(raw):
    """Return the part of a Gutenberg file between its START and END markers.

    The returned slice begins at the first ``"*** START"`` marker (the marker
    text itself is included) and stops just before the first ``"*** END"``
    marker that follows it.

    Args:
        raw: Full text of the downloaded file.

    Returns:
        The marked section, or ``raw`` unchanged when the START marker is
        missing or no END marker follows it.
    """
    START = "*** START"
    END = "*** END"
    start = raw.find(START)
    if start == -1:
        return raw  # Fallback if the start tag is not found
    # Look for the end tag only after the start tag, so a stray "*** END"
    # earlier in the file cannot force the whole-file fallback.
    end = raw.find(END, start + len(START))
    if end == -1:
        return raw  # Fallback if no end tag follows the start tag
    return raw[start:end]

# strip the Gutenberg header and footer
def strip(text):
    """Remove the Project Gutenberg START/END marker lines and what lies outside them.

    Both the "OF THIS PROJECT GUTENBERG EBOOK" and the "OF THE PROJECT
    GUTENBERG EBOOK" wordings are recognised.  The two markers are handled
    independently, so text whose END marker was already cut off still has its
    START line removed.

    Args:
        text: Book text, possibly still containing the marker lines.

    Returns:
        The text after the START marker and before the END marker (only the
        side(s) whose marker was found are trimmed).  When neither marker is
        present, ``text`` with leading/trailing whitespace removed.
    """
    start_pattern = r"\*{3} START OF (?:THIS|THE) PROJECT GUTENBERG EBOOK .* \*{3}"
    end_pattern = r"\*{3} END OF (?:THIS|THE) PROJECT GUTENBERG EBOOK .* \*{3}"
    start = re.search(start_pattern, text)
    body_begin = start.end() if start else 0
    # Only an END marker located after the START marker can close the body.
    end = re.compile(end_pattern).search(text, body_begin)
    if start is None and end is None:
        # Removes only leading/trailing whitespaces, preserves all internal whitespace
        return text.strip()
    body_end = end.start() if end else len(text)
    return text[body_begin:body_end]

# preprocess the text data, returning list of characters for char mode, else list of words
def preprocess(text, mode='char'):
    """Lower-case and tokenise ``text`` into characters or words.

    Every character other than ``a-z``, ``0-9``, whitespace and ``.,;!?'"-``
    is first replaced by a space.

    Args:
        text: Raw text to tokenise.
        mode: ``'char'`` returns a list of single characters (letters, spaces
            and the punctuation above; digits are dropped).  ``'word'``
            returns the list of ``\\w+`` tokens.

    Returns:
        A list of characters or a list of words, depending on ``mode``.

    Raises:
        ValueError: If ``mode`` is neither ``'char'`` nor ``'word'``.
    """
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s.,;!?\'"-]', ' ', text)
    if mode == 'char':
        # Newlines and tabs are not in the kept-character set below, so they
        # would simply vanish and glue the last word of a line to the first
        # word of the next.  Map each whitespace character to a plain space.
        text = re.sub(r'\s', ' ', text)
        return [c for c in text if c.isalpha() or c in ' .,;!?\'"-']
    elif mode == 'word':
        return re.findall(r'\b\w+\b', text)
    else:
        raise ValueError("mode must be 'char' or 'word'")

## Get data and preprocess

In [None]:
# download and preprocess text data from Project Gutenberg
import requests
import re
import glob
import os
import pandas as pd

# (Project Gutenberg ebook id, label) pairs for the books that make up the corpus.
# The id is used to build the download URL; the label is only used for the local
# file name, the log messages and the "name" field of each corpus entry.
# NOTE(review): some ids may not correspond to the labelled title on gutenberg.org
# (e.g. 120, 160, 166, 219, 1013, 1952 look suspicious) — TODO confirm against the
# site; the downloaded text is whatever the id points to, regardless of the label.
gutenberg_top20 = [
    (1342, "Pride_and_Prejudice"),
    (84, "Frankenstein"),
    (1661, "Adventures_of_Sherlock_Holmes"),
    (11, "Alices_Adventures_in_Wonderland"),
    (98, "A_Tale_of_Two_Cities"),
    (2701, "Moby_Dick"),
    (76, "Adventures_of_Huckleberry_Finn"),
    (2542, "A_Dolls_House"),
    (5200, "Metamorphosis"),
    (120, "Peter_Pan"),
    (74, "The_Adventures_of_Tom_Sawyer"),
    (1400, "Great_Expectations"),
    (160, "The_Yellow_Wallpaper"),
    (23, "Narrative_of_Frederick_Douglass"),
    (219, "The_Picture_of_Dorian_Gray"),
    (4300, "Ulysses"),
    (46, "A_Christmas_Carol"),
    (166, "The_Iliad"),
    (1013, "Anna_Karenina"),
    (1952, "War_and_Peace"),
]

# function for text extraction
def extraction(raw):
    """Return the part of a Gutenberg file between its START and END markers.

    The returned slice begins at the first ``"*** START"`` marker (the marker
    text itself is included) and stops just before the first ``"*** END"``
    marker that follows it.

    Args:
        raw: Full text of the downloaded file.

    Returns:
        The marked section, or ``raw`` unchanged when the START marker is
        missing or no END marker follows it.
    """
    START = "*** START"
    END = "*** END"
    start = raw.find(START)
    if start == -1:
        return raw  # Fallback if the start tag is not found
    # Look for the end tag only after the start tag, so a stray "*** END"
    # earlier in the file cannot force the whole-file fallback.
    end = raw.find(END, start + len(START))
    if end == -1:
        return raw  # Fallback if no end tag follows the start tag
    return raw[start:end]

# strip the Gutenberg header and footer
def strip(text):
    """Remove the Project Gutenberg START/END marker lines and what lies outside them.

    Both the "OF THIS PROJECT GUTENBERG EBOOK" and the "OF THE PROJECT
    GUTENBERG EBOOK" wordings are recognised.  The two markers are handled
    independently, so text whose END marker was already cut off still has its
    START line removed.

    Args:
        text: Book text, possibly still containing the marker lines.

    Returns:
        The text after the START marker and before the END marker (only the
        side(s) whose marker was found are trimmed).  When neither marker is
        present, ``text`` with leading/trailing whitespace removed.
    """
    start_pattern = r"\*{3} START OF (?:THIS|THE) PROJECT GUTENBERG EBOOK .* \*{3}"
    end_pattern = r"\*{3} END OF (?:THIS|THE) PROJECT GUTENBERG EBOOK .* \*{3}"
    start = re.search(start_pattern, text)
    body_begin = start.end() if start else 0
    # Only an END marker located after the START marker can close the body.
    end = re.compile(end_pattern).search(text, body_begin)
    if start is None and end is None:
        # Removes only leading/trailing whitespaces, preserves all internal whitespace
        return text.strip()
    body_end = end.start() if end else len(text)
    return text[body_begin:body_end]

# preprocess the text data, returning list of characters for char mode, else list of words
def preprocess(text, mode='char'):
    """Lower-case and tokenise ``text`` into characters or words.

    Every character other than ``a-z``, ``0-9``, whitespace and ``.,;!?'"-``
    is first replaced by a space.

    Args:
        text: Raw text to tokenise.
        mode: ``'char'`` returns a list of single characters (letters, spaces
            and the punctuation above; digits are dropped).  ``'word'``
            returns the list of ``\\w+`` tokens.

    Returns:
        A list of characters or a list of words, depending on ``mode``.

    Raises:
        ValueError: If ``mode`` is neither ``'char'`` nor ``'word'``.
    """
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s.,;!?\'"-]', ' ', text)
    if mode == 'char':
        # Newlines and tabs are not in the kept-character set below, so they
        # would simply vanish and glue the last word of a line to the first
        # word of the next.  Map each whitespace character to a plain space.
        text = re.sub(r'\s', ' ', text)
        return [c for c in text if c.isalpha() or c in ' .,;!?\'"-']
    elif mode == 'word':
        return re.findall(r'\b\w+\b', text)
    else:
        raise ValueError("mode must be 'char' or 'word'")

# --- Data loading ----- #
# Downloads every book in gutenberg_top20 into books_dir (once), then builds
# `corpus`: one {"id", "name", "chars"} dict per book that could be read.
books_dir = "gutenberg_top20"
os.makedirs(books_dir, exist_ok=True)
corpus = []

# download texts (books already present on disk are not fetched again)
for book_id, book_name in gutenberg_top20:
    fname = os.path.join(books_dir, f"{book_id}_{book_name}.txt")
    if os.path.exists(fname):
        continue
    url = f"https://www.gutenberg.org/files/{book_id}/{book_id}-0.txt"
    try:
        # Without a timeout a stalled connection would hang the cell forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as e:
        print(f"Error downloading {book_name} ({book_id}): {e}")
        continue
    if response.status_code != 200:
        print(f"Download failed for {book_name} ({book_id}) status {response.status_code}")
        continue
    # Decode explicitly instead of relying on the charset guessed from the
    # HTTP headers (assumes the "-0.txt" variant is UTF-8 — TODO confirm).
    response.encoding = "utf-8"
    try:
        with open(fname, "w", encoding="utf-8") as f:
            f.write(response.text)
    except OSError as e:
        print(f"Error downloading {book_name} ({book_id}): {e}")

# extraction and preprocessing
for book_id, book_name in gutenberg_top20:
    fname = os.path.join(books_dir, f"{book_id}_{book_name}.txt")
    try:
        with open(fname, encoding="utf-8") as f:
            raw = f.read()
    except (OSError, UnicodeDecodeError) as e:
        # A failed download leaves no file behind; report it and move on.
        print(f"Error with {fname}: {e}")
        continue
    content = extraction(raw)
    stripped = strip(content)
    chars = preprocess(stripped, mode='char')
    corpus.append({"id": book_id, "name": book_name, "chars": chars})


# Probability Distributions

In [None]:
# calculate the stationary distribution and transition matrix for the English language
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# get the data from the corpus built in the data-loading cell (text of the Project Gutenberg books)
data = corpus
# data is a list of dicts from corpus build
if isinstance(data, list) and data and isinstance(data[0], dict) and "chars" in data[0]:
    # concatenate all character lists into one big string
    text = ''.join(''.join(book['chars']) for book in data)
elif isinstance(data, list):
    # List of strings (full book texts) — not the current corpus structure, but useful elsewhere.
    # An empty list also lands here and yields an empty text instead of an IndexError.
    text = ''.join(data)
elif isinstance(data, dict):
    # Dict of {'id_or_name': {'chars': [...]}} mappings
    text = ''.join(''.join(v['chars']) for v in data.values())
else:
    # Already a string or unknown
    text = str(data)

chars = list(text)

# get the states and counts
with open('/datasets/t1cw-data/symbols.txt', 'r') as f:
    states = [line.rstrip('\n') for line in f if line.rstrip('\n') != ''] # create a list of unique states
if not states:
    raise ValueError("symbols.txt contains no symbols")
allowed = set(states) # set membership keeps the filter below linear in the text length
sequence = [char for char in text if char in allowed]
# counts[beta][alpha] = number of times beta directly follows alpha
counts = {beta: defaultdict(int) for beta in states}
for alpha, beta in zip(sequence, sequence[1:]):
    counts[beta][alpha] += 1

# initialise probabilities as matrix
n = len(states)
epsilon = 0.5           # additive smoothing to avoid zeros

# count_matrix[j, i]: occurrences of states[j] right after states[i]
count_matrix = np.array(
    [[counts[beta][alpha] for alpha in states] for beta in states], dtype=float
).reshape(n, n)
# psi[j, i] = P(states[j] | states[i]): each conditional distribution is over the
# NEXT state, so the normaliser is the column total (sum over j for fixed i).
# Normalising rows instead would make the eigenvalue-1 right eigenvector
# constant and the "stationary distribution" below uniform.
column_totals = count_matrix.sum(axis=0, keepdims=True) + n * epsilon
psi = (count_matrix + epsilon) / column_totals

# calculate the stationary distribution: the right eigenvector with psi @ pi = pi
evals, evecs = np.linalg.eig(psi) # find the eigenvectors and eigenvalues of the transition matrix
# take the eigenvalue closest to 1 so that exactly one column is selected
unit_idx = np.argmin(np.abs(evals - 1))
pi = np.real(evecs[:, unit_idx])
pi = pi / pi.sum()  # normalize the stationary distribution

# print the stationary distribution
stationary_df = pd.DataFrame({'State': states, 'Stationary probability': pi})
print(stationary_df.to_string(index=False)) # print as a readable table

# plot the stationary distribution
plt.figure(figsize=(12, 2))
plt.imshow(pi[np.newaxis, :], cmap='hot', aspect='auto')
plt.colorbar(label='Stationary Probability')
plt.xticks(np.arange(len(states)), states, rotation=90)
plt.yticks([0], ['Stationary'])
plt.title('Q.5(a) Stationary Distribution Heatmap')
plt.tight_layout()
plt.show()

# print the transition matrix as a readable table (rows: next state, columns: previous state)
transition_df = pd.DataFrame(psi, index=states, columns=states) # create a dataframe from the transition matrix
print(transition_df.round(4).to_string()) # print the transition table with 4 decimals

# plot the transition matrix
plt.figure(figsize=(10, 8))
plt.imshow(psi, cmap='hot', aspect='auto') # plot the transition matrix as a heatmap
plt.colorbar(label='Transition Probability')
plt.xticks(np.arange(len(states)), states, rotation=90) # set the x-axis labels to the states
plt.yticks(np.arange(len(states)), states) # set the y-axis labels to the states
plt.title('Q.5(a) English Language Transition Matrix Heatmap (Hot Colormap)') # set the title of the plot
# imshow puts the first index (beta, the next state) on the y-axis and the
# second index (alpha, the previous state) on the x-axis
plt.xlabel('Alpha State') # set the x-axis label
plt.ylabel('Beta State') # set the y-axis label
plt.tight_layout() # adjust the layout of the plot
plt.show() # show the plot

# Metropolis-Hastings (MH)

In [None]:
# Metropolis-Hastings Implementation

import numpy as np
import random
import sys
import matplotlib.pyplot as plt

# load the permitted symbols from symbols.txt: one per line, blank lines skipped
with open('/datasets/t1cw-data/symbols.txt', 'r') as symbols_file:
    stripped_lines = (line.rstrip('\n') for line in symbols_file)
    states = [symbol for symbol in stripped_lines if symbol != '']
# two-way lookup between a symbol and its position in `states`
state_to_idx = dict(zip(states, range(len(states))))
idx_to_state = dict(enumerate(states))

# load the ciphertext from message.txt as one string
with open('/datasets/t1cw-data/message.txt', 'r') as message_file:
    encrypted_text = message_file.read()
# keep only permitted symbols, then translate them to their indices
encrypted_sequence = [symbol for symbol in encrypted_text if symbol in state_to_idx]
encrypted_idx_seq = [state_to_idx[symbol] for symbol in encrypted_sequence]


def log_likelihood(decoded_sequence, psi, state_to_idx):
    """Score a decoded symbol sequence under the transition matrix ``psi``.

    Each adjacent pair (previous, current) contributes ``log(psi[j, i])`` with
    ``i``/``j`` the indices of the previous/current symbol in ``state_to_idx``.
    A pair whose probability is not positive contributes a fixed -15 instead.
    Sequences with fewer than two symbols score 0.0.
    """
    def pair_score(prev_symbol, curr_symbol):
        col = state_to_idx[prev_symbol]
        row = state_to_idx[curr_symbol]
        prob = psi[row, col]
        # fixed penalty rather than log(0) for transitions with no probability mass
        return np.log(prob) if prob > 0 else -15

    pairs = zip(decoded_sequence[:-1], decoded_sequence[1:])
    return sum((pair_score(prev, curr) for prev, curr in pairs), 0.0)

def decrypt_idx_sequence(seq_idx, best_perm, idx_to_state):
    """Decode an index sequence: map each index through ``best_perm``, then to its symbol."""
    decoded = []
    for cipher_idx in seq_idx:
        plain_idx = best_perm[cipher_idx]
        decoded.append(idx_to_state[plain_idx])
    return decoded

# Run several independent Metropolis-Hastings chains over symbol permutations.
# Each chain starts from a random permutation, proposes swapping two entries
# per step, and keeps the best-scoring permutation it has visited.
chains = 10
steps = 50000
burn_in = 400 #set burn-in based on where log graph tapers when steps = 1000

all_best_logs = []
all_best_deciphers = []
best_overall_log = float('-inf')
best_overall_perm = None

n = len(states)
for chain in range(chains):
    # Random initialization
    current_perm_idx = random.sample(range(n), n)
    # Decode and score the current state once; afterwards both are only
    # refreshed when a proposal is accepted (they cannot change otherwise).
    decoded_current = decrypt_idx_sequence(encrypted_idx_seq, current_perm_idx, idx_to_state)
    log_current = log_likelihood(decoded_current, psi, state_to_idx)
    log_trace = []
    best_chain_perm, best_chain_log = current_perm_idx.copy(), log_current

    for step in range(steps):
        # Record / report the state the chain is in at the start of this step
        log_trace.append(log_current)
        # PRINT decrypted first 60 chars every 100 steps
        if step % 100 == 0:
            print(f"Chain {chain+1}, Step {step}: {''.join(decoded_current)[:60]}")

        # Proposal: swap two distinct positions of the permutation
        i, j = random.sample(range(n), 2)
        proposed_perm = current_perm_idx.copy()
        proposed_perm[i], proposed_perm[j] = proposed_perm[j], proposed_perm[i]
        decoded_proposed = decrypt_idx_sequence(encrypted_idx_seq, proposed_perm, idx_to_state)
        log_proposed = log_likelihood(decoded_proposed, psi, state_to_idx)

        # Metropolis-Hastings acceptance (the swap proposal is symmetric).
        # Improvements are always accepted, which also keeps exp() from overflowing.
        delta = log_proposed - log_current
        if delta >= 0 or random.random() < np.exp(delta):
            current_perm_idx = proposed_perm
            decoded_current = decoded_proposed
            log_current = log_proposed
            # Save best perm for this chain. The score is updated together
            # with the permutation, so the stored pair always belongs together.
            if log_current > best_chain_log:
                best_chain_log = log_current
                best_chain_perm = current_perm_idx.copy()

    # ===== AFTER ALL STEPS FOR THIS CHAIN: =====
    # burn-in-trimmed trace; not used below, the plot shows the full trace
    log_trace_burned = log_trace[burn_in:]

    # After each chain: record and plot
    deciphered = decrypt_idx_sequence(encrypted_idx_seq, best_chain_perm, idx_to_state)
    all_best_logs.append(best_chain_log)
    all_best_deciphers.append(''.join(deciphered))
    if best_chain_log > best_overall_log:
        best_overall_log = best_chain_log
        best_overall_perm = best_chain_perm.copy()
    # Now plot trace/output for this chain
    plt.plot(log_trace)
    plt.xlabel('Iteration')
    plt.ylabel('Log-Likelihood')
    plt.title(f'MH Log-Likelihood Trace Plot (Chain {chain+1})')
    plt.show()

# After all chains: print/save best of best
if best_overall_perm is not None:
    final_deciphered = decrypt_idx_sequence(encrypted_idx_seq, best_overall_perm, idx_to_state)
    print('Final best decipher:', ''.join(final_deciphered[:100]))
    # Written once, after every chain has finished, so the file holds the best
    # decipher over all chains rather than whatever the last chain produced.
    with open('/datasets/t1cw-data/deciphered.txt', 'w') as out_file:
        out_file.write(''.join(final_deciphered))


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cb182644-878e-48cb-992b-68a78a5afe3d' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>