In [86]:
import numpy as np
import pandas as pd
import random
alphabet = 'abcdefghijklmnopqrstuvwxyz'

In [87]:
# returns the string from a text file, all in lower case

def turn_into_string(file_name):
    """Read a text file and return its contents as a single lower-case line.

    Every newline character is replaced by one space, so line breaks in the
    file become word separators in the returned string.

    Parameters
    ----------
    file_name : path of the text file to read (opened in text mode).

    Returns
    -------
    str : the file contents, newlines turned into spaces, lower-cased.
    """
    with open(file_name, 'r') as handle:
        raw_text = handle.read()
    # Splitting on '\n' and re-joining with ' ' swaps each newline for a space.
    return ' '.join(raw_text.split('\n')).lower()

# Load the text stored in beemovie.txt as one lower-cased string; later cells use it as sample plaintext.
bee1 = turn_into_string('../texts/beemovie.txt')
# Sanity check: the loaded text is expected to start with the word "according".
print(bee1[:9] == 'according')

True


In [801]:
# returns a list of words from a string whose words are separated by spaces.

def turn_to_list(string):
    """Split a string on single spaces and return its words without edge punctuation.

    Leading/trailing '.', ',', '!' and '?' are stripped from every token.
    Empty tokens (produced by consecutive spaces) are dropped, and so are
    tokens that consist only of punctuation and are empty after stripping.
    Single-letter words such as 'a' or 'o' are kept.

    Parameters
    ----------
    string : the text to split.

    Returns
    -------
    list of str : the non-empty words, in their original order.
    """
    # Strip first, then filter: this way a token like '...' does not leak
    # through as an empty string.
    stripped = (token.strip('.,!?') for token in string.split(' '))
    return [word for word in stripped if word]

# Word list of the sample text.
bee1_list = turn_to_list(bee1)
# Sanity check: the second word of the sample text is expected to be "to".
print(bee1_list[1] == 'to')

True


In [460]:
# list of all the most common words in the english language

# Word list used for dictionary checks, read from the google-10000-english-no-swears.txt file.
common_words = turn_to_list(turn_into_string('../texts/google-10000-english-no-swears.txt'))
# The ciphertext to break. The trailing backslashes join the lines, so the
# resulting string contains no newline characters.
ciphertext = '''\
UXLIRNXSOPQAKJBJTXSVTJKRHXKKJKLIQQOEBLXRSNJKWQKQKYSAQKLRVQNLJKQD\
QRPLJRSEWIRSWQWJHQKTRKKQRWIXSODXNXJSNJTLIQBRNLLIXNXNUIRLXYSAQKLR\
VQLJAJTJKEJYKQRAQK\
'''
# The former `words_from_letters = {x: {}}` assignment was removed: `x` is not
# defined in this cell (NameError on a fresh kernel), and the name is later
# bound to the words_from_letters function.

# Sanity check: the third entry of the word list is expected to be "and".
print(common_words[2] == 'and')

True


In [453]:
# returns the natural log of the percentage of each bigram from a csv file
# eg 'th' count 100, divide by total number of bigrams for percentage, find natural log.

def read_bigrams(filename):
    """Load bigram counts from a file and return their log relative frequencies.

    The file is read as space-delimited, header-less rows of ``bigram count``.
    Each count is divided by the sum of all counts and the natural log of
    that share is stored under the lower-cased bigram.

    Parameters
    ----------
    filename : path of the space-delimited bigram count file.

    Returns
    -------
    dict : lower-cased bigram -> natural log of (count / total count).
    """
    # keep_default_na=False keeps a bigram such as 'NA' as text instead of NaN.
    table = pd.read_csv(filename, delimiter=' ', names=['bigrams','count'], keep_default_na=False)
    total = sum(table['count'])
    shares = table['count'] / total
    return {
        gram.lower(): np.log(share)
        for gram, share in zip(table['bigrams'], shares.to_numpy())
    }

# Log-frequency table for bigrams; the score function looks bigrams up in this dictionary.
bigrams = read_bigrams('../texts/bigrams.csv')
# The bigram keys as a list, in the dictionary's insertion order.
bigrams_list = [x for x in bigrams.keys()]
# Sanity check against the value recorded for 'he' (exact float comparison).
print(bigrams['he'] == -3.7599265883973865)

True


In [154]:
# dictionary and list of the most frequent monograms (single letters) in the English language

# Letter -> frequency value, listed from the most to the least frequent letter.
monograms = dict(
    E=12.0, T=9.10, A=8.12, O=7.68, I=7.31, N=6.95, S=6.28,
    R=6.02, H=5.92, D=4.32, L=3.98, U=2.88, C=2.71, M=2.61,
    F=2.30, Y=2.11, W=2.09, G=2.03, P=1.82, B=1.49, V=1.11,
    K=0.69, X=0.17, Q=0.11, J=0.10, Z=0.07,
)

# The same letters in lower case, kept in the dictionary's frequency order.
monograms_list = list(map(str.lower, monograms))

In [975]:
# classic substitution cipher to encrypt/decrypt normal string - returns an encrypted text[0], and a key as a list[1]

def encrypt(text, key = None):
    """Encrypt text with a simple substitution cipher.

    The text is lower-cased first. A letter found in `alphabet` is replaced by
    the key entry at that letter's alphabet position; every other character is
    copied unchanged.

    Parameters
    ----------
    text : the text to encrypt.
    key  : sequence of substitute letters indexed by alphabet position.
           When missing or empty, a randomly shuffled alphabet is used.

    Returns
    -------
    (str, key) : the encrypted text and the key that was used.
    """
    if not key:
        key = list(alphabet)
        random.shuffle(key)

    def substitute(symbol):
        # Characters outside the alphabet (spaces, digits, punctuation) pass through.
        if symbol in alphabet:
            return key[alphabet.index(symbol)]
        return symbol

    return ''.join(map(substitute, text.lower())), key

def decrypt(text, key):
    """Undo a substitution cipher produced with `key`.

    The text is lower-cased first. A letter found in `alphabet` is looked up
    in the key and replaced by the alphabet letter at that position; every
    other character is copied unchanged.

    Parameters
    ----------
    text : the encrypted text.
    key  : the substitution key (string or list of letters).

    Returns
    -------
    str : the decrypted text.
    """
    def restore(symbol):
        # Characters outside the alphabet were never substituted.
        if symbol in alphabet:
            return alphabet[key.index(symbol)]
        return symbol

    return ''.join(map(restore, text.lower()))

# Encrypt the sample text with a randomly shuffled key (no seed is set in the
# visible cells, so the key differs between runs).
encrypted_bee, test_key = encrypt(bee1)
# Round-trip check: decrypting the first 100 characters must give back the original text.
print(decrypt(encrypted_bee[:100], test_key) == bee1[:100])

True


In [647]:
# score function that takes in a list of bigrams and sums the values from the bigrams dictionary

def score(list_of_bigrams):
    """Return the sum of the `bigrams` dictionary values for the given bigrams.

    Parameters
    ----------
    list_of_bigrams : iterable of bigram strings; each must be a key of the
                      module-level `bigrams` dictionary.

    Returns
    -------
    The sum of the looked-up values (0 for an empty input).
    """
    return sum(bigrams[pair] for pair in list_of_bigrams)

# Check the bigram score of the raw ciphertext against the recorded value.
# NOTE(review): find_ngrams is defined in a later cell, so this line only works
# after that cell has been run.
print(score(find_ngrams(ciphertext, 2, dic = False)) == -1380.9684879021993)

True


In [606]:
# score function that takes 2 keys, tells how many letters are matching and which ones (letters = True)

def score2(key1, key2, letters = False):
    """Count the positions at which two keys hold the same letter.

    Parameters
    ----------
    key1, key2 : the keys to compare; every index of key1 is read from key2.
    letters    : when true, also return which positions matched.

    Returns
    -------
    int, or (int, str) when `letters` is true: the number of matching
    positions and a string of the alphabet letters at those positions.
    """
    matched = [alphabet[pos] for pos in range(len(key1)) if key1[pos] == key2[pos]]
    if not letters:
        return len(matched)
    return len(matched), ''.join(matched)
    
# Sanity check: only the first three positions ('a', 'b', 'c') match the alphabet.
print(score2(alphabet, 'abc11111111111111111111111') == 3)

True


In [1050]:
class MinHeapString:
    """Bounded min-heap of ``[item, score]`` pairs that keeps the best-scoring entries.

    Entries are ordered by their second element (``value[1]``). At most
    `heap_limit` entries are kept: once the heap is full, a new entry is only
    accepted if its score beats the current minimum, which is then evicted.
    The highest-scoring entry ever accepted is tracked in `self.max`.
    """
    def __init__(self, heap_limit = 10):
        """Create an empty heap holding at most `heap_limit` entries."""
        self.count = 0
        # Index 0 is a placeholder so that the children of idx are 2*idx and 2*idx + 1.
        self.heap_list = [None]
        self.max = None
        self.heap_limit = heap_limit
    def left_child_idx(self, idx):
        """Return the index of the left child of `idx`."""
        return idx * 2
    def right_child_idx(self, idx):
        """Return the index of the right child of `idx`."""
        return idx * 2 + 1
    def parent_idx(self, idx):
        """Return the index of the parent of `idx` (0 for the root)."""
        return idx // 2
    def has_children(self, idx):
        """Return True when `idx` has at least a left child."""
        return self.left_child_idx(idx) <= self.count
    def smaller_child_idx(self, idx):
        """Return the index of the lower-scoring child of `idx`, or None without children.

        On equal scores the right child is returned.
        """
        if not self.has_children(idx):
            return None
        left = self.left_child_idx(idx)
        right = self.right_child_idx(idx)
        if right > self.count:
            return left
        if self.heap_list[left][1] < self.heap_list[right][1]:
            return left
        return right
    def add(self, value):
        """Offer ``value`` (``[item, score]``) to the heap.

        The value is accepted while there is room, or when its score is higher
        than the current minimum; in the latter case the minimum is evicted so
        the size never exceeds `heap_limit`. Rejected values are ignored.
        """
        has_room = self.count < self.heap_limit
        # Guard on count so that a heap with heap_limit == 0 does not index an empty heap.
        beats_min = self.count > 0 and value[1] > self.heap_list[1][1]
        if not (has_room or beats_min):
            return
        self.heap_list.append(value)
        self.count += 1
        self.heapify_up(self.count)
        if self.max is None or value[1] > self.max[1]:
            self.max = value
        if self.count > self.heap_limit:
            self.remove_min()
    def remove_min(self):
        """Remove and return the lowest-scoring entry.

        Raises IndexError when the heap is empty.
        """
        if self.count == 0:
            raise IndexError('remove_min from an empty heap')
        smallest = self.heap_list[1]
        # Move the last entry to the root, drop the old root, then restore order.
        self.heap_list[1] = self.heap_list[self.count]
        self.heap_list[self.count] = smallest
        self.heap_list.pop()
        self.count -= 1
        self.heapify_down()
        return smallest
    def heapify_up(self, idx):
        """Move the entry at `idx` up until its parent's score is not larger."""
        while self.parent_idx(idx) > 0:
            parent = self.parent_idx(idx)
            if self.heap_list[parent][1] <= self.heap_list[idx][1]:
                # Everything above was already ordered, so nothing more can change.
                break
            self.heap_list[parent], self.heap_list[idx] = self.heap_list[idx], self.heap_list[parent]
            idx = parent
    def heapify_down(self):
        """Move the root entry down until neither child has a smaller score."""
        idx = 1
        while self.has_children(idx):
            child = self.smaller_child_idx(idx)
            if self.heap_list[idx][1] <= self.heap_list[child][1]:
                # The subtrees below were already ordered, so nothing more can change.
                break
            self.heap_list[idx], self.heap_list[child] = self.heap_list[child], self.heap_list[idx]
            idx = child

In [607]:
# gets permutations from a string or list, returns all possible permutations
# eg 123 - 123,132, 231, 213, 312, 321

class Permutations:
    """Generate every ordering of the elements of a list (or string).

    Example: ['a', 'b', 'c'] gives abc, acb, bac, bca, cab, cba, in that order.
    """
    def __init__(self):
        # Permutations produced by the most recent permute() call.
        # The former `self.n = n` line was removed: `n` is not a parameter of
        # this constructor and the attribute is not used by this class.
        self.result = []
    def permute(self, lst):
        """Return all permutations of `lst`, each as a list of its elements.

        The result is rebuilt on every call, so repeated calls do not
        accumulate earlier permutations. It is also stored in `self.result`.
        """
        self.result = []
        self.backtrack(lst, [])
        return self.result
    def backtrack(self, lst, path):
        """Recursively extend `path` with each remaining element of `lst`.

        When no elements remain, `path` is a complete permutation and is
        appended to `self.result`.
        """
        if not lst:
            self.result.append(path)
            return
        for x in range(len(lst)):
            # Remove element x from the pool and append it to the current path.
            self.backtrack(lst[:x] + lst[x + 1:], path + [lst[x]])

# Sanity check: the six orderings of 'abc' come out in the expected order.
permute_test = Permutations()
print(permute_test.permute([l for l in 'abc']) == [['a', 'b', 'c'], ['a', 'c', 'b'],['b', 'a', 'c'],
                                                   ['b', 'c', 'a'],['c', 'a', 'b'], ['c', 'b', 'a']])


True


In [934]:
# gets permutations from a string or list, returns all possible permutations of length n, repeats allowed
# eg 123, n = 2 - 11, 12, 13, 21, 22, 23, 31, 32, 33
# this is also a helper function to create a dictionary of ngrams

class Permutations2:
    """Generate every length-`n` sequence over a list (or string), repeats allowed.

    Example: 'abc' with n = 2 gives aa, ab, ac, ba, bb, bc, ca, cb, cc.
    """
    def __init__(self, n = 1):
        # Sequences produced by the most recent permute() call.
        self.result = []
        # Length of every generated sequence.
        self.n = n
    def permute(self, lst):
        """Return all length-`n` sequences over `lst`, each as a list.

        The result is rebuilt on every call, so repeated calls do not
        accumulate earlier sequences. It is also stored in `self.result`.
        """
        self.result = []
        self.backtrack(lst, [])
        return self.result
    def backtrack(self, lst, path):
        """Recursively extend `path` with every element of `lst` until it has length `n`."""
        if len(path) == self.n:
            self.result.append(path)
            return
        for x in range(len(lst)):
            self.backtrack(lst, path + [lst[x]])
    def convert_to_combinations(self):
        """Reduce `self.result` to unordered combinations of distinct elements.

        Sequences with a repeated element are dropped, the rest are sorted and
        de-duplicated. `self.result` becomes a list of tuples in no particular
        order (it comes from a set).
        """
        # Works on this instance's own result instead of an unrelated global.
        self.result = list({tuple(sorted(lst)) for lst in self.result if len(set(lst)) == len(lst)})
        

# Sanity check: all nine length-2 sequences over 'abc', repeats included.
permute_test2 = Permutations2(n = 2)
print(permute_test2.permute([l for l in 'abc']) == [['a', 'a'], ['a', 'b'], ['a', 'c'], 
                                                    ['b', 'a'], ['b', 'b'], ['b', 'c'], 
                                                    ['c', 'a'], ['c', 'b'], ['c', 'c']])
# All ordered letter pairs, then reduced to unordered pairs of two different letters.
bigram_set = Permutations2(2)
bigram_set.permute([l for l in alphabet])
bigram_set.convert_to_combinations()
# 26 letters give 26 * 25 / 2 = 325 unordered pairs.
print(len(bigram_set.result) == 325)

True
True


In [820]:
# create a dictionary of all the grams of n length, going to 0

def create_dictionary(n):
    """Return a dictionary with every length-`n` letter string as key and 0 as value.

    The keys are built from all length-`n` sequences over `alphabet`
    (repeats allowed) as generated by Permutations2.

    Parameters
    ----------
    n : length of the n-grams used as keys.

    Returns
    -------
    dict : n-gram string -> 0.
    """
    generator = Permutations2(n)
    sequences = generator.permute(alphabet)
    return dict.fromkeys((''.join(letters) for letters in sequences), 0)

# Sanity check: every bigram key, including doubled letters, starts at zero.
dictionary_test = create_dictionary(2)
print(dictionary_test['bb'] == 0)

True


In [1044]:
# function to take an encryption, count the letters/pairs - by letter count 
# (eg letter count = 1 -> a,b,c,d,e,f,g, letter count = 2 -> ab, ac, ad, ee, gr, th...)
# parameters include dic/list, percentage, ordered, log

def find_ngrams(text, letter_count, dic = True, percentage = True, ordered = True, log = True, zeroes = False):
    """Extract or count the n-grams of a text.

    Only characters found in `alphabet` are considered (after lower-casing);
    n-grams are read with a sliding window over the remaining letters.

    Parameters
    ----------
    text         : the text to analyse.
    letter_count : length of the n-grams (1 -> single letters, 2 -> bigrams, ...).
    dic          : when false, return the plain list of n-grams in text order.
    percentage   : divide each count by the total number of n-grams.
    ordered      : return a list of (ngram, value) tuples sorted by value,
                   highest first; otherwise return a dictionary.
    log          : with `percentage`, replace each non-zero frequency by its
                   natural log (n-grams that never occur keep the value 0).
    zeroes       : keep n-grams that never occur in the text.

    Returns
    -------
    list of str when `dic` is false; otherwise a list of (ngram, value)
    tuples (`ordered`) or a dict ngram -> value.
    """
    text = ''.join([l for l in text.lower() if l in alphabet])
    # A text shorter than letter_count contains no n-grams at all.
    number_ngrams = max(len(text) - letter_count + 1, 0)
    ngrams = [text[i: i + letter_count] for i in range(number_ngrams)]
    if not dic:
        return ngrams

    raw_counts = create_dictionary(letter_count)
    for g in ngrams:
        raw_counts[g] += 1

    ngrams_count = dict(raw_counts)
    # Skipped when there are no n-grams, which would otherwise divide by zero.
    if percentage and number_ngrams:
        for key, seen in raw_counts.items():
            value = seen / number_ngrams
            if log and seen:
                value = np.log(value)
            ngrams_count[key] = value

    pairs = list(ngrams_count.items())
    if not zeroes:
        # Filter on the raw count: a log-frequency of 0 (frequency 1.0) is a
        # real n-gram, and this also works when the result stays a dictionary.
        pairs = [(key, value) for key, value in pairs if raw_counts[key] != 0]
    if ordered:
        return sorted(pairs, key=lambda x:x[1], reverse = True)
    return dict(pairs)

# Sanity check: the most frequent bigram of the sample text is expected to be 'th'.
print(find_ngrams(bee1, 2)[0][0] == 'th')

True


In [611]:
# how many times does the code create a word that is within common words (or special list containing certain characters)
# eg 'asdh sdalfjads sladkfj cat' = 1

def code_matches(text, words = common_words):
    """Count how many words of `text` appear in a word list.

    The text is split with turn_to_list; every occurrence counts, so a word
    that appears twice in the text is counted twice.
    Example: 'asdh sdalfjads sladkfj cat' -> 1.

    Parameters
    ----------
    text  : the text to check.
    words : the accepted words (defaults to `common_words`).

    Returns
    -------
    int : the number of words in `text` found in `words`.
    """
    # Use the caller's list (it was previously ignored); a set gives constant-time lookups.
    vocabulary = set(words)
    return sum(1 for word in turn_to_list(text) if word in vocabulary)

# Sanity check: all three words in the first 16 characters of the sample text are common words.
print(code_matches(bee1[:16]) == 3)

True


In [1309]:
# return a list of words (from common words or a given list) made using the letters, or containing

def words_from_letters(letters, word_search = common_words, containing = False):
    """Return the words that can be built from, or that contain, the given letters.

    Parameters
    ----------
    letters     : the letters to work with.
    word_search : the words to search (defaults to `common_words`).
    containing  : when false, keep words made up only of the given letters;
                  when true, also keep words that contain at least one of them.

    Returns
    -------
    list of str : the selected words, in `word_search` order.
    """
    pool = set(letters)

    def wanted(word):
        used = set(word)
        # Sharing any letter is enough in 'containing' mode.
        if containing and used & pool:
            return True
        return used <= pool

    return [word for word in word_search if wanted(word)]

# For every letter of the alphabet, the common words that contain that letter.
containing_dictionary = {l: words_from_letters(l, containing = True) for l in alphabet}
# Sanity check: with only 'o' available, these are the words the list allows.
print(words_from_letters('o') == ['o', 'oo', 'ooo'])

True


In [1436]:
# return a string swapping letter at index1 with letter at index2

def swap(string, index1, index2):
    """Return a copy of `string` with the characters at two indices exchanged.

    Parameters
    ----------
    string         : the string to modify.
    index1, index2 : positions of the two characters to exchange (any order).

    Returns
    -------
    str : the string with both characters swapped; the input itself when the
    two indices are equal.
    """
    low, high = sorted((index1, index2))
    if low == high:
        return string
    head, middle, tail = string[:low], string[low + 1:high], string[high + 1:]
    return ''.join((head, string[high], middle, string[low], tail))

# Sanity check: swapping positions 1 and 2 of 'sidney' gives 'sdiney'.
print(swap('sidney', 1, 2) == 'sdiney')

True


In [1437]:
# get the start key for the hill climb from the monogram frequencies of the encrypted text

def hillclimb_key(encrypted_text):
    """Build a starting key by matching letter-frequency ranks.

    The letters of the encrypted text are ranked by frequency with
    find_ngrams (letters that never occur are included). For each letter of
    `alphabet`, the key takes the cipher letter whose rank equals that
    letter's position in `monograms_list`.

    Parameters
    ----------
    encrypted_text : the ciphertext to analyse.

    Returns
    -------
    str : the starting key, one cipher letter per alphabet position.
    """
    ranked = find_ngrams(encrypted_text, 1, zeroes = True, log = False)
    return ''.join(ranked[monograms_list.index(letter)][0] for letter in alphabet)

# Frequency-based starting key for the encrypted sample text.
monogram_key = hillclimb_key(encrypted_bee)
# Sanity check: the first letter of the ciphertext's starting key is expected to be 'j'.
print(hillclimb_key(ciphertext)[0] == 'j')

True


In [1417]:
# Expected plaintext of `ciphertext`; a later cell compares the solver's output against it.
correct_ciphertext = 'withasingledropofinkforamirrortheegyptiansorcererundertakestorevealtoanychancecomerfarreachingvisionsofthepastthisiswhatiundertaketodoforyoureader'
# NOTE(review): presumably the key that decrypts `ciphertext`; it is not referenced in the visible cells.
correct_ciphertext_key = 'rcwaqtoixmvphsjbgknlydufez'
# The same letters are typed at the prompt in the recorded run of the final cell;
# no visible function reads this variable.
incorrect_letters = 'blvfmpk'

In [1451]:
# second stage of decryption. take mostly completed text, instead of fixing the key, start a new key to alter
# takes user input and asks which letters are in the wrong places. then switches all around

def dictionary_attack5(text):
    """Interactively find a correction key for a mostly decrypted text.

    The text is printed and the user is asked which letters are in the wrong
    places. Every key that rearranges those letters within the alphabet is
    tried (via letters_to_keys), and the candidates are ranked by how many
    common words occur as substrings of the re-encoded text. Candidates are
    then shown best first until the user answers 'r'.

    Parameters
    ----------
    text : the mostly decrypted text.

    Returns
    -------
    str : the key the user accepted.

    Raises
    ------
    IndexError : when the user rejects every candidate key.
    """
    print(text)
    incorrect = input('what are the wrong letters?')
    keys = letters_to_keys(incorrect, alphabet)
    keys_count = []
    for candidate in keys:
        decrypted_text = encrypt(text, candidate)[0]
        count = len([word for word in common_words if word in decrypted_text])
        keys_count.append([candidate, count])
    keys_count = sorted(keys_count, key=lambda x:x[1], reverse = True)
    for key, _ in keys_count:
        print(encrypt(text, key))
        satisfied = input('is this (r)ight or (w)rong')
        if satisfied == 'r':
            return key
    # Same exception type as the former pop() from an empty list, with a clear message.
    raise IndexError('every candidate key was rejected; no keys left to try')

# Interactive check: this call prompts for input. In the recorded run the answers
# were 'fpvm' and then 'r'.
# NOTE(review): letters_to_keys is defined in a later cell, so that cell has to be run first.
print(dictionary_attack5(decrypt(ciphertext, 'rcwaqboixzvpdsjtgknlyhufem')) == 'abcdepghijklvnofqrstumwxyz')

withasingledrofopinkporavirrortheegyftiansorcererundertakestoremealtoanychancecoverparreachingmisionsopthefastthisiswhatiundertaketodoporyoureader
what are the wrong letters?fpvm
('withasingledropofinkforamirrortheegyptiansorcererundertakestorevealtoanychancecomerfarreachingvisionsofthepastthisiswhatiundertaketodoforyoureader', 'abcdepghijklvnofqrstumwxyz')
is this (r)ight or (w)rongr
True


In [1444]:
# gets the bigrams count, and creates a key that matches a text that most resembles the english language. 

def hillclimb_score(encrypted_text, iterations = 25):
    """Hill-climb over substitution keys to maximise the bigram score of the decryption.

    Starting from the frequency-based key of hillclimb_key, every key that
    differs by one swap of two positions is scored with
    score(find_ngrams(decrypt(...), 2, dic = False)). The best key seen so far
    becomes the next starting point.

    Parameters
    ----------
    encrypted_text : the ciphertext to break.
    iterations     : maximum number of climbing rounds (default 25).

    Returns
    -------
    str : the best-scoring key found.
    """
    def key_score(key):
        # Bigram score of the text as decrypted with this key.
        return score(find_ngrams(decrypt(encrypted_text, key), 2, dic = False))

    start_key = hillclimb_key(encrypted_text)
    potential_new_keys = MinHeapString()
    # Seed with the start key's own score on the text passed in, not on the
    # raw global ciphertext.
    potential_new_keys.add([start_key, key_score(start_key)])
    for _ in range(iterations):
        for i1 in range(len(start_key)):
            # swap(i1, i2) equals swap(i2, i1), so each pair is scored once.
            for i2 in range(i1 + 1, len(start_key)):
                key = swap(start_key, i1, i2)
                potential_new_keys.add([key, key_score(key)])
        best_key = potential_new_keys.max[0]
        if best_key == start_key:
            # No swap improved the score; later rounds would repeat the same work.
            break
        start_key = best_key
    return start_key

# Run the hill climb on the ciphertext and compare with the key recorded from an earlier run.
bigram_score_key = hillclimb_score(ciphertext)
print(bigram_score_key == 'rvwaqboixmctdsjpgknlyhufez')

True


In [1443]:
# takes in letters, returns a list of all the possible keys made by mixing those letters within the key

def letters_to_keys(letters, key):
    """Return every key obtained by rearranging the given letters within `key`.

    The positions of `letters` in `key` are permuted (with Permutations); for
    each permutation the letters are written to the permuted positions and
    the resulting key is collected.

    Parameters
    ----------
    letters : the letters to shuffle among their positions.
    key     : the key (string) that contains those letters.

    Returns
    -------
    list of str : one key per permutation, in the order Permutations yields them.
    """
    positions = [key.index(letter) for letter in letters]
    arrangements = Permutations().permute(positions)
    # Each arrangement overwrites the same set of slots, so one working copy is enough.
    working = list(key)
    candidates = []
    for arrangement in arrangements:
        for slot, letter in zip(arrangement, letters):
            working[slot] = letter
        candidates.append(''.join(working))
    return candidates

# Sanity check: two letters give two keys, the unchanged key and the one with 'v' and 'm' exchanged.
print(letters_to_keys('vm', 'rcwaqtoixmvpdsjbgknlyhufez')
      == ['rcwaqtoixmvpdsjbgknlyhufez', 'rcwaqtoixvmpdsjbgknlyhufez'])

True


In [1446]:
# helper function to take text with 2 keys to decrypt, then encrypt, and therefore decrypt overall

def double_key_combo(text, key1, key2):
    """Decrypt `text` with `key1`, then encode the result with `key2`.

    Parameters
    ----------
    text : the text to process.
    key1 : key passed to decrypt.
    key2 : key passed to encrypt.

    Returns
    -------
    str : the text after both steps.
    """
    intermediate = decrypt(text, key1)
    recoded, _ = encrypt(intermediate, key2)
    return recoded

In [1450]:
# combination of hillclimb_score and dictionary_attack5.
# Uses 2 keys, one obtained from each, to decrypt a text; pass get_keys = True to also get the keys back

def substitution_decrypt(text, get_keys = False):
    """Decrypt a substitution cipher in two stages.

    Stage one finds a key with hillclimb_score. Stage two shows the resulting
    text to the user through dictionary_attack5 (interactive) to obtain a
    correction key. Both keys are then applied with double_key_combo.

    Parameters
    ----------
    text     : the ciphertext.
    get_keys : when true, also return the two keys.

    Returns
    -------
    str, or (str, [key1, key2]) when `get_keys` is true.
    """
    key1 = hillclimb_score(text)
    key2 = dictionary_attack5(decrypt(text, key1))
    plaintext = double_key_combo(text, key1, key2)
    return (plaintext, [key1, key2]) if get_keys else plaintext
    
# The letters typed at the prompt in the recorded run; this variable is not
# passed to substitution_decrypt.
incorrect_letters = 'blvfmpk'
# Interactive end-to-end check: prompts for the wrong letters and for confirmation of each candidate.
print(substitution_decrypt(ciphertext) == correct_ciphertext)

withasingpedrofolinbloravirrortheegyftiansorcererundertabestoremeaptoanychancecoverlarreachingmisionsolthefastthisiswhatiundertabetodoloryoureader
what are the wrong letters?blvfmpk
('withasingledropofinbforamirrortheegyptiansorcererundertabestorevealtoanychancecomerfarreachingvisionsofthepastthisiswhatiundertabetodoforyoureader', 'abcdepghijkfvnolqrstumwxyz')
is this (r)ight or (w)rongw
('withasingledromofinbforapirrortheegymtiansorcererundertabestorevealtoanychancecoperfarreachingvisionsofthemastthisiswhatiundertabetodoforyoureader', 'abcdemghijkfvnolqrstupwxyz')
is this (r)ight or (w)rongw
('withasingledropofinkforamirrortheegyptiansorcererundertakestorevealtoanychancecomerfarreachingvisionsofthepastthisiswhatiundertaketodoforyoureader', 'akcdepghijbfvnolqrstumwxyz')
is this (r)ight or (w)rongr
True
