Include xor() function from prior challenge.

In [1]:
from numpy import corrcoef
import string
import array
import numpy as np
import collections

# Load the common-word list once, upper-cased so lookups are case-insensitive.
# A list comprehension (rather than map()) guarantees a real list that can be
# searched repeatedly; on Python 3, map() returns a one-shot iterator that the
# repeated `in` checks in score_english_words() would exhaust.
with open("google-10000-english/google-10000-english-usa.txt") as f:
    word_list = [line.rstrip().upper() for line in f]
    
%run 'set_01_challenge_02.ipynb'

######  Score string to see if it is English

Scoring function based on correlation with the [English relative letter frequency table](http://www.math.cornell.edu/~mec/2003-2004/cryptography/subs/frequencies.html):

This simple correlation coefficient wouldn't be very effective for language classification but may work well for simply distinguishing between incorrect and correct plaintext.

In [2]:
def score_english_letter_cor(txt):
    """Score how English-like ``txt`` is using letter-frequency correlation.

    The relative frequency of each letter A..Z in ``txt`` (case-insensitive;
    every other character is ignored) is correlated against a reference table
    of English letter frequencies.

    Parameters
    ----------
    txt : str
        Candidate plaintext.

    Returns
    -------
    float
        Correlation coefficient between the reference table and the letter
        frequencies of ``txt``. Returns 0.0 when the correlation is
        undefined: ``txt`` is empty, contains no letters, or uses every
        letter equally often.
    """
    # English language letter frequency table for A..Z
    freq_table = array.array('f', [8.12, 1.49, 2.71, 4.32, 12.02, 2.30, 2.03, 5.92, 7.31, 0.10, 0.69, 3.98, 2.61, 6.95, 7.68, 1.82, 0.11, 6.02, 6.28, 9.10, 2.88, 1.11, 2.09, 0.17, 2.11, 0.07])

    # An empty string would otherwise cause a division by zero below.
    if not txt:
        return 0.0
    n_txt = float(len(txt))

    # Upcase and count only A..Z, which drops digits, whitespace and punctuation.
    letter_counts = collections.Counter(
        c for c in txt.upper() if c in string.ascii_uppercase
    )

    # Relative frequency table for the input txt, in A..Z order.
    txt_freq = [letter_counts[l] / n_txt for l in string.ascii_uppercase]

    # A constant vector has zero variance, so corrcoef would yield NaN, and a
    # NaN score makes later comparisons such as max() unreliable.
    if len(set(txt_freq)) == 1:
        return 0.0

    return corrcoef(freq_table, txt_freq)[0, 1]

In [3]:
def score_english_words(txt, word_list):
    """Count the whitespace-separated tokens of ``txt`` found in ``word_list``.

    ``txt`` is upper-cased before splitting, so ``word_list`` is expected to
    hold upper-case words. Tokens are not stripped of punctuation, and a
    token that occurs several times is counted each time.

    Parameters
    ----------
    txt : str
        Candidate plaintext.
    word_list : container of str
        Known words; anything supporting ``in`` works.

    Returns
    -------
    int
        Number of tokens that are members of ``word_list``.
    """
    # sum() over a generator instead of len(filter(...)): filter() returns an
    # iterator without a length on Python 3.
    return sum(1 for word in txt.upper().split() if word in word_list)

###### Decode hex encoded strings

In [4]:
def hex_to_ascii(hstr):
    """Decode a hex-encoded string into ASCII text.

    Parameters
    ----------
    hstr : str
        String of hexadecimal digits, two per byte.

    Returns
    -------
    str
        The decoded text, or an empty string when ``hstr`` is not valid hex
        or the decoded bytes are not all ASCII.
    """
    # Handle invalid hex strings by returning an empty string. Only decoding
    # failures are caught: ValueError covers malformed hex and non-ASCII bytes
    # (UnicodeDecodeError is a subclass), TypeError covers non-string input.
    try:
        # str() yields the native string type, so the success and failure
        # branches return the same type.
        return str(bytearray.fromhex(hstr).decode('ascii'))
    except (ValueError, TypeError):
        return ""

Tests from the [Fixed XOR](http://cryptopals.com/sets/1/challenges/2) exercise.

In [5]:
hex_to_ascii('686974207468652062756c6c277320657965')

"hit the bull's eye"

In [6]:
hex_to_ascii('746865206b696420646f6e277420706c6179')

"the kid don't play"

###### Search for decryption key

In [7]:
def simple_xor_decrypt(hex_ciphertxt):
    """Recover the plaintext of a hex string XOR-ed with a single-letter key.

    Every ASCII letter (a-z, A-Z) is tried as the key. Each candidate
    plaintext is scored as its letter-frequency correlation multiplied by
    the number of known English words it contains, and the plaintext with
    the highest score is returned.

    Relies on ``xor()`` from the prior challenge's notebook (loaded with
    ``%run``) and on the module-level ``word_list``.

    Parameters
    ----------
    hex_ciphertxt : str
        Hex-encoded ciphertext.

    Returns
    -------
    str
        The best-scoring candidate plaintext.
    """
    # Two hex digits per byte. Floor division keeps this an int so it can be
    # used as a string repeat count.
    len_ciphertxt = len(hex_ciphertxt) // 2

    candidates = {}
    for l in string.ascii_letters:
        # Hex-encode the key letter and repeat it once per ciphertext byte.
        candidate_key = ('%02x' % ord(l)) * len_ciphertxt
        hex_plaintxt = xor( hex_ciphertxt, candidate_key )
        plaintxt = hex_to_ascii( hex_plaintxt )
        score = score_english_letter_cor( plaintxt ) * score_english_words( plaintxt, word_list )
        candidates[l] = {
            'hex_plaintxt': hex_plaintxt,
            'plaintxt': plaintxt,
            'score': score,
        }

    best_key = max( candidates, key = lambda k: candidates[k]['score'] )
    return candidates[ best_key ]['plaintxt']

In [8]:
simple_xor_decrypt("1b37373331363f78151b7f2b783431333d78397828372d363c78373e783a393b3736")

"Cooking MC's like a pound of bacon"

<img src="http://cdn.meme.am/instances/61639139.jpg"/>