In [110]:
from collections import defaultdict, namedtuple
from importlib import reload
from pathlib import Path
import sys
from string import ascii_lowercase as LETTERS
from random import sample, choice, uniform
from time import time
from math import log, exp

from tqdm import tqdm
import numpy as np
from sklearn.datasets import fetch_20newsgroups

sys.path.insert(0, str(Path.cwd().parents[0] / 'src')) 
import crypto as cg; reload(cg)

<module 'crypto' from '/Users/chris/Documents/cryptogram-solver/src/crypto.py'>

In [111]:
news_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=20180827)
news_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=20180827)

In [112]:
tokenizer_test = cg.Tokenizer()
tokenizer_test.tokenize("This is a sentence.")

defaultdict(int,
            {Token(ngrams=('this',), kind=<NgramKind: word>, n=1): 1,
             Token(ngrams=('is',), kind=<NgramKind: word>, n=1): 1,
             Token(ngrams=('a',), kind=<NgramKind: word>, n=1): 1,
             Token(ngrams=('sentence',), kind=<NgramKind: word>, n=1): 1,
             Token(ngrams=('<',), kind=<NgramKind: char>, n=1): 4,
             Token(ngrams=('t',), kind=<NgramKind: char>, n=1): 2,
             Token(ngrams=('h',), kind=<NgramKind: char>, n=1): 1,
             Token(ngrams=('i',), kind=<NgramKind: char>, n=1): 2,
             Token(ngrams=('s',), kind=<NgramKind: char>, n=1): 3,
             Token(ngrams=('>',), kind=<NgramKind: char>, n=1): 4,
             Token(ngrams=('<', 't'), kind=<NgramKind: char>, n=2): 1,
             Token(ngrams=('t', 'h'), kind=<NgramKind: char>, n=2): 1,
             Token(ngrams=('h', 'i'), kind=<NgramKind: char>, n=2): 1,
             Token(ngrams=('i', 's'), kind=<NgramKind: char>, n=2): 2,
             Token

In [177]:
reload(cg)
tokenizer = cg.Tokenizer(char_ngram_range=(1, 3), word_ngram_range=(1, 1), vocab_size=1000000)
tokenizer.fit(news_train['data'][:5000])
print(len(tokenizer.vocab))
print(tokenizer.totals)

100%|██████████| 5000/5000 [00:54<00:00, 91.56it/s]

73829
defaultdict(<class 'int'>, {(<NgramKind: word>, 1): 1586776, (<NgramKind: char>, 1): 9938770, (<NgramKind: char>, 2): 8351994, (<NgramKind: char>, 3): 6765218})





In [178]:
tokenizer.totals

defaultdict(int,
            {(<NgramKind: word>, 1): 1586776,
             (<NgramKind: char>, 1): 9938770,
             (<NgramKind: char>, 2): 8351994,
             (<NgramKind: char>, 3): 6765218})

In [179]:
doc = cg.Doc(news_test['data'][0])
mapper = cg.Mapping()
mapper.scramble()
doc = mapper.translate(doc)
doc

'Ftni: pttnibzz@hjslzmdjns.ul.hos.zbs (Kzm Attnibzz)\nSskhzuq: Rz: Il wq xnnb qopq Jzlsl bwzb?\nOtxpmwfpqwnm: Jnoml Hnadwml Umweztlwqj CS Dzaq.\nLwmzl: 31\n\nIm ptqwucz <1993Aat26.215627.24917@ntxpmawaz.ssx.ptwfnmp.zbs> ktwpm@cac.ptwfnmp.zbs (Btwpm Czuuptzccw 602/621-9615) gtwqzl:\n>A kpkj\'l wmmnuzmuz opl mnqowmx qn bn gwqo gozqozt qoz kpkj\n>wl p lwmmzt.  Immnuzmuz pmb qoz lwm mpqstz ptz qgn bwvvztzmq pqqtwksqzl. \n>Toz kpkj wl wmmnuzmq, jzq qoz kpkj wl p lwmmzt.   \n>Yns opez qgn ptil pmb qgn czxl?  Woj?  Bzupslz jnst aptzmql bwb.\n>Woj? Bzupslz qozwt aptzmql bwb.  Equ.  Dwb jns bn pmjqowmx qn xzq qozi?\n\nToz qowmx wl, I dmng gopq ptil pmb czxl ptz.  Iq\'l qoztzvntz xzmztpccj zplj qn\nqzcc gozqozt nt mnq lniznmz opl ptil pmb czxl.  Towl "lwmvsc mpqstz", lwmuz wq\nbnzl mnq tzyswtz qopq qoz kpkj puqspccj aztvnti pmj lwml, lzzil qn kz qnqpccj\nwmewlwkcz.  Al vpt pl I dmng, ipjkz opcv qoz kpkwzl opez p lwmvsc mpqstz pmb\nopcv bnm\'q--wq\'b cnnd zrpuqcj qoz lpiz, lwmuz qoztz wl mn gpj q

In [None]:
import seaborn as sns

In [269]:
np.exp(np.linspace(1, -6, 5))

array([  2.71828183e+00,   4.72366553e-01,   8.20849986e-02,
         1.42642339e-02,   2.47875218e-03])

In [205]:
np.exp(np.linspace(10, -10, 100))

array([  2.20264658e+04,   2.38696456e+03,   2.58670631e+02,
         2.80316249e+01,   3.03773178e+00,   3.29192988e-01,
         3.56739933e-02,   3.86592014e-03,   4.18942123e-04,
         4.53999298e-05])

In [305]:
'''
Ideas:
> Implement simulated annealing properly (single swap and decision), with a reasonable temp scheduler.
  - This works! But need to play with temperature scheduler since results are sensitive to even small changes.
> Implement simulated annealing but using softmax and subsetting swap options.
> More swaps in the beginning, fewer later.
'''

solver = cg.Solver(tokenizer, None, 0.1)

def simulated_annealing(text, solver, tokenizer, max_epochs=10000, is_debug=True):
    best_mapping = mapping = cg.Mapping()
    doc = cg.Doc(text)
    best_score = solver.score(doc)
    epoch = 0
    decisions = defaultdict(int)
    temps = np.exp(np.linspace(0, -6, 10000))
    for temp in tqdm(temps):
        # print('*', end='')
        improving = False
        new_mapping = mapping.random_swap(doc.letters)
        new_doc = new_mapping.translate(doc)
        score = solver.score(new_doc)
        score_change = score - best_score
        # if score_change < 0 or exp(-score_change / temp) > uniform(0, 1):
        #     print(f'score change: {score_change}')
        #     print('updating')
        #     best_mapping = mapping
        #     best_score = score
        if score_change < 0:
            # print('updating (improvement)')
            best_mapping = new_mapping
            best_score = score
            decisions['good'] += 1
        elif exp(-score_change / temp) > uniform(0, 1):
            # Break this out as different section just for debugging.
            # print(f'updating (bad change): {score_change}')
            best_mapping = new_mapping
            best_score = score
            decisions['bad_keep'] += 1
        else:
            decisions['bad_pass'] += 1
            # print(f'keeping: {score_change}')
        mapping = best_mapping
        epoch += 1
        if epoch % 1000 == 0:
            print(f'{score:0.5g}, {mapping.mapping}, {mapping.translate(doc).text}')
            print(sorted(list(decisions.items())))
            decisions = defaultdict(int)
    print(f'\nfinal best ({epoch} epochs): {best_score:0.5g}')
    return mapping.translate(doc).text

In [308]:
# text = doc.text
# text = 'Rbo rpktigo vcrb bwucja wj kloj hcjd, km sktpqo, cq rbwr loklgo vcgg cjqcqr kj skhcja wgkja wjd rpycja rk ltr rbcjaq cj cr. -- Roppy Lpwrsborr'
text = 'This is the story of a girl. Who cried a river and drowned the whole world. And while she looked so sad in photographs, I absolutely love her when she smiles.'
# text = "I've found that when everyone rallies behind a cause, and when they learn their effort can contribute something bigger, they get engaged."
doc = cg.Doc(text.lower())
mapping = cg.Mapping()
mapping.scramble()
doc = mapping.translate(doc)
doc.text

'dlst st dlh tdjak jy w zsao. flj iashc w asuha wvc cajfvhc dlh fljoh fjaoc. wvc flsoh tlh ojjqhc tj twc sv eljdjzawelt, s wgtjomdhok ojuh lha flhv tlh trsoht.'

In [310]:
timer = cg.Timer()
timer.tic()
text = simulated_annealing(doc.text, solver, tokenizer)
timer.toc()
print(text)

 11%|█         | 1086/10000 [00:01<00:13, 667.15it/s]

15.447, ugrhwoklfdnbyceszqtpjavimx, jhps ps jhd sjuvg um e qpvf. ihu xvpdn e vpadv ewn nvuiwdn jhd ihufd iuvfn. ewn ihpfd shd fuurdn su sen pw ohujuqveohs, p ebsufyjdfg fuad hdv ihdw shd scpfds.
[('bad_keep', 450), ('bad_pass', 135), ('good', 415)]


 21%|██        | 2097/10000 [00:03<00:12, 649.71it/s]

14.19, hoqylxgcventmpbwrdjaufkisz, reyl yl rea lrstw sd p zytb. ves xtyah p tyuat pih htsviah rea vesba vstbh. pih veyba lea bsscah ls lph yi jesrsztpjel, y pglsbmrabw bsua eat veai lea lqybal.
[('bad_keep', 363), ('bad_pass', 227), ('good', 410)]


 31%|███       | 3079/10000 [00:04<00:10, 661.57it/s]

13.517, simbfxrvouktecldzjhwgpnqay, poal al pos lpryk rz t qayi. eor byasn t yajsy thn nyrehsn pos eoris eryin. thn eoais los irrxsn lr ltn ah morprqytmol, a tulricpsik irjs osy eosh los lgaisl.
[('bad_keep', 311), ('bad_pass', 381), ('good', 308)]


 41%|████▏     | 4131/10000 [00:06<00:08, 665.53it/s]

12.691, juachyelqpgobwsrxvtkmdfzin, vhos os vhe svact af n xocl. wha ycoed n cobec nrd dcawred vhe whale wacld. nrd whole she laaied sa snd or ghavaxcnghs, o nksaluvelt labe hec wher she spoles.
[('bad_keep', 237), ('bad_pass', 539), ('good', 224)]


 51%|█████     | 5107/10000 [00:07<00:07, 667.96it/s]

12.731, jzechyblspqaougknvtdxifrwm, this is the stalp af y bilm. wha vlied y linel yrd dlawred the whame walmd. yrd whime she maaked sa syd ir chatablychs, i yosamztemp mane hel wher she sximes.
[('bad_keep', 89), ('bad_pass', 827), ('good', 84)]


 61%|██████    | 6084/10000 [00:09<00:05, 668.67it/s]

11.654, wgechzulsxnoqvjrbatdmpfyki, this is the story ox a firl. who zried a riger and drowned the whole world. and while she loomed so sad in chotofrachs, i absolutely loge her when she spiles.
[('bad_keep', 39), ('bad_pass', 909), ('good', 52)]


 71%|███████▏  | 7139/10000 [00:10<00:04, 670.61it/s]

11.094, wgechyzlspqonvjrbatdmufxki, this is the story of a girl. who zried a river and drowned the whole world. and while she looked so sad in chotograchs, i absolutely love her when she spiles.
[('bad_keep', 8), ('bad_pass', 977), ('good', 15)]


 81%|████████  | 8096/10000 [00:12<00:02, 670.05it/s]

12.167, wgechizlspqorvjybatdmufxkn, this is the story op a girl. who fried a river and drowned the whole world. and while she looked so sad in chotograchs, i absolutely love her when she smiles.
[('bad_keep', 2), ('bad_pass', 995), ('good', 3)]


 91%|█████████ | 9096/10000 [00:13<00:01, 672.45it/s]

10.748, wgechyzlspqorvjibatdmufxkn, this is the story of a girl. who pried a river and drowned the whole world. and while she looked so sad in chotograchs, i absolutely love her when she smiles.
[('bad_keep', 1), ('bad_pass', 997), ('good', 2)]


100%|██████████| 10000/10000 [00:14<00:00, 671.47it/s]

11.454, wgichyzlspqorvjebatdmufxkn, this is the story of a girl. who cried a river and drowned the whole world. and while she looked so sad in photographs, i absolutely love her when she smiles.
[('bad_keep', 2), ('bad_pass', 997), ('good', 1)]

final best (10000 epochs): 10.3
this is the story of a girl. who cried a river and drowned the whole world. and while she looked so sad in photographs, i absolutely love her when she smiles.



