In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from itertools import cycle
from string import ascii_lowercase, ascii_uppercase
from flashtext import KeywordProcessor

# Plaintext corpus: only the 'index' and 'text' columns are read, with
# 'index' used as the DataFrame index.
train = pd.read_csv(
    "../input/ciphertext-challenge-iii/train.csv",
    index_col='index',
    usecols=['index', 'text'],
)
# Ciphertexts to decode, keyed by 'ciphertext_id'.
test = pd.read_csv(
    '../input/ciphertext-challenge-iii/test.csv',
    index_col='ciphertext_id',
)
# Submission template, keyed by 'ciphertext_id' like `test`.
sub = pd.read_csv(
    '../input/ciphertext-challenge-iii/sample_submission.csv',
    index_col='ciphertext_id',
)

def decode_level_1(text, i=0, key='pyle'):
    """Undo the level-1 substitution on `text`.

    Letters 'a'..'y' and 'A'..'Y' are shifted back by the current key
    letter's offset (``ord(letter) - 97``) modulo 25, keeping their case.
    'z', 'Z' and every non-letter character pass through unchanged and do
    not consume a key letter.

    Args:
        text: String to decode.
        i: Position in `key` used for the first shifted letter.
        key: Lowercase key word; it is cycled as letters are shifted.

    Returns:
        The decoded string, same length as `text`.
    """
    shifts = [ord(letter) - 97 for letter in key]
    decoded = []
    for char in text:
        if char in ascii_lowercase and char != 'z':
            base = 97
        elif char in ascii_uppercase and char != 'Z':
            base = 65
        else:
            # Not part of the 25-letter alphabet: copy through, keep key position.
            decoded.append(char)
            continue
        decoded.append(chr((ord(char) - base - shifts[i]) % 25 + base))
        i = (i + 1) % len(shifts)
    return ''.join(decoded)

def rail_pattern(n):
    """Return an endless zig-zag of rail numbers.

    The sequence climbs 0, 1, ..., n, then descends n-1, ..., 1, and
    repeats from 0 again.
    """
    ascending = range(n + 1)
    # Interior rails only on the way back, so 0 and n are not repeated.
    descending = range(n - 1, 0, -1)
    return cycle([*ascending, *descending])

def decode_level_2(text, rails=20):
    """Undo the level-2 rail-fence transposition on `text`.

    Every position of the plaintext is tagged with the rail number that
    ``rail_pattern(rails)`` yields for it. Reading positions rail by rail
    (and left to right within a rail) gives the order in which the
    ciphertext characters were written, so each ciphertext character is put
    back at the position it came from.

    Args:
        text: Ciphertext string.
        rails: Argument forwarded to ``rail_pattern``.

    Returns:
        The re-ordered string, same length as `text`.
    """
    rail_of = rail_pattern(rails)
    # (rail, position) pairs; sorting them orders by rail, then by position.
    tagged = [(next(rail_of), position) for position in range(len(text))]
    tagged.sort()
    plain = [''] * len(text)
    for (_, position), char in zip(tagged, text):
        plain[position] = char
    return ''.join(plain)

from urllib.request import urlopen
# Fetch the text that serves as the level-3 key; decode_level_3 indexes into it.
with urlopen("https://www.gutenberg.org/files/46464/46464-0.txt") as key_file:
    key_level3 = key_file.read().decode('utf-8')
# Turn every line-break character into a space, one character for one
# character, so the length of the text is unchanged.
key_level3 = key_level3.replace('\r', ' ')
key_level3 = key_level3.replace('\n', ' ')
print(len(key_level3))

def decode_level_3(text, key=key_level3):
    """Undo the level-3 encoding on `text`.

    `text` is a sequence of integers separated by single spaces; each
    integer is an index into `key`, and the characters found at those
    indexes are concatenated. Empty tokens (from repeated, leading or
    trailing spaces) are ignored.

    Args:
        text: Space-separated integer indexes.
        key: Indexable text to look the indexes up in. Defaults to the
            module-level ``key_level3``.

    Returns:
        The string formed by the looked-up characters.

    Raises:
        ValueError: If a token is not an integer.
        IndexError: If an index is outside `key`.
    """
    # Index the `key` argument (previously the global was used even when a
    # different key was passed in).
    return ''.join(key[int(n)] for n in text.split(" ") if n != '')

# Integer key for level 4: decode_level_4 uses this list as its default `key`
# and XORs it element by element with the base64-decoded ciphertext bytes.
# The literal is split over three physical lines; do not reformat by hand.
key4 = [49, 36, 97, 134, 109, 43, 4, 250, 67, 119, 137, 145, 139, 96, 180, 34, 149, 124, 252, 17, 90, 66, 119, 90, 189, 154, 228, 249, 189, 132, 133, 80, 144, 129, 8, 48, 162, 33, 208, 124, 176, 51, 51, 253, 201, 19, 40, 34, 108, 245, 150, 222, 205, 226, 82, 239, 75, 167, 42, 244, 128, 62, 13, 178, 60, 74, 82, 62, 127, 94, 32, 29, 251, 196, 250, 139, 62, 149, 235, 20, 76, 40, 143, 191, 184, 20, 104, 72, 128, 117, 178, 119, 138, 203, 77, 104, 244, 100, 24, 47, 49, 179, 62, 255, 70, 92, 163, 181, 215, 248, 123, 236, 239, 43, 49, 190, 157, 76, 53, 116, 188, 144, 75, 203, 146, 184, 159, 182, 49, 253, 14, 70, 202, 95, 162, 119, 113, 239, 181, 143, 3, 208, 163, 17, 74, 67, 159, 250, 249, 110, 255, 46, 83, 110, 16, 250, 166, 207, 157, 191, 18, 118, 250, 8, 143, 53, 98, 40, 17, 27, 161, 6, 147, 80, 223, 75, 61, 150, 187, 155, 86, 227, 255, 32, 188, 180, 137, 219, 215, 135, 247, 247, 200, 252, 82, 100, 126, 24, 179, 71, 0, 67, 19, 27, 26, 155, 197, 183, 213, 76, 246, 200, 244, 4, 75, 212, 70, 131, 154, 89, 169, 251, 16, 113, 73, 62, 19, 170, 190, 202, 155, 27, 28, 23, 78, 85, 153, 19, 146, 170, 107, 225, 175, 30, 173, 74, 95, 244, 187, 178, 121, 54, 137, 162, 10, 151, 155, 63, 3, 139, 232, 13, 184, 219, 180, 119, 175, 112, 211, 156, 62, 76, 85, 241, 52, 138, 142, 156, 157, 14, 161, 235, 103, 101, 252, 66, 153, 156, 234, 75, 43, 21, 105, 111, 106, 240, 175, 214, 108, 177, 202, 9, 212, 29, 164, 200, 60, 242, 13, 115, 121, 201, 58, 82, 113, 174, 118, 152, 241, 3, 151, 238, 135, 220, 209, 2, 94, 228, 237, 116, 58, 6, 21, 27, 236, 227, 198, 233, 190, 69, 254, 205, 63, 239, 20, 122, 111, 235, 126, 165, 168, 150, 166, 12, 125, 161, 188, 22, 8, 46, 229, 75, 54, 186, 213, 99, 42, 47, 26, 96, 153, 90, 123, 26, 223, 3, 151, 229, 203, 16, 98, 9, 116, 186, 188, 96, 102, 77, 53, 239, 208, 228, 121, 200, 217, 18, 12, 172, 212, 233, 27, 39, 248, 211, 44, 180, 163, 46, 175, 180, 26, 182, 207, 215, 141, 15, 244, 227, 219, 6, 12, 181, 58, 79, 155, 17, 73, 171, 215, 78, 1, 177, 115, 236, 68, 
21, 194, 172, 84, 177, 224, 234, 7, 40, 232, 214, 240, 66, 59, 79, 153, 4, 190, 216, 221, 47, 156, 23, 111, 118, 137, 254, 140, 130, 228, 221, 68, 25, 13, 86, 118, 20, 190, 74, 145, 183, 62, 195, 223, 182, 145, 86, 107, 151, 198, 215, 254, 74, 204, 113, 120, 195, 187, 198, 245, 46, 203, 119, 217, 6, 2, 226, 188, 10, 87, 84, 109, 43, 226, 79, 103, 28, 72, 145, 170, 70, 246, 160, 186, 121, 72, 247, 158, 88, 34, 140, 72, 81, 38, 250, 35, 92, 181, 163, 120, 63, 16, 51, 179, 150, 212, 159, 255, 122, 225, 114, 24, 73, 196, 80, 253, 5, 165, 241, 60, 236, 176, 68, 251, 158, 14, 90, 181, 134, 174, 232, 87, 114, 10, 32, 15, 213, 128, 227, 83, 28, 43, 75, 218, 234, 216, 53, 200, 51, 44, 118, 232, 78, 73, 106, 82, 48, 138, 230, 86, 252, 114, 3, 227, 17, 68, 61, 101, 4, 208, 79, 103, 97, 29, 191, 29, 151, 145, 45, 95, 202, 199, 70, 169, 150, 201, 255, 58, 112, 104, 66, 181, 118, 61, 49, 164, 200, 32, 79, 27, 131, 161, 217, 219, 55, 23, 39, 248, 155, 197, 41, 40, 116, 229, 106, 131, 220, 137, 23, 202, 106, 100, 23, 14, 72, 238, 157, 200, 38, 235, 26, 141, 157, 166, 14, 225, 13, 195, 61, 163, 86, 134, 247, 33, 100, 169, 170, 71, 114, 231, 14, 192, 155, 122, 218, 86, 83, 237, 71, 113, 176, 75, 217, 133, 91, 214, 24, 134, 168, 40, 27, 218, 11, 59, 87, 192, 56, 58, 27, 241, 214, 107, 235, 157, 197, 69, 126, 91, 67, 185, 37, 96, 46, 205, 17, 226, 227, 127, 178, 45, 197, 117, 151, 128, 82, 35, 98, 112, 45, 157, 233, 79, 180, 147, 74, 195, 255, 193, 96, 201, 12, 88, 234, 253, 174, 0, 15, 28, 96, 231, 100, 70, 29, 200, 111, 110, 55, 85, 205, 130, 222, 251, 154, 44, 107, 170, 224, 86, 40, 156, 208, 185, 39, 48, 167, 243, 248, 17, 227, 70, 120, 141, 83, 245, 92, 76, 142, 245, 97, 165, 177, 154, 147, 175, 222, 166, 177, 222, 73, 174, 234, 26, 167, 194, 130, 210, 239, 198, 70, 85, 253, 3, 21, 131, 93, 108, 92, 158, 137, 186, 9, 110, 120, 124, 248, 20, 102, 239, 167, 181, 12, 165, 229, 32, 131, 23, 57, 194, 182, 194, 181, 65, 3, 177, 2, 129, 157, 211, 84, 64, 190, 144, 122, 162, 198, 103, 
144, 117, 182, 157, 205, 231, 165, 253, 120, 205, 117, 18, 139, 238, 244, 172, 4, 209, 140, 199, 15, 6, 217, 5, 213, 117, 209, 58, 104, 150, 7, 248, 106, 29, 245, 224, 224, 95, 243, 79, 225, 163, 179, 60, 108, 144, 95, 191, 109, 24, 18, 10, 87, 233, 8, 179, 195, 106, 125, 13, 35, 53, 202, 25, 28, 208, 18, 19, 17, 189, 254, 44, 29, 11, 197, 98, 59, 188, 74, 44, 221, 161, 108, 4, 160, 19, 128, 198, 37, 119, 17, 40, 22, 236, 214, 76, 108, 125, 49, 16, 136, 38, 234, 164, 142, 40, 120, 11, 73, 42, 54, 181, 62, 230, 7, 101, 113, 163, 172, 225, 86, 88, 17, 40, 40, 236, 121, 104, 50, 49, 243, 54, 231, 185, 238, 121, 52, 78, 192, 31, 61, 234, 153, 120, 177, 143, 233, 31, 150, 20, 172, 70, 224, 141, 100, 69, 9, 38, 66, 241, 102, 175, 222, 51, 251, 230, 127, 220, 36, 116, 226, 174, 94, 101, 202, 46, 126, 87, 131, 111, 123, 110, 242, 61, 120, 238, 241, 246, 161, 24, 209, 99, 144, 73, 152, 143, 70, 180, 143, 20, 118, 168, 63, 174, 142, 209, 165, 92, 108, 83, 88, 61, 149, 157, 247, 240, 230, 83, 198, 167, 247, 199, 102, 83, 230, 66, 217, 158, 194, 219, 226, 226, 95, 110, 56, 161, 154, 114, 46, 94, 191, 115, 60, 247, 205, 113, 167, 21, 251, 135, 72, 29, 3, 26, 161, 2, 48, 106, 228, 71, 184, 198, 171, 244, 108, 134, 70, 153, 144, 113, 29, 178, 113, 160, 173, 208, 8, 103, 48, 114, 244, 77, 126, 188, 159, 161, 163, 41, 251, 199, 245, 157, 84, 184, 251, 189, 91, 177, 159, 187, 147, 245, 88, 121, 52, 61, 29, 22, 94, 179, 127, 241, 255, 191, 90, 222, 29, 154, 153, 253, 27, 254, 95, 118, 31, 159, 52, 62, 221]
import base64
def decode_level_4(text, key=key4):
    """Undo the level-4 encoding on `text`.

    `text` is base64-decoded to bytes, each byte is XOR-ed with the key
    value at the same position, and the results are turned into characters.
    Pairing stops at the shorter of the two sequences, so bytes beyond the
    end of `key` are dropped.

    Args:
        text: Base64 string.
        key: Sequence of integers to XOR with. Defaults to ``key4``.

    Returns:
        The decoded string.
    """
    raw_bytes = base64.b64decode(text.encode())
    return ''.join(chr(byte ^ key_value) for byte, key_value in zip(raw_bytes, key))

# A ciphertext of difficulty k is decoded by applying the decoders for
# levels k, k-1, ..., 1 in that order.
level_decoders = (decode_level_1, decode_level_2, decode_level_3, decode_level_4)


def _peel_levels(ciphertext, level):
    """Apply the decoders for `level` down to 1 to one ciphertext string."""
    for decoder in reversed(level_decoders[:level]):
        ciphertext = decoder(ciphertext)
    return ciphertext


for level in (1, 2, 3, 4):
    rows = test["difficulty"] == level
    # `level=level` binds the current level to the lambda at definition time.
    test.loc[rows, "text"] = test.loc[rows, "ciphertext"].map(lambda x, level=level: _peel_levels(x, level))
print(test["text"][0:10])

In [None]:
# Case-sensitive keyword index over the training texts. The set of
# "non-word boundary" characters is emptied — presumably so that a training
# text can be found at any position inside a longer decoded string
# (NOTE(review): confirm against the flashtext documentation).
keyword_processor = KeywordProcessor(case_sensitive=True)
keyword_processor.set_non_word_boundaries(set())

# Register each training text under its train index. Texts shorter than three
# characters, and the literal text 'So.', are left out.
for index, text in tqdm(train.itertuples()):
    if len(text) >= 3 and text != 'So.':
        keyword_processor.add_keyword(text, index)
print(len(keyword_processor))

In [None]:
def good_match(match, text):
    """Return True when `match` sits exactly in the middle of `text`.

    The expected start is ``(len(text) - len(match)) // 2``, so when the
    surplus length is odd the extra character is on the right-hand side.
    """
    size = len(match)
    start = (len(text) - size) // 2
    centred = text[start:start + size]
    return centred == match

def match_row(text):
    """Find the train index whose text is centred inside `text`.

    Returns:
        0 when `text` is NaN (detected via ``text != text``);
        the train index when exactly one extracted keyword passes
        ``good_match``;
        -1 when there are zero or several such keywords, or when a lookup
        raises ``KeyError``.
    """
    try:
        if text != text:
            # Only NaN compares unequal to itself.
            return 0
        confirmed = []
        for candidate in keyword_processor.extract_keywords(text):
            if good_match(train.loc[candidate]['text'], text):
                confirmed.append(candidate)
    except KeyError:
        return -1
    return confirmed[0] if len(confirmed) == 1 else -1

# Look up a train index for every decoded test row.
test['result'] = test.text.map(match_row)

# Train indexes written directly for specific ciphertext ids, applied in
# the order listed.
manual_results = {
    'ID_e0791504d': 37812,
    'ID_911f2b3eb': 80745,
    'ID_2f1f63fe7': 21918,
    'ID_c4ba7a73f': 26878,
    'ID_995b9bf74': 40357,
    'ID_84da29acc': 86258,
    'ID_1779dc0ff': 40373,
    'ID_017ce565e': 18145,
    'ID_bd1fdba60': 0,
    'ID_f97cdc2f4': 2129,
    'ID_74baa59b0': 20876,
    'ID_50e469eec': 28571,
    'ID_78a8e1c11': 92340,
    'ID_0414884b0': 42677,
}
for ciphertext_id, train_index in manual_results.items():
    test.loc[ciphertext_id, 'result'] = train_index
# Copy the 'result' values into the submission frame in its own row order,
# then write the submission file.
sub["index"] = test.loc[sub.index, 'result']
sub.to_csv('submit-level-4.csv')

In [None]:
# Test rows whose 'result' is -1.
bad_test = test.loc[test['result'] == -1]
# 'result' values that are >= 0, taken as train indexes already used.
good_train_index = test.loc[test["result"] >= 0, "result"].tolist()
print(len(good_train_index))
# Training rows whose index is not among those values.
bad_train = train.loc[~train.index.isin(good_train_index)].copy()
print(bad_test.text)
print(bad_train.text)