# Libraries

In [None]:
%pylab inline

In [None]:
import base64
import pandas as pd
from itertools import cycle

# Utils

In [None]:
class DotDict(dict):
    """
    A dictionary that supports dot notation
    as well as dictionary access notation.

    usage: d = DotDict() or d = DotDict({'val1':'first'})
    set attributes: d.val2 = 'second' or d['val2'] = 'second'
    get attributes: d.val2 or d['val2']

    Nested mappings (anything exposing ``keys``) passed to the constructor
    are converted to DotDict recursively.

    A missing key accessed with dot notation raises AttributeError (not
    KeyError) so that ``hasattr``, ``getattr(obj, name, default)`` and
    ``copy.deepcopy`` behave normally; item access still raises KeyError.

    https://stackoverflow.com/a/13520518/1924403
    """

    def __init__(self, dct=None):
        """Fill the dictionary from the mapping *dct* (empty when omitted)."""
        super().__init__()
        if dct is None:
            dct = {}
        for key, value in dct.items():
            if hasattr(value, 'keys'):
                value = DotDict(value)
            self[key] = value

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails, so real dict
        # methods (keys, items, ...) are never shadowed by this fallback.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    __setattr__ = dict.__setitem__

    def __delattr__(self, name):
        try:
            del self[name]
        except KeyError:
            raise AttributeError(name) from None

In [None]:
# Load the training set, the test set and the sample submission from ../input.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
sample = pd.read_csv('../input/sample_submission.csv')

In [None]:
# Keep one frame per difficulty level, each renumbered from 0.
test3 = test.loc[test['difficulty'] == 3].reset_index(drop=True)
test4 = test.loc[test['difficulty'] == 4].reset_index(drop=True)

## First look

#### First, we assume that we can't generate custom difficulty 3 ciphertexts from unencrypted texts, because of the nature of the difficulty 3 encryption algorithm

In [None]:
test3.head(1)

In [None]:
test4.head(1)

Difficulty 4 really looks like base64, especially since it often ends with a few "=" characters.
However, after trying it in an online decoder, the letters look shuffled (or replaced by some kind of XOR algorithm?)

## Checking alphabet size
The longest difficulty 4 string is at index 25054 (found in another notebook)

In [None]:
# Row 25054 of test4 is used as the reference (longest) difficulty-4 ciphertext.
longuest = test4['ciphertext'].iloc[25054]
# Distinct characters of that ciphertext, joined in set iteration order.
alphabet = ''.join(set(longuest))
print('Number of different chars :', len(alphabet))
print('Alphabet (sorted) :', ''.join(sorted(alphabet)))

It really looks like base64!

# Frequency analysis

In [None]:
def get_dictionnary(df, col):
    """
    Count how often each character occurs in a text column.

    df: DataFrame holding the texts
    col: name of the column whose values are read character by character

    Returns a DataFrame with one row per distinct character and the columns
    'letter' and 'frequency', sorted by decreasing frequency and indexed
    from 0. When no character is found the result is empty but still has
    both columns.
    """
    # Local import: the notebook's import cell does not provide Counter.
    from collections import Counter

    counts = Counter()
    for text in df[col]:
        # Counter.update on a string counts its individual characters.
        counts.update(text)

    # Build the two columns explicitly so an empty count still yields a
    # frame with a 'frequency' column to sort on.
    dic = pd.DataFrame({
        'letter': list(counts.keys()),
        'frequency': list(counts.values()),
    })
    return (dic
        .sort_values(by='frequency', ascending=False)
        .reset_index(drop=True))

In [None]:
# Build the per-character frequency table of each difficulty subset;
# the %time magic reports how long each call takes.
%time test3_dic = get_dictionnary(test3, 'ciphertext')
%time test4_dic = get_dictionnary(test4, 'ciphertext')

In [None]:
test3_dic.head()

In [None]:
test4_dic.head()

In [None]:
test3_dic.plot.bar(x='letter', y='frequency', figsize=(16, 6))

In [None]:
test4_dic.plot.bar(x='letter', y='frequency', figsize=(16, 6))

"i" and "l" seems slightly more frequents than others, whereas E is the rarest. ("=" is not to be used since it's just here for completion)

in base64 :
* i = 100010
* l = 100101
* E = 000100

## What the distribution of a "normal" base64 encoding of numbers may actually look like
#### Hypothesis: no padding between difficulty 3 and difficulty 4

In [None]:
# Base64-encode every difficulty-3 ciphertext (ASCII bytes in, text out).
test3['ciphertext_b64'] = test3['ciphertext'].map(
    lambda txt: base64.b64encode(txt.encode('ascii')).decode())
test3.head()

In [None]:
%time test3b_dic = get_dictionnary(test3, 'ciphertext_b64')

In [None]:
test3b_dic.plot.bar(x='letter', y='frequency', figsize=(16, 6))

In [None]:
# Concatenate the 'letter' column of the frequency table in row order
# (a single join instead of row-by-row string concatenation).
dic_txt = ''.join(test3b_dic['letter'])

alphabet = ''.join(set(dic_txt))
print('Number of different chars :', len(dic_txt))
print('Alphabet (sorted) :', ''.join(sorted(dic_txt)))

We should have a much smaller alphabet.

It can mean two things :
* either the diff 3 texts are transformed before being base64-encoded
* or the base64 strings are altered after the diff 3 texts are base64-encoded

# Can it be an XOR algorithm on base64-encoded diff 3?
* A few resources about XOR: https://en.wikipedia.org/wiki/XOR_cipher
* An XOR algorithm seems plausible since it applies very well to base64/binary data

In [None]:
def xor_crypt_string(data, key='myprivatekey', encode=False, decode=False):
    """
    XOR every character of data with the repeating key.

    data: text to transform; when decode is True, a base64 string or
          bytes object that is decoded first
    key: non-empty string, repeated as often as needed to cover data
    encode: when True, return the XOR-ed text base64-encoded, as bytes
    decode: when True, base64-decode data before XOR-ing

    Returns bytes when encode is True, otherwise a str.
    Raises ValueError when key is empty (nothing to XOR with).
    """
    if not key:
        # cycle('') yields nothing, so zip() would silently drop all data.
        raise ValueError('key must not be empty')
    if decode:
        # b64decode accepts both str and bytes and ignores line breaks.
        data = base64.b64decode(data).decode()
    xored = ''.join(chr(ord(x) ^ ord(y)) for x, y in zip(data, cycle(key)))
    if encode:
        # b64encode emits a single line; encodebytes would insert a newline
        # every 76 characters and pollute any character-frequency count.
        return base64.b64encode(xored.encode('ascii'))
    return xored


# Round-trip check with the default key: the first print shows the XOR-ed,
# base64-encoded form; the second decodes and XORs it again.
secret_data = "hello"
print(xor_crypt_string(secret_data, encode=True))
print(xor_crypt_string(xor_crypt_string(secret_data, encode=True), decode=True))

In [None]:
# XOR each base64 string with the default key and store the base64 text of the result.
test3['ciphertext_xor'] = [
    xor_crypt_string(b64_text, encode=True).decode()
    for b64_text in test3['ciphertext_b64']
]
test3[['ciphertext_id', 'ciphertext_xor']].head()

In [None]:
# Count characters of the XOR-ed column (timed), then draw the frequencies as a bar chart.
%time test3x_dic = get_dictionnary(test3, 'ciphertext_xor')
test3x_dic.plot.bar(x='letter', y='frequency', figsize=(16, 6))

In [None]:
# Concatenate the 'letter' column of the frequency table in row order
# (a single join instead of row-by-row string concatenation).
dic_txt = ''.join(test3x_dic['letter'])

alphabet = ''.join(set(dic_txt))
print('Number of different chars :', len(dic_txt))
print('Alphabet (sorted) :', ''.join(sorted(dic_txt)))

# Does the choice of the key change the distribution?
I tried a few random base64 strings as keys

In [None]:
# Candidate XOR keys to try.
keys = [
    'P2o3SAgu7Evfn1baYW8m',
    'rd7F5yGS8MWqxiYBnr8z',
    'na5uivXZ0Wka9WKYo0Nd',
    'cUbjnWc1lxFHjZ9GI9Qb',
    'sLebhvqrVM7OkioVIkIa',
    'wSTFqa9hAIqo2EB8leu1',
    'xZ5CDvSDWW95XkiG4uqL',
    'VuZcp798bOOSzu29d8ct',
    'ZUboM2SxwiBkpgSZCcRQ',
    'fQ2SlcNxMPtXK7atzS0F',
]

In [None]:
def get_metrics(key):
    """
    Print the alphabet obtained when XOR-ing the base64 column with key.

    key: XOR key handed to xor_crypt_string

    Works on a copy of test3: XORs every 'ciphertext_b64' value with key,
    counts the characters of the base64-encoded result and prints how many
    distinct characters appear, followed by those characters sorted.
    Returns nothing.
    """
    print('Key : ', key)
    work = test3.copy()
    # The key must be forwarded explicitly; otherwise xor_crypt_string
    # falls back to its default key and every run gives the same output.
    work['ciphertext_xor'] = test3['ciphertext_b64'].apply(
        lambda x: xor_crypt_string(x, key=key, encode=True).decode())
    work_dic = get_dictionnary(work, 'ciphertext_xor')
    # The frequency table has one row per letter, so joining the column
    # gives the alphabet directly.
    dic_txt = ''.join(work_dic['letter'])

    print('Number of different chars :', len(dic_txt))
    print('Alphabet (sorted) :', ''.join(sorted(dic_txt)))
    print('')

In [None]:
# Print the alphabet metrics once for each candidate key.
for key in keys:
    get_metrics(key)

### Conclusions
* whatever the key, the alphabet is always the same in the end (and it seems logical, in hindsight)
* it looks like the XOR algorithm might be the wrong track here, or maybe it is used in combination with something else?

### A last try - inverting orders between base64 and XOR

In [None]:
# Inverted order: XOR the raw ciphertext (default key, base64 output),
# then base64-encode that output once more.
test3['ciphertext_transformed'] = test3['ciphertext'].map(
    lambda txt: base64.b64encode(xor_crypt_string(txt, encode=True)).decode())
test3[['ciphertext_id', 'ciphertext_transformed']].head()

In [None]:
# Count characters of the transformed column (timed), then draw the frequencies as a bar chart.
%time test3u_dic = get_dictionnary(test3, 'ciphertext_transformed')
test3u_dic.plot.bar(x='letter', y='frequency', figsize=(16, 6))

In [None]:
# Concatenate the 'letter' column of the frequency table in row order
# (a single join instead of row-by-row string concatenation).
dic_txt = ''.join(test3u_dic['letter'])

alphabet = ''.join(set(dic_txt))
print('Number of different chars :', len(dic_txt))
print('Alphabet (sorted) :', ''.join(sorted(dic_txt)))
print('')

The distributions still do not match, and the alphabet is still incomplete!
Feel free to comment! I'm running out of ideas.