# Decryption with Heuristics and Machine Learning

In [206]:
import pandas as pd
import re
from collections import Counter

# load the processed cyphertext table
cyphertext_path = '../data/processed/countdown_23.csv'
df = pd.read_csv(cyphertext_path)
cypher_artists = df['cyphertext'].tolist()
# remember the original row count so later cells can trim added rows
num_artists = len(df.index)

# load the reference artist names (one entry per line of the file)
with open('../data/artists.txt', 'r') as file:
    artists = file.read().splitlines()
df

Unnamed: 0,cyphertext
0,ABC D
1,ABE FGHIJK
2,AJALB
3,AMNOBJI FDBKPF
4,AMFF QKJ
5,RJKS (QT FJG)
6,RUOGAKJ
7,RMNSIJ ASCQBLJK
8,RSMVC LWL TSFOXB
9,SIMYJK OJIQJRF


In [207]:
# heuristic patterns
dj_set_pattern = re.compile(r"\([A-Z0-9]{2} [A-Z0-9]{3}\)")  # DJ SET
b2b_pattern = re.compile(r"(\w+\s)([A-Z0-9])([A-Z0-9])(\2)(\s\w+)")  # B2B


# extract heuristic patterns
def extract_patterns(text):
    """Derive cypher-to-plain character mappings from known phrases in *text*.

    Two heuristics are applied:

    * a parenthesised group of two then three characters is read as
      "(DJ SET)";
    * a three-character ``XYX`` word between two other words is read as
      "B2B". The word on each side of it is also appended to the global
      ``df`` as a new ``cyphertext`` row.

    A one-to-one substitution is assumed, so a candidate that would need one
    cypher character to stand for two different plaintext characters is
    ignored (e.g. ``(AB ABC)`` for "DJ SET" or ``AAA`` for "B2B").

    Args:
        text: one cyphertext entry.

    Returns:
        dict mapping cyphertext characters to plaintext characters; empty
        when neither heuristic applies.
    """
    global df
    mappings = {}

    # DJ SET
    plain_dj_set = "DJ SET".replace(" ", "")
    for match in dj_set_pattern.findall(text):
        # drop the surrounding parentheses and the inner space
        cypher_dj_set = match[1:-1].replace(" ", "")
        # D, J, S, E, T are all different, so the cypher characters must be too
        if len(set(cypher_dj_set)) != len(plain_dj_set):
            continue
        mappings.update(zip(cypher_dj_set, plain_dj_set))

    # B2B
    for left, cypher_b, cypher_2, _, right in b2b_pattern.findall(text):
        # "B" and "2" are different characters; an XXX word cannot be B2B
        if cypher_b == cypher_2:
            continue
        # the words on either side of B2B are added as rows of their own
        new_df = pd.DataFrame(
            [left.strip(), right.strip()], columns=["cyphertext"]
        )
        df = pd.concat([df, new_df], ignore_index=True)
        mappings[cypher_b] = "B"
        mappings[cypher_2] = "2"

    return mappings

In [208]:
# seed the cyphertext -> plaintext table from the heuristic phrases
# (D, J, S, E, T, B, 2)
mappings = {}
for cypher_name in df["cyphertext"]:
    for cypher_char, plain_char in extract_patterns(cypher_name).items():
        mappings[cypher_char] = plain_char

In [209]:
df

Unnamed: 0,cyphertext
0,ABC D
1,ABE FGHIJK
2,AJALB
3,AMNOBJI FDBKPF
4,AMFF QKJ
5,RJKS (QT FJG)
6,RUOGAKJ
7,RMNSIJ ASCQBLJK
8,RSMVC LWL TSFOXB
9,SIMYJK OJIQJRF


In [210]:
# build the all-blank starting plaintext
def create_decoded(s):
    """Return *s* with every ASCII letter and digit replaced by ``-``.

    Spaces, parentheses and any other characters are left where they are.
    """
    alnum = re.compile(r'[A-Za-z0-9]')
    return alnum.sub('-', s)

# every row starts with a fully blanked plaintext
blank_plaintext = df["cyphertext"].apply(create_decoded)
df["plaintext"] = blank_plaintext
df.head()

Unnamed: 0,cyphertext,plaintext
0,ABC D,--- -
1,ABE FGHIJK,--- ------
2,AJALB,-----
3,AMNOBJI FDBKPF,------- ------
4,AMFF QKJ,---- ---


In [211]:
# update plaintext with mappings
def replace_decoded(row, mapping=None):
    """Return the row's plaintext with every known cypher character decoded.

    The cyphertext and plaintext are walked in lockstep: a position whose
    cypher character is in the mapping gets the mapped plaintext character,
    any other position keeps its current plaintext character.

    Args:
        row: mapping-like object with "cyphertext" and "plaintext" strings
            (e.g. a DataFrame row).
        mapping: cypher-to-plain character dict; defaults to the
            module-level ``mappings``.

    Returns:
        The updated plaintext string.
    """
    if mapping is None:
        mapping = mappings
    return "".join(
        mapping.get(ct, dc)
        for ct, dc in zip(row["cyphertext"], row["plaintext"])
    )


# decode every row with the mappings known so far
df["plaintext"] = df.apply(replace_decoded, axis=1)
df.head()

Unnamed: 0,cyphertext,plaintext
0,ABC D,--- -
1,ABE FGHIJK,--- ST--E-
2,AJALB,-E-B-
3,AMNOBJI FDBKPF,-----E- S----S
4,AMFF QKJ,--SS D-E


In [212]:
# match plaintext pattern with artist
def match_pattern(pattern, name, mapping=None):
    """Tell whether *name* is compatible with a partly decoded *pattern*.

    A ``-`` in the pattern is an undecoded position: the name's character
    there must not be a space and must not be a plaintext character that is
    already assigned in the mapping. Every other pattern character
    (including a space) must equal the name's character at that position.

    Args:
        pattern: partly decoded plaintext, with ``-`` for unknown positions.
        name: candidate plaintext name.
        mapping: cypher-to-plain character dict; defaults to the
            module-level ``mappings``.

    Returns:
        True if the name fits the pattern, False otherwise.
    """
    if mapping is None:
        mapping = mappings

    # check if pattern and name have the same length
    if len(pattern) != len(name):
        return False

    # collect the already-assigned plaintext characters once, not per char
    used_plain = set(mapping.values())

    # check if pattern and name match
    for p_char, n_char in zip(pattern, name):
        if p_char == "-":
            if n_char == " " or n_char in used_plain:
                return False
        elif p_char != n_char:
            return False
    return True


# match pattern with all artists
def match_artist(pattern):
    """Return the reference artists that fit *pattern*, joined by ", ".

    When the pattern contains the decoded "(DJ SET)" tag, each reference
    name is tried (and reported) with " (DJ SET)" appended. Returns None
    when no name fits.
    """
    suffix = " (DJ SET)" if "(DJ SET)" in pattern else ""
    hits = []
    for artist in artists:
        candidate = artist + suffix
        if match_pattern(pattern, candidate):
            hits.append(candidate)
    if not hits:
        return None
    return ", ".join(hits)


# match all plaintext patterns with artists
def match_artists():
    """Recompute the "matches" column of ``df`` from its "plaintext" column."""
    patterns = df["plaintext"]
    df["matches"] = patterns.apply(match_artist)

In [213]:
def update_row(row_num, artist):
    """Accept *artist* as the plaintext of row *row_num* and propagate it.

    Each alphanumeric cypher character of the row that has no mapping yet is
    mapped to the artist's character at the same position, unless that
    plaintext character is already the target of another mapping. The
    plaintext and matches columns of ``df`` are then recomputed.
    """
    # set artist for row
    df.at[row_num, "plaintext"] = artist

    for ct_char, pt_char in zip(df.at[row_num, "cyphertext"], artist):
        if not ct_char.isalnum():
            continue
        if ct_char in mappings:
            continue
        if pt_char in mappings.values():
            continue
        mappings[ct_char] = pt_char

    # re-decode every row with the enlarged mapping, then re-match
    df["plaintext"] = df.apply(replace_decoded, axis=1)
    match_artists()

In [214]:
match_artists()
mapping_change = True
# repeat until a full pass over the table accepts no new artist
while mapping_change:
    mapping_change = False
    for row_num, row in df.iterrows():
        if not row["matches"]:
            continue

        plaintext = row["plaintext"]
        matches = row["matches"].split(", ")
        # only act on rows with exactly one candidate
        if len(matches) != 1:
            continue

        # and with fewer than 35% of their characters still blank
        blanks = plaintext.count("-")
        if not blanks / len(plaintext) < 0.35:
            continue

        artist = matches[0]
        # skip a candidate that already appears as some row's plaintext
        if artist in df["plaintext"].tolist():
            continue

        update_row(row_num, artist)
        mapping_change = True
        # the table was just recomputed, so restart the scan
        break
# keep only the first num_artists rows
df = df.head(num_artists)
df

Unnamed: 0,cyphertext,plaintext,matches
0,ABC D,MAU P,MAU P
1,ABE FGHIJK,MAX STYLER,MAX STYLER
2,AJALB,MEMBA,
3,AMNOBJI FDBKPF,MICHAEL SPARKS,
4,AMFF QKJ,MISS DRE,
5,RJKS (QT FJG),NERO (DJ SET),NERO (DJ SET)
6,RUOGAKJ,NGHTMRE,NGHTMRE
7,RMNSIJ ASCQBLJK,NICOLE MOUDABER,
8,RSMVC LWL TSFOXB,NOIZU B2B JOSHWA,
9,SIMYJK OJIQJRF,OLIVER HELDENS,OLIVER HELDENS


In [215]:
# order the mapping by plaintext character for display
mappings = {
    cypher: plain
    for cypher, plain in sorted(mappings.items(), key=lambda pair: pair[1])
}
mappings

{'W': '2',
 'B': 'A',
 'L': 'B',
 'N': 'C',
 'Q': 'D',
 'J': 'E',
 'Z': 'F',
 'U': 'G',
 'O': 'H',
 'M': 'I',
 'T': 'J',
 'P': 'K',
 'I': 'L',
 'A': 'M',
 'R': 'N',
 'S': 'O',
 'D': 'P',
 'K': 'R',
 'F': 'S',
 'G': 'T',
 'C': 'U',
 'Y': 'V',
 'X': 'W',
 'E': 'X',
 'H': 'Y',
 'V': 'Z'}