### Part I: Load datasets and just do some high-level exploration

In [None]:
# MODULES TO IMPORT
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import re
import seaborn as sns

In [None]:
# PANDAS DISPLAY SETTINGS: SHOW WIDE FRAMES AND LONG STRINGS WITHOUT TRUNCATION
_display_options = {
    'display.max_columns': 999,
    'display.width': 9999,
    'display.max_colwidth': 9999,
    'display.html.table_schema': True,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

# SHARED TABLE STYLE FOR RENDERED DATA FRAMES; APPLY IT WITH
# df.head().style.set_table_styles(tom_df_style)
# DATA CELLS ("td") AND HEADER CELLS ("th") GET THE SAME CSS PROPERTIES
tom_df_style = [
    dict(
        selector=cell_tag,
        props=[
            ('font-family', 'Consolas'),
            ('font-size', '8pt'),
            ('vertical-align', 'top'),
            ('text-align', 'left'),
            ('white-space', 'pre'),
        ],
    )
    for cell_tag in ("td", "th")
]

In [None]:
# READ THE RAW PLAIN-(T)EXT DATASET AND RECORD EACH TEXT'S CHARACTER COUNT
dft = pd.read_csv("/kaggle/input/train.csv").assign(
    text_length=lambda frame: frame["text"].str.len()
)
dft.head(7).style.set_table_styles(tom_df_style)

In [None]:
# MOST ARE <100 CHARACTERS
# HISTOGRAM (NO KDE CURVE) OF THE PLAIN TEXT LENGTHS, WITH A RUG MARK PER RECORD
# NOTE(review): SEABORN HAS DEPRECATED distplot (histplot + rugplot ARE THE DOCUMENTED REPLACEMENTS) -- CONFIRM THE INSTALLED SEABORN VERSION BEFORE MIGRATING
sns.distplot(dft["text_length"], kde=False, rug=True);

In [None]:
# READ THE (E)NCRYPTED DATASET AND RECORD EACH CIPHERTEXT'S CHARACTER COUNT
dfe = pd.read_csv("/kaggle/input/test.csv").assign(
    ciphertext_length=lambda frame: frame["ciphertext"].str.len()
)
dfe.head(7).style.set_table_styles(tom_df_style)

In [None]:
# THE DIFFICULTY LEVELS SOUND LIKE THEY ARE APPLIED IN SUCCESSION (1 -> 2 -> 3 -> 4), SO KEEP ONE INDEPENDENT COPY
# OF THE ENCRYPTED RECORDS PER LEVEL AND ONLY WORRY ABOUT "CRACKING" LEVEL 1 FIRST
dfe1, dfe2, dfe3, dfe4 = (
    dfe[dfe["difficulty"] == level].copy() for level in (1, 2, 3, 4)
)

In [None]:
# LEVEL 1, SHORTEST CIPHERTEXTS FIRST: THE TEXT LOOKS LIKE IT IS "JUST" SCRAMBLED IN SOME WAY
(
    dfe1
    .sort_values(["ciphertext_length", "ciphertext"], ascending=[True, True])
    .head(10)
    .style.set_table_styles(tom_df_style)
)

In [None]:
# LEVEL 2, SHORTEST CIPHERTEXTS FIRST: LOOKS LIKE IT MAY HAVE BEEN SCRAMBLED A SECOND TIME
(
    dfe2
    .sort_values(["ciphertext_length", "ciphertext"], ascending=[True, True])
    .head(7)
    .style.set_table_styles(tom_df_style)
)

In [None]:
# LEVEL 3, SHORTEST CIPHERTEXTS FIRST: FURTHER ENCRYPTED INTO SOME NUMERIC CODING
(
    dfe3
    .sort_values(["ciphertext_length", "ciphertext"], ascending=[True, True])
    .head(7)
    .style.set_table_styles(tom_df_style)
)

In [None]:
# LEVEL 4, SHORTEST CIPHERTEXTS FIRST: LOOKS LIKE A MORE ADVANCED BYTE ENCODING ON TOP
(
    dfe4
    .sort_values(["ciphertext_length", "ciphertext"], ascending=[True, True])
    .head(7)
    .style.set_table_styles(tom_df_style)
)

<h2>Part II: Attempt to shrink the problem down</h2>

<p style="font-size: 10pt"><b>What's known:</b><br/>
1) It was given that <i>"Every document in the dataset has been padded to the next hundred characters (95->100, 213->300) with random (in-alphabet) characters, then encrypted"</i><br/>
-and-<br/>
2) While we don't know how each training record (plain text) ends up getting encrypted, we <i><b>do</b></i> know that each test record (encrypted text) comes from a training record<br/>
<br/>
<b>Therefore:</b><br/>
We should focus on the very long encrypted (level 1) records and see what we can glean from there; hopefully, it leads to something "unique"!<br/>
</p>

In [None]:
# HOW ARE THE LEVEL 1 ENCRYPTED STRING LENGTHS DISTRIBUTED? (NON-NULL COUNT PER COLUMN, ONE ROW PER LENGTH)
dfe1.groupby(["ciphertext_length"]).count()

In [None]:
# THE LEVEL 1 ENCRYPTED PHRASES THAT ARE 400+ CHARACTERS WIDE (THERE ARE TWO)
(
    dfe1.loc[dfe1["ciphertext_length"] >= 400]
    .sort_values(["ciphertext"])
    .style.set_table_styles(tom_df_style)
)

In [None]:
# INPUT PHRASES BETWEEN 400 AND 500 CHARACTERS WIDE (BOTH ENDS INCLUSIVE); THE TWO ENCRYPTED PHRASES ABOVE
# *MUST* CORRELATE TO ONE OF THESE INPUT PHRASES (NOTE: THERE ARE THREE OF THEM)
(
    dft[dft["text_length"].between(400, 500)]
    .sort_values(["text"])
    .style.set_table_styles(tom_df_style)
)

<div><b><u>Some observations:</u></b><br/>
We can see that the first encrypted phrase <i>(ciphertext_id=ID_31bd699f6)</i> is "too short" to possibly go with any of these input phrases, so we know the one encrypted message <i>(ID_6100247c5)</i> <b>must</b> have come from one of these 3 input phrases<br/>
<br/>
By close inspection, scanning the "words" from left-to-right, we can infer the 2nd one <i>(plaintext_id=ID_f000cad17)</i> is the one; all the punctuation and spaces <i>(including any double spaces)</i> just seem to magically align!<br/>
<br/>
This has given us some "good hope" that:<br/><br/>
1) Punctuation and spacing appears to be fully preserved ✔<br/><br/>
2) Case appears to be preserved (lower vs upper) ✔<br/><br/>
3) Individual word lengths appear to be preserved ✔<br/><br/>
<br/>
Unfortunately, there's some not-so-good things, it looks like:<br/><br/>
4) It's unclear if we have the start of some sort of dictionary for 30+ words, as some words <i>(i.e. "the")</i> appear to have varying output values <i>(i.e. "xwd", "flt" and/or "ssi")</i><br/><br/>
5) It does not appear that individual letters are just 1-for-1; for example, sometimes input "t" ➝ encrypted "s", as in "<b style="color: blue">t</b>wo ➝ <b style="color: blue">s</b>is", and at other times "f", as in "grea<b style="color: blue">t</b> ➝ rvty<b style="color: blue">f</b>"<br/><br/>
6) The very beginning of the ciphertext seems to violate found assumptions 2-4 ... <span style="font-family: monospace">"gQUXDo<span style="color: blue">Bshspv</span>:" ➝ "<span style="color: blue">Porter:</span>"</span>
</div>

In [None]:
# PULL OUT THE PAIR IDENTIFIED ABOVE SO THE TWO STRINGS CAN BE COMPARED SIDE BY SIDE
example_encrypted_text = dfe1.loc[dfe1["ciphertext_id"] == "ID_6100247c5", "ciphertext"].values[0]
example_plain_text = dft.loc[dft["plaintext_id"] == "ID_f000cad17", "text"].values[0]

# FINDING #6 ABOVE: THE CIPHERTEXT CARRIES 6 EXTRA LEADING CHARACTERS, SO SHIFT THE PLAIN TEXT RIGHT BY 6 SPACES
example_plain_text = (" " * 6) + example_plain_text
pd.DataFrame(
    {"Text": [example_plain_text, example_encrypted_text]},
    index=["ptext", "etext"],
).style.set_table_styles(tom_df_style)

In [None]:
# WHICH OTHER LEVEL 1 ENCRYPTED MESSAGES CONTAIN "YSHEAPA" (SUSPECTED TO BE "NORFOLK")?
(
    dfe1[dfe1["ciphertext"].str.contains("YSHEAPA")]
    .sort_values(["ciphertext"])
    .style.set_table_styles(tom_df_style)
)

In [None]:
# THE CIPHERTEXT ABOVE SHOWS A "YSHEAPA: Ssi"; DOES ANY PLAIN TEXT CONTAIN "NORFOLK: The "? YEP!
(
    dft[dft["text"].str.contains("NORFOLK: The ")]
    .sort_values(["text"])
    .style.set_table_styles(tom_df_style)
)

In [None]:
# THE APOSTROPHE IN "The Cardinal's" (PUNCTUATION IS PRESERVED) SUGGESTS THESE TWO RECORDS GO TOGETHER,
# ALTHOUGH THIS TIME THE PLAIN TEXT HAS TO BE SHIFTED 27 SPACES INSTEAD OF 6..HMMM
example_encrypted_text = dfe1.loc[dfe1["ciphertext_id"] == "ID_9bf75d21c", "ciphertext"].values[0]
example_plain_text = dft.loc[dft["plaintext_id"] == "ID_9b8e655fe", "text"].values[0]
example_plain_text = "".join([" " * 27, example_plain_text])
pd.DataFrame(
    {"Text": [example_plain_text, example_encrypted_text]},
    index=["ptext", "etext"],
).style.set_table_styles(tom_df_style)

In [None]:
# THE PLAIN TEXT MESSAGES APPEAR TO BE "CENTERED" BEFORE BEING ENCRYPTED: PAD EACH ONE ON BOTH SIDES WITH
# BACKTICKS UP TO THE NEXT MULTIPLE OF 100 CHARACTERS
list_text = list(dft["text"])
list_lenx = [100 * int(math.ceil(length / 100.0)) for length in dft["text_length"]]

dft["text_adj"] = [text.center(width, "`") for text, width in zip(list_text, list_lenx)]
dft.head().style.set_table_styles(tom_df_style)

In [None]:
# SAME PAIR AS BEFORE, BUT NOW USING THE CENTERED PLAIN TEXT INSTEAD OF A HAND-PICKED SHIFT
example_encrypted_text = dfe1.loc[dfe1["ciphertext_id"] == "ID_9bf75d21c", "ciphertext"].values[0]
example_plain_text = dft.loc[dft["plaintext_id"] == "ID_9b8e655fe", "text_adj"].values[0]
pd.DataFrame(
    {"Text": [example_plain_text, example_encrypted_text]},
    index=["ptext", "etext"],
).style.set_table_styles(tom_df_style)

In [None]:
# PERHAPS ADJUSTED SENTENCE PATTERNS ARE "UNIQUE" AND CAN PROVIDE US WITH A UNIQUE "SENTENCE SIGNATURE" (SINCE WE DON'T YET HAVE A FULL CHARACTER-FOR-CHARACTER MAPPING)
# SIGNATURE = CENTERED TEXT WITH EVERY UPPERCASE LETTER REPLACED BY "X" AND EVERY LOWERCASE LETTER BY "x";
# ALL OTHER CHARACTERS (INCLUDING THE BACKTICK PADDING) ARE KEPT AS-IS.
# (THE FORMER BACKTICK -> BACKTICK REPLACEMENT CHANGED NOTHING, SO IT HAS BEEN DROPPED)
dft["text_pattern"] = (
    dft["text_adj"]
    .str.replace("[A-Z]", "X", regex=True)
    .str.replace("[a-z]", "x", regex=True)
)
dft.head().style.set_table_styles(tom_df_style)

In [None]:
# MOST ARE UNIQUE! THIS MIGHT BE PROMISING!
# N = NUMBER OF PLAIN TEXT RECORDS SHARING EACH SENTENCE PATTERN; THE HISTOGRAM SHOWS HOW OFTEN EACH N OCCURS
# NOTE(review): SEABORN HAS DEPRECATED distplot (histplot + rugplot ARE THE DOCUMENTED REPLACEMENTS) -- CONFIRM THE INSTALLED SEABORN VERSION BEFORE MIGRATING
tmp = pd.DataFrame(dft.groupby(["text_pattern"]).size(), columns=["N"])
sns.distplot(tmp["N"], kde=False, rug=True);

In [None]:
# LET'S APPLY THE SAME TRANSFORMATION TO THE LEVEL 1 ENCRYPTED TEXT:
# EVERY UPPERCASE LETTER BECOMES "X", EVERY LOWERCASE LETTER BECOMES "x", EVERYTHING ELSE IS KEPT AS-IS.
# (THE FORMER BACKTICK -> BACKTICK REPLACEMENT CHANGED NOTHING, SO IT HAS BEEN DROPPED)
dfe1["text_pattern"] = (
    dfe1["ciphertext"]
    .str.replace("[A-Z]", "X", regex=True)
    .str.replace("[a-z]", "x", regex=True)
)
dfe1.head().style.set_table_styles(tom_df_style)

It looks like we can also probably assume numbers are preserved too!

In [None]:
# ONE MATCHING EXAMPLE: CIPHERTEXT AND CENTERED PLAIN TEXT STACKED, EACH WITH ITS SENTENCE PATTERN
tmp1 = (
    dfe1.loc[dfe1["ciphertext_id"] == "ID_d649ebbb2", ["ciphertext", "text_pattern"]]
    .rename(columns={"ciphertext": "text"})
)
tmp2 = (
    dft.loc[dft["plaintext_id"] == "ID_97bea3ff9", ["text_adj", "text_pattern"]]
    .rename(columns={"text_adj": "text"})
)
pd.concat([tmp1, tmp2], ignore_index=True).style.set_table_styles(tom_df_style)

In [None]:
# HERE IS A MESSY (AND VERY SLOW!) ATTEMPT AT "AUTOMATING" THE MATCHING OF SOME ENCRYPTED PHRASES BACK TO THE "BEST GUESS" MATCHING ORIGINAL MESSAGE BASED ON SENTENCE STRUCTURE
# results MAPS ciphertext_id -> plaintext_id OF THE LONGEST PLAIN TEXT WHOSE PATTERN MATCHES ("" WHEN NOTHING MATCHES)
results = {}
dft_data = list(dft[["plaintext_id","text_length","text_pattern",]].to_records(index=False))

#eids = ["ID_6100247c5","ID_9bf75d21c","ID_fb906e3a4","ID_93aa4509f","ID_d649ebbb2","ID_4a6fc1ea9","ID_c85d54d74","ID_ac57b8817"]
#for i, dfe1_row in dfe1[dfe1["ciphertext_id"].isin(eids)].iterrows():

# GRAB SOME ARBITRARY RECORDS (LIKE JUST 30 OR SO)
for i, dfe1_row in dfe1.sample(30, random_state=123).iterrows():
    (ciphertext_id, cipher_text_pattern) = (dfe1_row.ciphertext_id, dfe1_row.text_pattern)
    print(ciphertext_id)
    print("   ETEXT:" + cipher_text_pattern)

    results[ciphertext_id] = ""
    maxlength = 0
    for d in dft_data:
        (plaintext_id, plain_text_length, plain_text_pattern) = (d[0], int(d[1]), d[2])
        boxsize = int(math.ceil(plain_text_length / 100.0)) * 100

        # A PLAIN TEXT PADDED TO A DIFFERENT WIDTH THAN THIS CIPHERTEXT CANNOT BE ITS SOURCE;
        # SKIPPING IT AVOIDS BOTH WASTED COMPARISONS AND SPURIOUS SHORT MATCHES
        if boxsize != len(cipher_text_pattern):
            continue

        # THE PLAIN TEXT SITS IN THE MIDDLE OF THE PADDED BOX; ONLY THAT WINDOW IS COMPARED
        startpos = (boxsize - plain_text_length) // 2
        window = slice(startpos, startpos + plain_text_length)
        if cipher_text_pattern[window] != plain_text_pattern[window]:
            continue

        # KEEP THE LONGEST MATCHING CANDIDATE SEEN SO FAR
        if plain_text_length > maxlength:
            results[ciphertext_id] = plaintext_id
            maxlength = plain_text_length
            print("   PTEXT:" + plain_text_pattern)

In [None]:
# DISPLAY THE ciphertext_id -> BEST-GUESS plaintext_id MAPPING BUILT ABOVE ("" MEANS NO CANDIDATE MATCHED)
results

In [None]:
# SPLIT THE MATCH DICTIONARY INTO TWO PARALLEL LISTS (CIPHERTEXT IDS AND THEIR MATCHED PLAINTEXT IDS), IN INSERTION ORDER
cids = list(results.keys())
tids = list(results.values())

In [None]:
# THUMB THROUGH THE MATCHES FOUND (CHANGE k TO PICK ANOTHER PAIR) TO GLEAN MORE INSIGHT INTO THE MAPPINGS
k = 3

cid = cids[k]
tid = tids[k]
tmp1 = dft.loc[dft["plaintext_id"] == tid, ["text_adj"]].rename(columns={"text_adj": "text"})
tmp2 = dfe1.loc[dfe1["ciphertext_id"] == cid, ["ciphertext"]].rename(columns={"ciphertext": "text"})
pd.concat([tmp1, tmp2], ignore_index=True).style.set_table_styles(tom_df_style)

## MAYBE HAVE ENOUGH TO TRY MAPPING CHARACTERS NOW

In [None]:
# FOR THE MATCHES FOUND, PAIR EACH PLAIN MESSAGE WITH ITS ENCRYPTED MESSAGE
# (INNER JOINS: CIPHERTEXTS WITHOUT A MATCHED PLAINTEXT ID DROP OUT)
df_bridge = (
    pd.DataFrame({"TID": tids, "EID": cids})
    .merge(dft, how="inner", left_on="TID", right_on="plaintext_id")
    .loc[:, ["EID", "plaintext_id", "text", "text_length"]]
    .merge(dfe1, how="inner", left_on="EID", right_on="ciphertext_id")
    .loc[:, ["plaintext_id", "text", "text_length", "ciphertext_id", "ciphertext"]]
)
df_bridge.head(10).style.set_table_styles(tom_df_style)

In [None]:
# "CLIP" EACH CIPHERTEXT DOWN TO THE CHARACTERS THAT ACTUALLY ALIGN WITH THE PLAIN TEXT |....XXXXXXX....|, AS DONE ABOVE:
# THE PLAIN TEXT IS ASSUMED TO SIT IN THE MIDDLE OF A BOX ROUNDED UP TO THE NEXT 100 CHARACTERS
list_text     = list(df_bridge["ciphertext"])
list_lenx     = list(df_bridge["text_length"])
list_boxsize  = [100 * int(math.ceil(length / 100.0)) for length in list_lenx]
list_startpos = [int((box - length) / 2) for box, length in zip(list_boxsize, list_lenx)]

df_bridge["ciphertext_adj"] = [
    text[start:start + length]
    for text, start, length in zip(list_text, list_startpos, list_lenx)
]
df_chars = df_bridge[["text", "ciphertext_adj"]]
df_chars.head(10)

In [None]:
# NOW WE CAN ATTEMPT TO MAP ALL char_in => char_out FOR PHRASES INCLUDED IN OUR RESULTS/BRIDGE DATASET
# ONE ROW PER CHARACTER POSITION: "in" IS THE PLAIN CHARACTER, "out" IS THE ENCRYPTED CHARACTER AT THE SAME POSITION
list_chars_in = []
list_chars_out = []
for plain_text, cipher_text in zip(df_chars["text"], df_chars["ciphertext_adj"]):
    # THE CLIPPED CIPHERTEXT MUST LINE UP CHARACTER-FOR-CHARACTER WITH THE PLAIN TEXT
    assert len(cipher_text) == len(plain_text), "clipped ciphertext and plain text differ in length"
    # extend() APPENDS IN PLACE; REBUILDING THE LISTS WITH "+" ON EVERY ROW WAS QUADRATIC
    list_chars_in.extend(plain_text)
    list_chars_out.extend(cipher_text)

df_char_map = pd.DataFrame({"in":list_chars_in, "out":list_chars_out})
df_char_map.head(10).style.set_table_styles(tom_df_style)

In [None]:
# LET'S DO A FULL CROSSTAB TO SAY WHAT MAPS TO WHAT AND HOW OFTEN
# ROWS = ENCRYPTED CHARACTER ("out"), COLUMNS = PLAIN CHARACTER ("in"), CELLS = NUMBER OF OCCURRENCES (0 WHEN NEVER SEEN)
# pd.crosstab COUNTS THE PAIRS DIRECTLY, REPLACING THE MANUAL groupby + pivot_table(aggfunc=np.sum) COMBINATION
df_char_cross = pd.crosstab(index=df_char_map["out"], columns=df_char_map["in"])
df_char_cross.head(10).style.set_table_styles(tom_df_style)

In [None]:
# HEATMAP OF THE FULL in/out CROSSTAB; COLOR SATURATES AT 5 OCCURRENCES AND THE "in" LABELS ARE MOVED TO THE TOP
fig, ax = plt.subplots(figsize=(20,10))
sns.heatmap(
    df_char_cross,
    vmin=0,
    vmax=5,
    cmap=sns.light_palette("red"),
    linewidths=1,
    ax=ax,
)
ax.xaxis.set_ticks_position("top")

### FROM THE ABOVE PICTURE, THERE'S CLEARLY SOME CYCLICAL PATTERN, BUT WHAT IS IT, EXACTLY??!

In [None]:
# FIRST 25 PLAIN / CLIPPED-CIPHER PAIRS, FOR EYEBALLING THE LETTER SHIFTS
(
    df_chars
    .head(25)
    .style.set_table_styles(tom_df_style)
)

Picking out a couple of the phrases and comparing the letter distances forward from each input character, we see this (imperfect) pattern:

<pre style="font-size: 8pt">| B| u| t|  | c| e| r| t| a| i| n|  | i| s| s| u| e|  | s| t| r| o| k| e| s|  | m| u| s| t|  | a| r| b| i| t| r| a| t| e| :|
| F| k| s|  | n| i| h| s| l| m| d|  | h| e| w| k| d|  | e| x| h| n| v| i| i|  | l| g| w| j|  | y| d| f| x| s| d| e| j| d| :|
|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|
| 4|16|25|  |11| 4|16|25|11| 4|16|  |25|12| 4|16|25|  |12| 4|16|25|11| 4|16|  |25|12| 4|16|  |24|12| 4|15|25|12| 4|16|25|  |


| N| O| R| F| O| L| K| :|  | T| h| e|  | c| a| r| d| i| n| a| l| '| s|  | m| a| l| i| c| e|  | a| n| d|  | h| i| s|  | p| o| t| e| n| c| y|
| Y| S| H| E| A| P| A| :|  | S| s| i|  | r| y| d| h| x| m| l| p| '| i|  | l| l| p| x| b| p|  | e| d| c|  | s| m| i|  | o| a| x| t| m| n| d|
|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|
|11| 4|16|25|12| 4|16|  |  |25|11| 4|  |15|24|12| 4|15|25|11| 4|  |16|  |25|11| 4|15|25|11|  | 4|16|25|  |11| 4|16|  |25|12| 4|15|25|11| 5|</pre>

**notes:**<br/><br/>
1) The pattern <i>tends</i> to be 4-16-25-11 but sometimes is 5(+1) instead of 4; 15(-1) instead of 16; 24(-1) instead of 25; and 12(+1) instead of 11

2) The first phrase started +4 but 2nd phrase started +11 ... how to know what to even start with?

In [None]:
# ZOOM IN ON JUST THE LOWERCASE LETTERS (ROWS 32+ AND COLUMNS 28+ OF THE CROSSTAB): PERHAPS "z" CHARACTERS ARE
# SIMPLY LEFT ALONE (LIKE PUNCTUATION, SPECIAL CHARACTERS, AND NUMBERS APPEAR TO BE)
fig, ax = plt.subplots(figsize=(20,10))
sns.heatmap(
    df_char_cross.iloc[32:,28:],
    vmin=0,
    vmax=5,
    cmap=sns.light_palette("red"),
    linewidths=1,
    ax=ax,
)
ax.xaxis.set_ticks_position("top")

## NAIVE CHARACTER TRANSLATOR USING THE CYCLICAL +4 -> +16 -> +25 -> +11 CONVERSION CYCLE AS WE MOVE LEFT-TO-RIGHT THROUGH THE LETTERS THAT ARE ENCRYPTED

In [None]:
# FIRST ROW CONTAINS THE ORIGINAL (PLAIN) LETTERS AND SUBSEQUENT ROWS REPRESENT THE SAME LETTERS AFTER THE 1st, 2nd, 3rd, and 4th SHIFT
# ONLY A-Y / a-y ARE LISTED; ANY OTHER CHARACTER (INCLUDING "z"/"Z") IS NOT TRANSLATED
translation = [
"ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy",
"EFGHIJKLMNOPQRSTUVWXYABCDefghijklmnopqrstuvwxyabcd",
"PQRSTUVWXYABCDEFGHIJKLMNOpqrstuvwxyabcdefghijklmno",
"YABCDEFGHIJKLMNOPQRSTUVWXyabcdefghijklmnopqrstuvwx",
"LMNOPQRSTUVWXYABCDEFGHIJKlmnopqrstuvwxyabcdefghijk",
]

# ONE encrypted-char -> plain-char LOOKUP PER SHIFT ROW, BUILT ONCE SO decrypt_letter DOES NOT RESCAN THE STRINGS ON EVERY CALL
_DECRYPT_TABLES = [dict(zip(shifted_row, translation[0])) for shifted_row in translation[1:]]


def decrypt_letter(input_char, s):
    """Decrypt a single character using the shift selected by sequence marker ``s``.

    The shift row is chosen cyclically from ``s``: ``s % 4`` of 1, 2, 3 and 0
    select rows 1, 2, 3 and 4 of ``translation`` respectively.

    Parameters
    ----------
    input_char : str
        The encrypted character.
    s : int
        Sequence marker (position within the 4-step shift cycle).

    Returns
    -------
    tuple
        ``(plain_char, 1)`` when ``input_char`` is one of the letters in the
        selected row, otherwise ``(input_char, 0)``. The second element is the
        amount by which the caller should advance the sequence marker, so
        characters that are not translated do not consume a shift.
    """
    # (s - 1) % 4 maps s % 4 == 1, 2, 3, 0 onto table indexes 0, 1, 2, 3.
    plain_char = _DECRYPT_TABLES[(s - 1) % 4].get(input_char)

    # Anything that is not exactly one listed letter passes through untouched.
    # (str.find would have treated "" or a multi-character string as a hit.)
    if plain_char is None:
        return (input_char, 0)

    return (plain_char, 1)

In [None]:
# SPOT CHECKS: A LETTER ON THE FIRST SHIFT, AND A SYMBOL THAT SHOULD COME BACK UNCHANGED
print(decrypt_letter("F", 1), decrypt_letter("[", 53), sep="\n")

## ANOTHER TWIST: SOME PHRASES DON'T START WITH THE FIRST (+4) TRANSLATION, SO ADD A PARAMETER TO FINE-TUNE THE STARTING POSITION OF THE CYCLE COUNTER

In [None]:
# NOW BUILD FUNCTION TO DECRYPT ENTIRE PHRASES
def decrypt_phrase(input_phrase, xstart):
    """Decrypt a whole phrase one character at a time.

    ``xstart`` is the sequence marker handed to ``decrypt_letter`` for the
    first character. After each character the marker advances by the second
    value ``decrypt_letter`` returns, so it only moves on when that function
    reports a step of 1.

    Returns the decrypted characters joined into a single string.
    """
    position = xstart
    decoded = []

    for cipher_char in input_phrase:
        plain_char, step = decrypt_letter(cipher_char, position)
        decoded.append(plain_char)
        position += step

    return "".join(decoded)

In [None]:
# EXPECTED PAIR (CYCLE STARTS AT POSITION 1):
#   ENCRYPTED: Fks nihslmd hewkd exhnvii lgwj ydfxsdejd:
#   PLAIN:     But certain issue strokes must arbitrate:
decrypt_phrase(
    input_phrase="Fks nihslmd hewkd exhnvii lgwj ydfxsdejd:",
    xstart=1,
)

In [None]:
# EXPECTED PAIR (THIS ONE NEEDS THE CYCLE TO START AT POSITION 4):
#   ENCRYPTED: YSHEAPA: Ssi rydhxmlp'i llpxbp edc smi oaxtmnd
#   PLAIN:     NORFOLK: The cardinal's malice and his potency
decrypt_phrase(
    input_phrase="YSHEAPA: Ssi rydhxmlp'i llpxbp edc smi oaxtmnd",
    xstart=4,
)

In [None]:
# BY TRIAL AND ERROR: WHEN THE FULL (PADDED) CIPHERTEXT IS SENT IN, STARTING THE CYCLE AT "2" BRINGS THINGS INTO FOCUS (IN THE MIDDLE)
dfe1["ciphertext"].head(15).apply(lambda ciphertext: decrypt_phrase(ciphertext, 2))

## AT LONG LAST! LET'S TRY AND DECRYPT THINGS

In [None]:
%%time

# HERE IS A MESSY (AND VERY SLOW!) ATTEMPT AT DECRYPTING LEVEL 1 PHRASES
full_results = []

dfe1_data = list(dfe1[["ciphertext_id","ciphertext"]].to_records(index=False))
dft_id    = list(dft["plaintext_id"])
dft_text  = list(dft["text"])
dft_index = list(dft["index"])

list_lenx     = list(dft["text_length"])
list_boxsize  = [int(math.ceil(L / 100.0)) * 100 for L in list_lenx]
list_startpos = list(map(lambda B, L: int((B - L) / 2), list_boxsize, list_lenx))

# GRAB SOME ARBITRARY RECORDS (LIKE JUST 100 OR SO)
i = 0
N = len(dfe1_data)
for row_enc in dfe1_data:

    i = i + 1
    if i % 1000 == 0:
        print(str(i) + " [" + f"{(i/N):0.2%}" + " ] records processed...")
        
    (ciphertext_id, ciphertext) = (row_enc[0], row_enc[1])
    deciphered_text = decrypt_phrase(ciphertext, 2)
   
    for j in range(0,len(dft_text)):
        (plaintext_id, plain_text, plain_text_index, startpos, plain_text_length) = (dft_id[j], dft_text[j], dft_index[j], list_startpos[j], list_lenx[j])
        if deciphered_text[startpos:startpos+plain_text_length] == plain_text:
            full_results.append([ciphertext_id, ciphertext, plain_text, plaintext_id, plain_text_index])
            break

In [None]:
# ONE ROW PER LEVEL 1 CIPHERTEXT THAT WAS MATCHED BACK TO A PLAIN TEXT
df_results = pd.DataFrame(
    data=full_results,
    columns=[
        "ciphertext_id",
        "ciphertext",
        "plain_text",
        "plaintext_id",
        "plain_text_index",
    ],
)
df_results.head().style.set_table_styles(tom_df_style)

In [None]:
# BUILD THE SUBMISSION FRAME: EVERY CIPHERTEXT ID (ALL LEVELS) WITH ITS MATCHED PLAIN TEXT INDEX;
# IDS WITHOUT A MATCH FALL BACK TO INDEX 0
df_submissions = dfe[["ciphertext_id"]].merge(
    df_results[["ciphertext_id", "plain_text_index"]],
    how="left",
    left_on="ciphertext_id",
    right_on="ciphertext_id",
)
df_submissions["plain_text_index"] = df_submissions["plain_text_index"].fillna(0).astype(int)
df_submissions = df_submissions.rename(columns={"plain_text_index": "index"})
df_submissions.head(10)

In [None]:
# EXPORT SUBMISSION FILE (WITHOUT THE DATA FRAME'S ROW INDEX; index IS DOCUMENTED AS A BOOLEAN, SO PASS False RATHER THAN None)
df_submissions.to_csv("ct3_submission.csv", index=False)