### 1. Dataset Creation
a. Caesar Cipher (with keys 1 through 5)

b. Playfair Cipher (with keywords CRYPTOFUN and SHERLOCK)

c. Polyalphabetic Substitution Cipher (with keys DECEPTIVE and EXCITING)

In [None]:
!pip install pycipher
!pip install requests

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycipher
  Downloading pycipher-0.5.2.zip (45 kB)
[K     |████████████████████████████████| 45 kB 1.3 MB/s 
[?25hBuilding wheels for collected packages: pycipher
  Building wheel for pycipher (setup.py) ... [?25l[?25hdone
  Created wheel for pycipher: filename=pycipher-0.5.2-py3-none-any.whl size=30459 sha256=fb0ddaf04837bbbb97cc8ed522f63f5f6f899d73aba4e401c0ca73b36f95efc6
  Stored in directory: /root/.cache/pip/wheels/95/4a/2a/ea10423cf27ef52a0bd1c91d351a936131b3c686195e54a310
Successfully built pycipher
Installing collected packages: pycipher
Successfully installed pycipher-0.5.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from pycipher import Caesar
from pycipher import Playfair
from pycipher.vigenere import Vigenere
import pandas as pd
import random 
import re
from google.colab import files


#### Generate a list of 9-letter words from *.txt files
The code can collect either single 9-letter words or runs of consecutive words whose combined length is 9 letters. Here we use single 9-letter words only.

In [None]:
# Open the Colab file-upload prompt; the next cell expects the uploaded file to be named book.txt.
uploaded = files.upload()

Saving book.txt to book.txt


In [None]:
# Read the uploaded book and split it into whitespace-separated tokens.
# The context manager closes the file handle as soon as the text has been read.
with open("book.txt") as book_file:
  words = book_file.read().split()
# print(len(words))

# Strip every non-letter character from each token and keep the tokens
# that are exactly 9 letters long afterwards.
raw_words = []
for token in words:
  letters = re.sub(r"[^a-zA-Z]", "", token)
  if len(letters) == 9:
    raw_words.append(letters)
# print(len(raw_words))

# Optional alternative (disabled): also accept 3 consecutive tokens whose
# letters add up to a length of 9.
# for i in range(2, len(words)):
#   tmp = ''.join([words[i-2],words[i-1],words[i]])
#   # Remove Special Characters from a String Using re.sub()
#   tmp = re.sub(r"[^a-zA-Z]", "", tmp)
#   if (len(tmp) == 9):
#       raw_words.append(tmp)
# # print(len(raw_words))

# Lowercase BEFORE de-duplicating so that e.g. "Seriously" and "seriously"
# collapse into a single entry instead of surviving as two identical words.
# Sorting makes the word order (and therefore the dataset row order)
# reproducible; plain set iteration order changes between kernel restarts.
raw_words = sorted({word.lower() for word in raw_words})
print("Number of words has 9 letters:")
print(len(raw_words))

Number of words has 9 letters:
679


In [None]:
# Preview the first ten extracted words.
raw_words[:10]

['seriously',
 'instantly',
 'succeeded',
 'canadians',
 'automated',
 'cupertino',
 'providers',
 'connected',
 'education',
 'identical']

### Create 3 lists including plain_text, cipher_text, and cipher_class

In [None]:
# Start the three parallel dataset columns from scratch.
plain_text, cipher_text, cipher_class = [], [], []

# Caesar examples (class label 0): each word is enciphered once per shift 1..5.
for key in range(1, 6):
  caesar = Caesar(key)
  plain_text.extend(raw_words)
  cipher_text.extend(caesar.encipher(word) for word in raw_words)
  cipher_class.extend([0] * len(raw_words))

# Sanity check: the three lists must have the same length; then peek at the first rows.
for column in (plain_text, cipher_text, cipher_class):
  print(len(column))
for column in (plain_text, cipher_text, cipher_class):
  print(column[:5])

3395
3395
3395
['seriously', 'instantly', 'succeeded', 'canadians', 'automated']
['TFSJPVTMZ', 'JOTUBOUMZ', 'TVDDFFEFE', 'DBOBEJBOT', 'BVUPNBUFE']
[0, 0, 0, 0, 0]


In [None]:
# Playfair examples (class label 1), keywords CRYPTOFUN and SHERLOCK.
# Each key is a complete 25-letter key square: the keyword followed by the
# remaining letters of the alphabet in order, with "j" left out.
# This cell appends to the lists built by the Caesar cell, so re-running it
# on its own adds the Playfair rows a second time.
# NOTE(review): in the recorded output these ciphertexts are 10 characters long
# while the plaintexts have 9 letters — presumably padding; confirm this is intended.
keys = ["cryptofunabdeghiklmqsvwxz", "sherlockabdfgimnpqtuvwxyz"]
# keys = ["cryptofunabdeghiklmqsvwxz", "sherlockabdfgimnpqtuvwxyz","CRYPTOISFUNABDEGHKLMQVWXZ","CRYPTOUGHABDEFIKLMNQSVWXZ","REALWOKBCDFGHIMNPQSTUVXYZ"]
for key in keys:
  print(key)
  playfair = Playfair(key)
  plain_text.extend(raw_words)
  cipher_text.extend(playfair.encipher(word) for word in raw_words)
  cipher_class.extend([1] * len(raw_words))

# Sanity check: list lengths, then a sample taken from the Playfair portion.
for column in (plain_text, cipher_text, cipher_class):
  print(len(column))
for column in (cipher_text, cipher_class):
  print(column[3790:3795])

cryptofunabdeghiklmqsvwxz
sherlockabdfgimnpqtuvwxyz
4753
4753
4753
['YDHNFKMOMP', 'WLRCCFOMPS', 'YDSBMOEGPV', 'LGHNOKWLVZ', 'EGMKWDYDGV']
[1, 1, 1, 1, 1]


In [None]:
# Polyalphabetic substitution (Vigenere) examples (class label 2),
# keys DECEPTIVE and EXCITING.
# This cell appends to the lists built by the earlier cipher cells, so
# re-running it on its own adds the Vigenere rows a second time.
keys = ["deceptive", "exciting"]
# keys = ["deceptive", "exciting", "dedication", "resilience", "realmadrid"]
for key in keys:
  print(key)
  vigenere = Vigenere(key)
  plain_text.extend(raw_words)
  cipher_text.extend(vigenere.encipher(word) for word in raw_words)
  cipher_class.extend([2] * len(raw_words))

# Sanity check: list lengths, then a sample taken from the Vigenere portion.
for column in (plain_text, cipher_text, cipher_class):
  print(len(column))
for column in (cipher_text, cipher_class):
  print(column[5306:5311])

deceptive
exciting
6111
6111
6111
['QIVMSBWOW', 'LRVIGYIXI', 'LRUYAMQIK', 'GINIIBWIW', 'IXRZDTOJZ']
[2, 2, 2, 2, 2]


### Create a pandas table from the 3 lists above

In [None]:
# Assemble the three parallel lists into one table, one column per list.
# The mapping is deliberately NOT named `dict`: that would shadow the builtin
# for the rest of the kernel session and break any later dict(...) call.
dataset_columns = {'Plaintext': plain_text, 'Ciphertext': cipher_text, 'Cipher Class': cipher_class}

df = pd.DataFrame(dataset_columns)

In [None]:
# In the recorded run df has shape (6111, 3): 679 words x 9 keys (5 Caesar + 2 Playfair + 2 Vigenere).
df

Unnamed: 0,Plaintext,Ciphertext,Cipher Class
0,seriously,TFSJPVTMZ,0
1,instantly,JOTUBOUMZ,0
2,succeeded,TVDDFFEFE,0
3,canadians,DBOBEJBOT,0
4,automated,BVUPNBUFE,0
...,...,...,...
6106,killfiles,OFNTYQYKW,2
6107,protected,TOQBXKGKH,2
6108,attention,EQVMGBVUR,2
6109,directory,HFTMVBBXC,2


In [None]:
# Save the dataset as input.csv, without the index column and without a header row.
# dropna() discards any row that contains a missing value before writing.
df = df.dropna()
# header=False is the documented way to omit the header row
# (to_csv documents `header` as bool or list of str, not None).
df.to_csv("input.csv", index=False, header=False)
# df.to_csv("input1.csv", index=False)
# df.to_csv("balance_input.csv", index=False)