### 1. Dataset Creation
a. Caesar Cipher (with keys 1 through 5)

b. Playfair Cipher (with keywords CRYPTOFUN and SHERLOCK)

c. Polyalphabetic Substitution Cipher (with keys DECEPTIVE and EXCITING)

In [2]:
!pip install pycipher
!pip install requests

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycipher
  Downloading pycipher-0.5.2.zip (45 kB)
[K     |████████████████████████████████| 45 kB 1.9 MB/s 
[?25hBuilding wheels for collected packages: pycipher
  Building wheel for pycipher (setup.py) ... [?25l[?25hdone
  Created wheel for pycipher: filename=pycipher-0.5.2-py3-none-any.whl size=30459 sha256=e13fc42917b1f8a5f99e7dfa52cfe1773751993999dd7b6495a1f0f3b99ff1dd
  Stored in directory: /root/.cache/pip/wheels/95/4a/2a/ea10423cf27ef52a0bd1c91d351a936131b3c686195e54a310
Successfully built pycipher
Installing collected packages: pycipher
Successfully installed pycipher-0.5.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from pycipher import Caesar
from pycipher import Playfair
from pycipher.vigenere import Vigenere
import pandas as pd
import random 
import re
from google.colab import files


#### Generate a list of 9-letter words from *.txt files

In [4]:
uploaded = files.upload()

Saving book.txt to book.txt


In [27]:
# read words from txt files
lines = open("book.txt").read().strip()
words = lines.split()
# print(len(words))

# filter a word has 9 letters
raw_words = []
for i in words:
  i = re.sub(r"[^a-zA-Z]", "", i)
  if (len(i) == 9):
    raw_words.append(i)
# print(len(raw_words))

# # filter 3 consecutive words has length of 9
# for i in range(2, len(words)):
#   tmp = ''.join([words[i-2],words[i-1],words[i]])
#   # Remove Special Characters from a String Using re.sub()
#   tmp = re.sub(r"[^a-zA-Z]", "", tmp)
#   if (len(tmp) == 9):
#       raw_words.append(tmp)
# # print(len(raw_words))

# remove duplicate words 
raw_words = list(set(raw_words))
# lowercase words
raw_words = [x.lower() for x in raw_words]
print("Number of words has 9 letters:")
print(len(raw_words))

Number of words has 9 letters:
679


In [28]:
raw_words[:10]

['appearing',
 'royalties',
 'impatient',
 'newsgroup',
 'classical',
 'character',
 'equipment',
 'addresses',
 'assistant',
 'drwxrwsrx']

### Create 3 lists including plain_text, cipher_text, and cipher_class

In [29]:
plain_text = []
cipher_text = []
cipher_class = []


# Create Caesar examples with keys from 1 to 5
for key in range(1, 6):
  for text in raw_words:
    plain_text.append(text)
    cipher_text.append(Caesar(key).encipher(text))
    cipher_class.append(0)

print(len(plain_text))
print(len(cipher_text))
print(len(cipher_class))
print(plain_text[:5])
print(cipher_text[:5])
print(cipher_class[0:5])

3395
3395
3395
['appearing', 'royalties', 'impatient', 'newsgroup', 'classical']
['BQQFBSJOH', 'SPZBMUJFT', 'JNQBUJFOU', 'OFXTHSPVQ', 'DMBTTJDBM']
[0, 0, 0, 0, 0]


In [30]:
# Playfair Cipher (with keywords CRYPTOFUN and SHERLOCK)
keys = ["cryptofunabdeghiklmqsvwxz", "sherlockabdfgimnpqtuvwxyz"]
for key in keys:
  print(key)
  for text in raw_words:
    plain_text.append(text)
    cipher_text.append(Playfair(key).encipher(text))
    cipher_class.append(1)
print(len(plain_text))
print(len(cipher_text))
print(len(cipher_class))
print(cipher_text[3790:3795])
print(cipher_class[3790:3795])

cryptofunabdeghiklmqsvwxz
sherlockabdfgimnpqtuvwxyz
4753
4753
4753
['GWBYAYKSGW', 'GWYMFCMOMP', 'HAAHRIDYVZ', 'UGSYNIDYVZ', 'OBOXSCYHGV']
[1, 1, 1, 1, 1]


In [31]:
# Polyalphabetic Substitution (Vigenere) Cipher (with keys DECEPTIVE and EXCITING)
keys = ["deceptive", "exciting"]
for key in keys:
  print(key)
  for text in raw_words:
    plain_text.append(text)
    cipher_text.append(Vigenere(key).encipher(text))
    cipher_class.append(2)
print(len(plain_text))
print(len(cipher_text))
print(len(cipher_class))
print(cipher_text[5306:5311])
print(cipher_class[5306:5311])

deceptive
exciting
6111
6111
6111
['LRVIGYIXI', 'SVCVXXVZX', 'VLCVTPIMI', 'UIIYATZGC', 'EYNPTMQIW']
[2, 2, 2, 2, 2]


In [32]:
# dictionary of lists 
dict = {'plain_text': plain_text, 'cipher_text': cipher_text, 'cipher_class': cipher_class} 
    
df = pd.DataFrame(dict)

In [33]:
df

Unnamed: 0,plain_text,cipher_text,cipher_class
0,appearing,BQQFBSJOH,0
1,royalties,SPZBMUJFT,0
2,impatient,JNQBUJFOU,0
3,newsgroup,OFXTHSPVQ,0
4,classical,DMBTTJDBM,0
...,...,...,...
6106,wildcards,AFNLVIEJW,2
6107,greenwich,KOGMGEVIL,2
6108,tradition,XOCLBBVUR,2
6109,librarian,PFDZTZVGR,2


In [34]:
# save as input.csv file
df = df.dropna()
# df.to_csv("input.csv", index=False, header= None)
df.to_csv("input.csv", index=False)