# The Autoencoder Trained on GU Passphrases

Read the dataset

In [1]:
import pandas as pd
import numpy as np

from google.colab import drive
drive.mount('/content/drive')

path = "/content/drive/MyDrive/gtown-passwords/autoencoder/"

passwords = []

with open("/content/drive/MyDrive/gtown-passwords/baseline/examples/baseline_complex.txt", encoding='utf-8') as f:
    readlines = f.readlines()
    for line in readlines:
        passwords.append(line.strip())

for i in range(10):
    print(passwords[i])

Scrubbed.Frigidly.Viral.Cushy.Overlap
Extinct.Reach.Tactless.Chafe.Flashbulb
Certified@Surrender@Senior@Amperage@Oops
Shrug!Seminar!Enroll!Hankering!Detract
Diminish!Sedate!Pulsate!Steersman!Caretaker
pulverize!unbolted!comrade!humorist!humvee
Banked@Payment@Velcro@Spoon@Wimp
bonnet@improper@many@variable@prelaunch
Frosted@Paycheck@Spree@Runny@Debunk
poker!rippling!irregular!nuclear!unexpired


One-hot Character Encoding

In [3]:
from keras.utils import to_categorical
from keras.preprocessing import sequence

# Identify max password length in dataset and pad rest of the passwords such that all of them have the same length.
# Haveing same length sequences is a requirement for LSTM
PAD_CHAR = "~"
PASS_LENGTH = max([len(p) for p in passwords])

padded_passwords = []
charset = set(PAD_CHAR)               # start with the initial padding char
for p in passwords:
  padded_passwords.append(p.ljust(PASS_LENGTH, PAD_CHAR))
  charset |= set(p)                   # |= is the union set operation.

# Convert characters to integers 
vocab_size = len(charset)
char2id = dict((c, i) for i, c in enumerate(charset))

# One hot encode the passwords
encoded_passwords = [[char2id[c] for c in password] for password in padded_passwords]
one_hot_encoded = np.array([to_categorical(p, num_classes=vocab_size) for p in encoded_passwords])

print(PASS_LENGTH)
print(vocab_size)
print(np.shape(encoded_passwords))

49
56
(1000, 49)


Variational Autoencoders

In [4]:
from keras import objectives
from keras import backend as K
from keras.layers import LSTM, Dense, RepeatVector, TimeDistributed, Input, Lambda, Layer, Bidirectional
from keras.models import Model
import tensorflow as tf

from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()


def create_lstm_vae(timesteps, layer_sizes, vocab_size, epsilon_std=1.,
                    batch_size=10):
  """
  """
  def sampling(args):
    z_mean, z_log_sigma = args
    epsilon = K.random_normal(shape=(layer_sizes[-1],),
                              mean=0., stddev=epsilon_std)
    return z_mean + K.exp(.5 * z_log_sigma) * epsilon
  
  # Create encoder model
  enc_input = Input(batch_shape=(batch_size, timesteps, vocab_size))
  x = enc_input
  for idx, layer_size in enumerate(layer_sizes):
    ret_seq = (idx != len(layer_sizes) - 1) # False for the last layer_size
    x = Bidirectional(LSTM(layer_size, return_sequences=ret_seq))(x)
  enc_output = Dense(layer_sizes[-1], activation="relu")(x)
  z_mean = Dense(layer_sizes[-1])(enc_output)
  z_log_sigma = Dense(layer_sizes[-1])(enc_output)
  z = Lambda(sampling, output_shape=(layer_sizes[-1],))([z_mean, z_log_sigma])
  encoder = Model(enc_input, z_mean, name="Encoder")

  # Create decoder model
  bottleneck_size = layer_sizes[-1]
  dec_input = Input((bottleneck_size,))
  layer = RepeatVector(timesteps)
  x = layer(z)
  _x = layer(dec_input)
  for layer_size in layer_sizes[::-1][1:]:
    layer = Bidirectional(LSTM(layer_size, return_sequences=True))
    x = layer(x)
    _x = layer(_x)
  layer =  TimeDistributed(Dense(vocab_size, activation="softmax"))
  dec_output = layer(x)
  _dec_output = layer(_x)
  decoder = Model(dec_input, _dec_output, name="Decoder")

  # connected_decoder = decoder(z_mean)

  # Create autoencoder model
  autoencoder = Model(enc_input, dec_output, name="Autoencoder")
  # autoencoder = Model(enc_input, connected_decoder, name="Autoencoder")

  # Variational autoencoder custom loss categorical entropy loss + KL loss
  def vae_loss(x, x_decoded_mean):
    xent_loss = objectives.categorical_crossentropy(x, x_decoded_mean)
    kl_loss = - 0.5 * K.mean(1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma), axis=-1)
    xent_loss = K.sum(xent_loss, axis=-1)
    return xent_loss + kl_loss

  autoencoder.compile(loss=vae_loss, optimizer="adam", metrics=['categorical_accuracy'], experimental_run_tf_function=False)
  return encoder, decoder, autoencoder

In [5]:
variational_encoder, variational_decoder, variational_autoencoder = create_lstm_vae(PASS_LENGTH, [16, 10, 6], vocab_size)
variational_encoder.summary()
variational_decoder.summary()

Model: "Encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(10, 49, 56)]            0         
_________________________________________________________________
bidirectional (Bidirectional (10, 49, 32)              9344      
_________________________________________________________________
bidirectional_1 (Bidirection (10, 49, 20)              3440      
_________________________________________________________________
bidirectional_2 (Bidirection (10, 12)                  1296      
_________________________________________________________________
dense (Dense)                (10, 6)                   78        
_________________________________________________________________
dense_1 (Dense)              (10, 6)                   42        
Total params: 14,200
Trainable params: 14,200
Non-trainable params: 0
_______________________________________________________

In [None]:
variational_autoencoder.fit(one_hot_encoded, one_hot_encoded, epochs=300, batch_size=10)

Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
 100/1000 [==>...........................] - ETA: 21s - loss: 124.1184 - categorical_accuracy: 0.2935

In [None]:
# Recosntruct passwords through autoencoder as vectors
reconst_passwd_vecs = variational_autoencoder.predict(one_hot_encoded, batch_size=10)
# Reverse one hot encoding to covnert passwords to strings
unpad = lambda text: text.replace(PAD_CHAR, "")
one_hot_decode = lambda one_hot_vectors: "".join([list(charset)[np.argmax(vec)] for vec in one_hot_vectors])
reconst_passwd_str = [unpad(one_hot_decode(p)) for p in reconst_passwd_vecs]



In [None]:
# Compare original vs reconstructed passwords
passwords_df = pd.DataFrame(zip(passwords["FullPassword"], reconst_passwd_str),
                            columns = ['Original Password', 'Recosntructed Password'])
passwords_df.head(10)

Unnamed: 0,Original Password,Recosntructed Password
0,Acafe2019!,Acafe2019!
1,Ahyper2019!,Ahyper2019!
2,Pleet10!,Aleet10!
3,Ababe2!,Ababe1!
4,Tbingo1234*,Abingo1234!
5,Eleet2019@,Aleet2019!
6,Tninja123!,Aninja1234
7,Aninja777*,Aninja123*
8,Ababe101*,Ababe111!
9,Tbingo2019_,Abingo2019!


Generate New Passwords and Calculate the Entropy

In [None]:
!pip install BiEntropy

from bientropy import bien, tbien
import random

sum_entropy = 0
max_entropy = float('-inf')
min_entropy = float('inf')

mu, sigma = 0, 3
new_passwords = []
entropy = []
i = 0
while i < 1000:
  latent_sample = np.array([np.random.normal(mu, sigma, 6)])
  new_password_vec = variational_decoder.predict(latent_sample)
  new_password_str = unpad(one_hot_decode(new_password_vec[0]))

  pswd_bytes = bytes(new_password_str, 'utf-8')
  e = tbien(pswd_bytes)
  
  # Threshold
  if e > 0.5:
    entropy.append(e)
    sum_entropy += e
    # update the max and the min entropy
    max_entropy = max(max_entropy, e)
    min_entropy = min(min_entropy, e)
    
    new_passwords.append(new_password_str)
    i += 1
    
avg_entropy = sum_entropy / len(new_passwords)

print("MAX entropy: " + str(max_entropy))
print("MIN entropy: " + str(min_entropy))
print("AVG entropy: " + str(avg_entropy))

new_passwords_entropy_df = pd.DataFrame(new_passwords, columns=["Password"])

# Save them into a CSV file
new_passwords_df.to_csv(path + 'data/output/gupass_vae_sample_pass.csv', sep=',')

new_passwords_df.head(10)



Unnamed: 0,Password
0,Ababe2!
1,Tleet2!
2,AAyceemannn27
3,Aleet0019!
4,Ahabe2!
5,Abingg201!
6,Abingo1219!
7,Ahackerm1n2019!
8,Abigo1
9,Abingo111!
