In [1]:
import os
import numpy as np
import tensorflow as tf 
import tensorflow.keras as keras
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dropout, Flatten, Dense, Conv1D, MaxPooling1D, GlobalMaxPooling1D, LeakyReLU, Input, Reshape, Conv1DTranspose, Lambda, Embedding

In [2]:
#this line only in colab
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


# Loading and preprocessing the dataset

In [3]:
folder = os.getcwd()+'/gdrive/MyDrive/'

In [None]:
os.listdir(folder)

['Google Fotos',
 'IMG_20170516_094213.jpg',
 'FB_IMG_1547999529671.jpg',
 'berlin',
 'IMG_0904.JPG',
 'IMG_1824.JPG',
 'Unbenannte Tabelle.gsheet',
 'FOTO_01.jpg',
 'SebasDocs',
 'WhatsApp Chat mit +49 1573 8880850.txt',
 'FOTO_02.jpg',
 'FOTO_03.jpg',
 'FOTO_04.jpg',
 'FB_IMG_1561892726387.jpg',
 '20200605_192245.jpg',
 '20200605_192140.jpg',
 '20200605_110251.jpg',
 '20200605_192130.jpg',
 '20200605_193403.jpg',
 'Video von .',
 'Colab Notebooks',
 '.ipynb_checkpoints',
 'haarcascade_frontalface_default.xml',
 'Bewerbungsfoto.jpg',
 'cats_and_dogs.zip',
 'face_recognition.ipynb',
 'cats_and_dogs',
 '__MACOSX',
 'Noisy_Documents',
 'denoiser.h5',
 'att_faces',
 'sample_faces',
 'doge.png',
 'landscape.jpeg',
 'landscapeDream.png',
 'landscape2.jpeg',
 'landscapeDream2.png',
 'zulpi.jpg',
 'landscape3.jpeg',
 'landscapeDream3.png',
 'landscapeDream3_2.png',
 'landscape4.jpeg',
 'landscapeDream4.png',
 '1610.00291.pdf',
 'celeb_a.zip',
 '2007.08128.pdf',
 'textos',
 'corpus',
 'text_ge

In [4]:
file = 'yahoo.txt'

In [5]:
text = open(folder+file, errors='ignore', encoding='utf-8').read().lower()
text = text.split() #this makes sure to pick up the words
text = ' '.join(text) #this takes all the letters without the separators

In [6]:
text[:100]

'@fl!pm0de@ pass steveol chotzi lb2512 scotch passwerd flipmode flipmode alden2 salmon tagoogle giant'

## Encoding the characters:

In [7]:
 #number of distinct characters
chars = sorted(list(set(text)))
vocab_size = len(chars)
print('Number of unique characters: ', vocab_size)

Number of unique characters:  73


In [8]:
char_indices = {char:chars.index(char) for char in chars}

In [9]:
char_array = np.array(chars)
char_array

array([' ', '!', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.',
       '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';',
       '=', '?', '@', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd',
       'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
       'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~',
       '¦', '±', '´', 'ç', 'ü', 'č', 'ł', 'ń'], dtype='<U1')

In [10]:
chars_encoded = np.array([char_indices[ch] for ch in char_array], dtype='int32')
chars_encoded

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72], dtype=int32)

In [11]:
indices_char = {idx: chars[idx] for idx in chars_encoded}

In [None]:
#encoding of the whole text:
text_encoded = np.array([char_indices[ch] for ch in text], dtype='int32')

In [12]:
import re
seqs = ' '.join(re.split('(\d+)',text))
seqs2 = seqs.split()
seq_len = len(seqs2)

In [13]:
# Pair to byte encoding:
word_list = seqs2
word_counter = {}
for word in word_list:
  if word in word_counter:
    word_counter[word] += 1
  else:
    word_counter[word] = 1
popular_words = sorted(word_counter, key = word_counter.get, reverse = True)
top_30k = popular_words[:30000]
top_30k[:10]

['1', '2', '123', '3', '4', '7', '0', '12', '5', '8']

In [14]:
#encoding the characters of all the passwords:
sequences = []
for word in top_30k:#text.split():
    sequences.append([char_indices[ch] for ch in word])

In [15]:
# pad sequences:
padded = sequence.pad_sequences(sequences)

In [16]:
max_len = len(padded[0])

In [17]:
len(sequences)

30000

In [18]:
#finally the encoded passwords are given as an array of integers:
pass_encoded = np.array(padded, dtype='int32')
print(pass_encoded[:10])

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 15]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 16]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 15 16 17]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 17]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 18]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 21]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 14]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 15 16]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 19]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 22]]


## Definition of the training variables:

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
#variables for training:
X = pass_encoded/73.

In [22]:
y = np.where(X>.5, 1.0, 0.0).astype('float32')

In [26]:
y[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.], dtype=float32)

In [24]:
X_train, X_test = train_test_split(y, test_size=0.2, shuffle=True, random_state=42)

In [25]:
print(X_train.shape, X_test.shape)

(24000, 20) (6000, 20)


In [46]:
print(X_train[0])


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0.]


In [None]:
from gensim.models import Word2Vec

In [None]:
Word2Vec()

##Model Training


In [28]:
def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(tf.shape(z_log_var))
    return z_mean + epsilon*K.exp(z_log_var/2)

In [29]:
latent_size = 2
encoder_inputs = Input(shape=(X.shape[1]), name='Input')
#x = Embedding(30000, 32)(encoder_inputs)#, input_length=max_len
#x = Flatten()(x)
x = Dense(512, activation='relu')(encoder_inputs)
#x = Dense(64, activation='selu')(x)
z_mean = Dense(latent_size, name='Mean')(x)
z_log_var = Dense(latent_size, name='Variance')(x)
z = Lambda(sampling, name='z')([z_mean, z_log_var])
encoder = keras.Model(inputs=encoder_inputs, outputs=z, name='Encoder')
encoder.summary()

Model: "Encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input (InputLayer)              [(None, 20)]         0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 512)          10752       Input[0][0]                      
__________________________________________________________________________________________________
Mean (Dense)                    (None, 2)            1026        dense[0][0]                      
__________________________________________________________________________________________________
Variance (Dense)                (None, 2)            1026        dense[0][0]                      
____________________________________________________________________________________________

In [30]:
decoder_inputs = Input(shape=(latent_size,), name='Input')
x = Dense(512, activation='relu')(decoder_inputs)
#x = Dense(128, activation='selu')(x)
x = Dense(20, name='Output')(x)
decoder = keras.Model(inputs=decoder_inputs, outputs=x, name='Decoder')
decoder.summary()

Model: "Decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (InputLayer)           [(None, 2)]               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               1536      
_________________________________________________________________
Output (Dense)               (None, 20)                10260     
Total params: 11,796
Trainable params: 11,796
Non-trainable params: 0
_________________________________________________________________


In [None]:
latent_size = 2
encoder_inputs = Input(shape=(X.shape[1]))
x = Embedding(30000, 32, input_length=max_len)(encoder_inputs)
x = keras.layers.LSTM(128)(x)
#x = Conv1D(64, 3, activation='relu', padding='causal',dilation_rate=1)(x)
#x = Conv1D(64, 3, activation='relu', padding='causal',dilation_rate=2)(x)
#x = Conv1D(64, 3, activation='relu', padding='causal',dilation_rate=4)(x)
#x = Conv1D(64, 3, activation='relu', padding='causal',dilation_rate=8)(x)
#x = Conv1D(64, 1)(x)
#x = Flatten()(x)
z_mean = Dense(latent_size)(x)
z_log_var = Dense(latent_size)(x)
z = Lambda(sampling)([z_mean, z_log_var])
encoder = keras.Model(inputs=encoder_inputs, outputs=z)
encoder.summary()

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 20, 32)       960000      input_8[0][0]                    
__________________________________________________________________________________________________
lstm_5 (LSTM)                   (None, 128)          82432       embedding_3[0][0]                
__________________________________________________________________________________________________
dense_15 (Dense)                (None, 2)            258         lstm_5[0][0]                     
____________________________________________________________________________________________

In [None]:
decoder_inputs = Input(shape=(latent_size,))
x = Dense(20*32)(decoder_inputs)
x = LeakyReLU()(x)
x = Reshape((20,32))(x)
x = keras.layers.LSTM(128, return_sequences=True)(x)
#x = Conv1DTranspose(filters=32, kernel_size=3, padding='same')(x)
#x = LeakyReLU()(x)
#x = Conv1DTranspose(filters=64, kernel_size=3, padding='same')(x)
#x = LeakyReLU()(x)
#x = Flatten()(x)
x = keras.layers.TimeDistributed(Dense(20, activation='sigmoid'))(x) 
decoder = keras.Model(inputs=decoder_inputs, outputs=x)
decoder.summary()

Model: "model_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_14 (InputLayer)        [(None, 2)]               0         
_________________________________________________________________
dense_27 (Dense)             (None, 640)               1920      
_________________________________________________________________
leaky_re_lu_9 (LeakyReLU)    (None, 640)               0         
_________________________________________________________________
reshape_8 (Reshape)          (None, 20, 32)            0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 20, 128)           82432     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 20, 20)            2580      
Total params: 86,932
Trainable params: 86,932
Non-trainable params: 0
______________________________________________________

In [31]:
codings=encoder(encoder_inputs)

In [32]:
reconstructions=decoder(codings)

In [33]:
vae=keras.Model(inputs=encoder_inputs, outputs=reconstructions)

In [34]:
vae.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (InputLayer)           [(None, 20)]              0         
_________________________________________________________________
Encoder (Functional)         (None, 2)                 12804     
_________________________________________________________________
Decoder (Functional)         (None, 20)                11796     
Total params: 24,600
Trainable params: 24,600
Non-trainable params: 0
_________________________________________________________________


In [53]:
print(vae(X_train)[0])

tf.Tensor(
[-55.586864  -54.60278   -55.478195  -55.411804  -52.87996   -50.747192
 -52.207176  -49.151337  -37.783752  -41.64034   -49.69031   -51.994648
 -35.117146   -8.28207     0.1315522   0.7327528   1.1695995   1.5304991
   1.622308    2.2404156], shape=(20,), dtype=float32)


In [54]:
vae_loss = -0.5*K.sum(1 + z_log_var - K.exp(z_log_var) - K.square(z_mean), axis=-1)

In [56]:
vae.add_loss(K.mean(vae_loss)/20.)

In [57]:
opt = keras.optimizers.Adam(learning_rate=1e-4)

In [58]:
vae.compile(loss=keras.losses.BinaryCrossentropy(from_logits=True), optimizer=opt)

In [88]:
history = vae.fit(X_train, X_train, epochs=30, initial_epoch=25, shuffle=True, batch_size=64)

Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [107]:
example = tf.random.normal(shape=(100,latent_size))

In [146]:
result = tf.math.sigmoid(decoder(example))*73

In [147]:
result_encoded=result.numpy().astype('int32')

In [148]:
#encoding the characters of all the passwords:
passwords = []
for word in result_encoded:
    passwords.append([indices_char[idx] for idx in word])

In [149]:
passworten = [''.join(code).strip() for code in passwords]

In [150]:
print(result_encoded)

[[ 0  0  0  0  0  0  0  1  5 16 45 60 65 68 70 69 66 62 67 70]
 [ 0  0  0  0  0  0  0  0  1  4 19 44 57 63 68 67 65 62 66 69]
 [ 0  0  0  0  0  0  0  0  0  1  8 28 48 58 64 65 64 63 65 68]
 [ 0  0  0  0  0  0  0  0  0  0  3 18 39 51 60 62 63 63 65 68]
 [ 0  0  0  0  0  0  0  0  0  0  1 10 30 44 56 60 62 63 65 67]
 [ 0  0  0  0  0  0  0  0  0  0  0  6 22 37 51 57 61 63 64 66]
 [ 0  0  0  0  0  0  0  0  0  0  0  3 16 30 45 53 60 63 64 66]
 [ 0  0  0  0  0  0  0  0  0  0  0  2 11 23 39 50 59 63 64 65]
 [ 0  0  0  0  0  0  0  0  0  0  0  1  7 18 33 46 58 63 64 64]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  5 13 28 42 57 63 63 64]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  3  9 22 37 55 63 63 63]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  1  6 18 33 53 63 63 62]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  1  4 14 28 50 61 61 60]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  2 10 22 43 57 56 55]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  1  6 14 32 48 48 47]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  3  8 19 35

In [153]:
print(' '.join(passworten))

!&2kz¦çčü±|´č !%5jw}ç´¦|±ü !*@nx~¦~}¦ç $4eqz|}}¦ç !,\jvz|}¦´ (8cqw{}~± $2\ksz}~± #-9epy}~¦ !)4_lx}~~ &/@hw}}~ $+8cu}}} !(4_s}}| !%0@p{{z #,8iwvu !(0^nnm $*5abb !%,799 !%+.. !$%% !!


In [None]:
vae.save('1d_vae.h5')



In [114]:
print(np.round((tf.math.sigmoid(vae(X_train)[:10])).numpy()))

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.]]


In [115]:
print(X_train[:10])

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.]]


In [117]:
import tensorflow_probability as tfp

In [174]:
norm = tfp.distributions.Normal(0, 1)
grid_x = norm.quantile(np.linspace(0.05, 0.95, 10)).numpy()
grid_y = norm.quantile(np.linspace(0.05, 0.95, 10)).numpy()

z_lat = np.array([ [xi,yi] for xi,yi in zip(grid_x,grid_y)])

In [175]:
print(z_lat)

[[-1.6448536  -1.6448536 ]
 [-1.0364333  -1.0364333 ]
 [-0.6744898  -0.6744898 ]
 [-0.38532048 -0.38532048]
 [-0.12566137 -0.12566137]
 [ 0.12566137  0.12566137]
 [ 0.3853204   0.3853204 ]
 [ 0.6744898   0.6744898 ]
 [ 1.0364333   1.0364333 ]
 [ 1.6448536   1.6448536 ]]


In [179]:
print((tf.math.sigmoid(decoder(z_lat))*73).numpy().astype('int32'))

[[ 0  0  0  0  0  0  0  1  5 16 45 60 65 68 70 69 66 62 67 70]
 [ 0  0  0  0  0  0  0  0  0  1  7 27 47 57 64 64 64 63 65 68]
 [ 0  0  0  0  0  0  0  0  0  0  1  9 28 43 55 59 62 63 64 67]
 [ 0  0  0  0  0  0  0  0  0  0  0  3 14 28 43 52 60 63 64 65]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  6 15 31 44 58 63 64 64]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  2  7 20 35 54 63 63 62]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  3 12 24 46 59 58 57]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  4  9 22 38 39 38]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  4 10 13 13]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  1]]


In [166]:
norm = tfp.distributions.Normal(0, 1)
  
grid_x = norm.quantile(np.linspace(0.05, 0.95, 20))
grid_y = norm.quantile(np.linspace(0.05, 0.95, 20))

for i, yi in enumerate(grid_x):
  for j, xi in enumerate(grid_y):
    z_lat = np.array([[xi, yi]])
    x_decoded = decoder(z_lat)
      #digit = tf.reshape(x_decoded[0], (digit_size, digit_size))
      #image[i * digit_size: (i + 1) * digit_size,
            #j * digit_size: (j + 1) * digit_size] = digit.numpy()


In [173]:
np.round(tf.math.sigmoid(x_decoded[0])*73)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 1., 1.], dtype=float32)