In [1]:
import nltk
import numpy as np
import os
import random
import sys

from keras.callbacks import LambdaCallback
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


# Dataset import and char to index conversion

In [2]:
data = np.loadtxt('Top24Million-WPA-probable-v2.txt', dtype='U',delimiter='\n', max_rows=5000000) # TODO: check max_rows
train, test = train_test_split(data, test_size=0.95, random_state=42)
test = set(test)  # to speed up the search
text = ' '.join(train) # long string of passwords with space in between for training
print(f'train:{len(train)}, test:{len(test)}')
print('char count', len(text))

train:249994, test:4741903
char count 2751819


In [3]:
# 
chars = sorted(list(set(text)))
print('Total Number of Unique Characters:', len(chars))
char_idx = dict((c, i) for i, c in enumerate(chars)) # Character to index

idx_char = dict((i, c) for i, c in enumerate(chars)) # Index to Character

Total Number of Unique Characters: 94


# Data Preprocessing

In [4]:
"""Data preprocessing:
    1) convert the string into seq_length long strings with (step) stride size
    2) convert the input and output to one hot encode"""

seq_length = 25 # Number of characters considered 
step = 3 # Stide of our window
sentences = []
next_chars = []

# Rading the text in terms of sequence of characters
# Extract only 'seq_length' characters every time
for i in range(0, len(text) - seq_length, step):
    sentences.append(text[i: i + seq_length])
    # The character just after the sequence is the label
    next_chars.append(text[i + seq_length]) 
print('nb sequences:', len(sentences))

print('Vectorization...')
# Initializing Tensor (training data)
x = np.zeros((len(sentences), seq_length, len(chars)), dtype=np.bool) 
# Initializing Output that holds next character (label)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool) 
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        # Populate Tensor Input
        x[i, t, char_idx[char]] = 1 
    # Populate y with the character just after the sequence
    y[i, char_idx[next_chars[i]]] = 1

nb sequences: 917265
Vectorization...


# Helper functions

In [5]:
epoch_hit_rate=[[],[]]

def on_epoch_end(epoch, _):
    """After each epoch, compute hit_rate per epoch"""
    hit_rate = pass_hit_rate(model, 5000)
    epoch_hit_rate[0].append(epoch)
    epoch_hit_rate[1].append(hit_rate)


def sample(preds, temperature=1.0):
    """Perform Temperature Sampling when picking the char from softmax list"""
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature 
    exp_preds = np.exp(preds)
    # Softmax of predictions
    preds = exp_preds / np.sum(exp_preds) 
    # Sample a single characters, with probabilities defined in `preds`
    probas = np.random.multinomial(1, preds, 1) 
    return np.argmax(probas)

   
    

    
def pass_predict(model, char_count=400):
    """password prediction using the model, repeat for char_count characters"""
    start_index = random.randint(0, len(text) - seq_length - 1)
    output_text = ''
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        sentence = text[start_index: start_index + seq_length]
        
        for i in range(char_count):
            x_pred = np.zeros((1, seq_length, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_idx[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            # Generate next character
            next_index = sample(preds, diversity) 
            next_char = idx_char[next_index]
            output_text += next_char
            # Append character to generated sequence
            sentence = sentence[1:] + next_char

    return output_text.split(' ')  # pass list
    
    
def pass_hit_rate(model, char_count):
    """Compute hit_rate based on predict password and test list"""
    output = pass_predict(model, char_count)
    output = list(filter(lambda x: len(x)>2, output))
    hit_rate = 0
    for pas in output:
        if pas in test:
            hit_rate +=1

    print('predict size', len(output))
    print(f'hit_count: {hit_rate}')
    print(f'hit_rate: {np.round(100*hit_rate/len(output), decimals=2)}%')
    return hit_rate/len(output)
    
    

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
#checkpointer = ModelCheckpoint(filepath='passweights.hdf5', verbose=1, save_best_only=True)

# Creating the LSTM model

In [None]:
print('Building model...')
hidden_size = 512
batch = 512
epochs=4
# Model: LSTM and dense layer withsoftmax to predict char
model = Sequential()
model.add(LSTM(hidden_size, input_shape=(seq_length, len(chars))))
model.add(Dense(len(chars), activation='softmax')) 

optimizer_new = RMSprop() # Optimzes learning rate and adaptive

model.compile(loss='categorical_crossentropy', optimizer=optimizer_new, metrics=['accuracy']) 

model_history = model.fit(x, y, batch_size=batch, epochs=epochs,validation_split=0.2, callbacks=[print_callback])
model.save_weights('passweights.hdf5')


Building model...
Train on 733812 samples, validate on 183453 samples
Epoch 1/20
predict size 1905
hit_count: 144
hit_rate: 7.56%
Epoch 2/20
predict size 1929
hit_count: 198
hit_rate: 10.26%
Epoch 3/20
predict size 1728
hit_count: 141
hit_rate: 8.16%
Epoch 4/20
109056/733812 [===>..........................] - ETA: 1:55 - loss: 2.2885 - accuracy: 0.3201

In [37]:
# more training
#model.load_weights("passweights.hdf5")
#model.fit(x, y, batch_size=32, epochs=4)
#model.save_weights('passweights.hdf5')

# Testing

In [6]:
pass_hit_rate(model, 5000)

# PLOT

In [None]:
filename = f'data{len(data)}_batch{batch}_epochs{epochs}_'
#print(model_history.history.keys())
# summarize history for accuracy
plt.plot(model_history.history['accuracy'])
plt.plot(model_history.history['val_accuracy'])
plt.title('model accuracy', fontsize=16)
plt.ylabel('accuracy', fontsize=14)
plt.xlabel('epoch', fontsize=14)
plt.legend(['train', 'test'], loc='upper left')
plt.savefig(filename+'model_accuracy.pdf', format='pdf')
plt.show()

# summarize history for loss
plt.plot(model_history.history['loss'])
plt.plot(model_history.history['val_loss'])
plt.title('model loss', fontsize=16)
plt.ylabel('loss', fontsize=14)
plt.xlabel('epoch', fontsize=14)
plt.legend(['train', 'test'], loc='upper left')
plt.savefig(filename+'model_loss.pdf', format='pdf')
plt.show()


# plot hit rate
plt.plot(epoch_hit_rate[0], epoch_hit_rate[1])
plt.title('model hit rate', fontsize=16)
plt.ylabel('hit_rate', fontsize=14)
plt.xlabel('epoch', fontsize=14)
plt.savefig(filename+'model_hit_rate.pdf', format='pdf')
plt.show()
