<a href="https://colab.research.google.com/github/rajashekar/colab/blob/main/pass_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%cd /content/drive/MyDrive/Colab/password/

/content/drive/MyDrive/Colab/password


In [2]:
import os
import time

import tensorflow as tf

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu


In [3]:
data = open('data/rockyou_clean.txt').read()

In [4]:
len(data)

139504373

In [5]:
passwds = data.split("\n")

In [6]:
len(passwds)

14328850

In [7]:
passwds[100]

'alexandra'

# Vectorize the text

In [8]:
vocab = sorted(list(set(''.join(passwds))))

In [9]:
len(vocab)

95

In [10]:
char_indices = dict((c, i) for i, c in enumerate(vocab))
indices_char = dict((i, c) for i, c in enumerate(vocab))

In [11]:
max_len = max(passwds, key=len)

In [12]:
len(max_len)

50

In [13]:
print(f"Total number of passwords {len(passwds)}")
print(f"Passwords vocab size {len(vocab)}")
print(f"Max passwords length {len(max_len)}")

Total number of passwords 14328850
Passwords vocab size 95
Max passwords length 50


In [14]:
input_text = [p[:-1] for p in passwds]
target_text = [p[1:] for p in passwds]

In [15]:
print(f"{passwds[0]} {input_text[0]} {target_text[0]}")

123456 12345 23456


In [16]:
# Creating like below will blow memory

# input_data = np.zeros( (len(passwds), len(max_len), len(vocab)) ,dtype='float32')
# target_data = np.zeros( (len(passwds), len(max_len), len(vocab)) ,dtype='float32')

# nearly took 17 GB of RAM memory for 1 million records
# dataset has 14 million records
# one_m_records = np.zeros( (1000000, 50, 95) ,dtype='float32')

# clean memory
# del one_m_records


In [17]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', char_level=True, lower=False)
tokenizer.fit_on_texts(passwds)

In [18]:
input_tensor = tokenizer.texts_to_sequences(input_text)
input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, padding='post')

In [19]:
input_tensor.shape

(14328850, 49)

In [20]:
target_tensor = tokenizer.texts_to_sequences(target_text)
target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, padding='post')

In [21]:
target_tensor.shape

(14328850, 49)

In [22]:
[''.join(i.split()) for i in tokenizer.sequences_to_texts(input_tensor[:5])]

['12345', '1234', '12345678', 'passwor', 'iloveyo']

In [23]:
[''.join(i.split()) for i in tokenizer.sequences_to_texts(target_tensor[:5])]

['23456', '2345', '23456789', 'assword', 'loveyou']

In [24]:
len(tokenizer.word_index)

95

# Split data into Train and Validation

In [25]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
BUFFER_SIZE = 10000

input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

train_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

val_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val))
val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)

In [26]:
# For performance
AUTOTUNE = tf.data.AUTOTUNE

train_dataset = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)
val_dataset = val_dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [27]:
example_input_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 49]), TensorShape([64, 49]))

In [28]:
vocab_size = len(tokenizer.word_index) + 1

max_length_input = input_tensor.shape[1]
max_length_output = target_tensor.shape[1]

embedding_dim = 256
rnn_units = 1024

print(f'Vocab size {vocab_size}')
print(f"Max input length {max_length_input}")
print(f"Max input length {max_length_output}")

Vocab size 96
Max input length 49
Max input length 49


# Create Model

In [29]:
class MyModel(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__(self)
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [30]:
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [31]:
# try model without training
for input_example_batch, target_example_batch in train_dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 49, 96) # (batch_size, sequence_length, vocab_size)


In [32]:
model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  24576     
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  98400     
                                                                 
Total params: 4,061,280
Trainable params: 4,061,280
Non-trainable params: 0
_________________________________________________________________


In [33]:
input_example_batch

<tf.Tensor: shape=(64, 49), dtype=int32, numpy=
array([[11, 15,  7, ...,  0,  0,  0],
       [11,  2, 26, ...,  0,  0,  0],
       [13, 19, 16, ...,  0,  0,  0],
       ...,
       [21, 22, 18, ...,  0,  0,  0],
       [21,  1,  9, ...,  0,  0,  0],
       [ 2, 22, 33, ...,  0,  0,  0]], dtype=int32)>

In [34]:
tf.random.categorical(example_batch_predictions[0], num_samples=1)

<tf.Tensor: shape=(49, 1), dtype=int64, numpy=
array([[53],
       [15],
       [20],
       [55],
       [70],
       [19],
       [28],
       [84],
       [23],
       [29],
       [ 9],
       [89],
       [52],
       [48],
       [18],
       [11],
       [42],
       [61],
       [17],
       [70],
       [94],
       [76],
       [23],
       [14],
       [35],
       [40],
       [35],
       [ 9],
       [91],
       [ 0],
       [19],
       [17],
       [46],
       [72],
       [59],
       [30],
       [46],
       [82],
       [51],
       [84],
       [47],
       [47],
       [ 8],
       [88],
       [91],
       [80],
       [ 7],
       [13],
       [60]])>

In [35]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [36]:
[tf.squeeze(tf.random.categorical(pred, num_samples=1), axis=-1).numpy() for pred in example_batch_predictions] 

[array([15,  1, 85,  5, 68, 55, 44, 48, 93, 54, 58, 38, 27, 18, 24, 79, 56,
        89, 23, 53, 57, 60, 14, 74, 57, 21, 40, 94, 36, 69, 49, 38, 67, 80,
        51, 90, 54, 23,  4, 21, 41, 46, 50, 49, 84, 15, 95, 45, 29]),
 array([27,  4, 82, 69, 14,  4, 77, 66, 16, 94, 13, 20, 24, 66, 93, 31,  9,
        69, 44, 84, 22, 12, 25, 81, 43, 63, 45, 28, 58, 51, 94, 10, 90,  3,
        89, 16, 73, 66, 78, 36, 40, 78,  2, 67, 10, 41, 64,  0, 29]),
 array([81, 76, 95, 51, 77,  3, 48, 38, 89, 84,  6, 81, 80, 50, 14, 46, 95,
        55, 19, 29, 26, 40, 47, 51, 86, 54, 79, 77, 24, 20,  2, 17, 55, 25,
        74, 95, 19,  1, 17, 68, 29, 83, 18, 81, 65, 78, 81, 33, 39]),
 array([66, 16, 32, 66, 58, 88, 53, 80, 70, 75, 27, 53, 64, 32, 41, 41, 22,
        18, 35,  2, 20, 59,  8, 80, 36, 52, 84, 55, 68, 40, 82, 89, 37, 13,
        25, 25, 79, 85,  2, 81, 29, 26, 75, 66, 67,  9, 70,  1, 95]),
 array([ 7, 29, 54,  2, 72, 55, 82,  3, 81, 84,  7,  9, 13,  9, 47, 21, 13,
        26, 24, 48, 58, 48, 29, 56, 

In [37]:
tokenizer.sequences_to_texts([tf.squeeze(tf.random.categorical(pred, num_samples=1), axis=-1).numpy() for pred in example_batch_predictions] )

['} n m 1 L U ] y 7 - 5 r @ H 4 Q < k ; H z ! O ^ = F . ] [ p T y i P v ! * ! U 9 H z j n k { 0 P >',
 'v P [ 9 a m v P G c d U t * H < - T } p ) y # d ~ c # n k * = " 6 S 0 4 W r z N b \' e t   - c i ^',
 'k - ( Z i # \' f } 6 I 6 F [ O 3 q   X , " H J n g g H     ; Q 8 \\ y q ; + [ > { a \\ W } L L M ` P',
 "g 5 L b k o c S r } o x ' { ] k X , ( | ( ! ? 8 T h L R v y v = j K e Z V ; T _ # + 5 , U F E R t",
 'O { $ * L M & # e ! F | ? 7 " [ b _ 1 J L Y f 6 + 0 N ( ( X h O J X f H 6 N U _ l I [ ) A _ @   U',
 '. \\ C a A 1 ` \' U P H Q r 2 g ? [ K . | > # U # H   W ) y e h P T : [ # * & < T \\ X ! i \' c % " y',
 ') d B [ R H . b I X } X & H r T i B s i 6 d X c m W x ` @   w c M ~ P c 8 j 7 > S V x % p ; @ n t',
 "k < h x ' c - l n k F m : e e [ { 8 e u f 4 6 E i ` F W 5 0 ; - B v } l N m D < h # x K r R O t 8",
 '[ } O F B I e I % Y ^ ( r { : w 3 q P B ~ 0 y D = 1 \\ 3 d p T & @ E E L 4 1 M E y \\ { = _ D ~ M @',
 'e u D w a ~ d W k \\ * w ~ = 1 L A p $ C E z | 6 e S K h t b l a   G C

In [38]:
sampled_indices

array([33, 27, 46, 48, 39,  9, 89, 81, 39,  2, 48, 52, 20, 73, 49, 38, 30,
        2, 69, 20, 93,  3, 62, 57,  9, 20, 39,  6, 13, 86, 46, 42, 81,  7,
        2, 63, 26, 35, 65, 65, 25, 39, 89, 64, 29, 53, 75, 21, 10])

In [39]:
input_example_batch[0].numpy()

array([11, 15,  7, 30,  1, 13, 16,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int32)

In [40]:
print(f"Input: {[''.join(i.split()) for i in tokenizer.sequences_to_texts([input_example_batch[0].numpy()]) ]}")
print(f"next char prediction : {[''.join(i.split()) for i in tokenizer.sequences_to_texts([sampled_indices]) ]}")

Input: ['stoja34']
next char prediction : ["wkTDLr^'LeDH7\\_Ije/7{1@-r7L23~TN'oebAVVuL^FpY&cl"]


# Train Model

In [41]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [42]:
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (64, 49, 96)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.562839, shape=(), dtype=float32)


In [43]:
tf.exp(example_batch_mean_loss).numpy()

95.855225

In [44]:
input_example_batch.numpy()

array([[11, 15,  7, ...,  0,  0,  0],
       [11,  2, 26, ...,  0,  0,  0],
       [13, 19, 16, ...,  0,  0,  0],
       ...,
       [21, 22, 18, ...,  0,  0,  0],
       [21,  1,  9, ...,  0,  0,  0],
       [ 2, 22, 33, ...,  0,  0,  0]], dtype=int32)

In [45]:
tokenizer.sequences_to_texts(input_example_batch.numpy())

['s t o j a 3 4',
 's e b b y t w',
 '3 6 4 5 7',
 '0 8 5 7 9 3 2 7 0',
 'r e b e c a 1 1',
 'k o r o m o t o 2',
 'm a y e s k i e',
 '6 6 o l 7 5 i v',
 'd i m e s t o r',
 's y a z a t t',
 'c l o 2 1 2',
 'i n t e n s e',
 'h a s a n s a p u t r',
 '0 6 7 2 1 8',
 's c h a e r r e',
 'e d i v e',
 'M O N E L L',
 '7 7 1 3 5 5',
 '1 l a m o n t',
 's t a l l 8',
 '8 6 0 9 2 2 3 5',
 '4 6 2 9 1 3 4',
 'r h o i 9',
 '0 9 2 8 7 6 6 2 4 2',
 '7 4 6 4 7 7',
 'k a t e r a t 1 9 8',
 's h i e l a 5 2 5',
 't o t y t o r o t',
 'e l l l',
 's i n g e r z',
 '1 8 0 3 8 5 7',
 'a l i s s i m',
 'r o r i e s m i t',
 'l a l o c r i p 5',
 'd i n g o b a l l',
 'e j l 1 0 0',
 's h u s h i n a l a',
 '2 n e r o c o d',
 'k e i s h a',
 'c o z m o 9',
 '@ 2 n 3 @ 1 f 1 i 1 l 9 i 9 p',
 '3 9 1 0 4',
 'e l o n e',
 'p i o s i t',
 'd o n a t o',
 'b j s j r s j r s 2',
 'b d a y 5 8 9 1 7',
 'c h a r m _ b r a t',
 's a r a h m i c h e l l',
 '1 6 0 1 1 9 9',
 '4 5 4 4 1 6 2',
 'n u t t a n o n v 

In [46]:
def bleu_score(y_true, y_pred):
  true_seq = tokenizer.sequences_to_texts(y_true.numpy())
  reference = [true_seq]
  preds = [tf.squeeze(tf.random.categorical(pred, num_samples=1), axis=-1).numpy() for pred in y_pred]
  candidate = tokenizer.sequences_to_texts(preds)
  #print(reference)
  #print(candidate)
  return sentence_bleu(reference, candidate)

In [47]:
bleu_score(input_example_batch, example_batch_predictions)

0

In [48]:
model.compile(optimizer='adam', loss=loss, metrics=bleu_score,run_eagerly=True)

# Configure Checkpoints

In [49]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

earlystopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor = 'val_bleu_score',
    mode = 'max',
    verbose = 1,
    patience = 3,
    restore_best_weights = True
)

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    monitor = 'val_bleu_score',
    mode = 'max',
    save_best_only=True,
    verbose = 1,
    save_weights_only=True)

In [None]:
EPOCHS = 15

history = model.fit(train_dataset, 
                    validation_data=val_dataset, 
                    epochs=EPOCHS, 
                    callbacks=[checkpoint_callback, earlystopping_cb])

Epoch 1/15
  3178/179110 [..............................] - ETA: 4:10:33 - loss: 0.4605 - bleu_score: 1.1125e-04

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
  return py_builtins.overload_of(f)(*args)


Epoch 1: val_bleu_score improved from -inf to 0.00027, saving model to ./training_checkpoints/ckpt_1
Epoch 2/15
Epoch 2: val_bleu_score did not improve from 0.00027
Epoch 3/15
Epoch 3: val_bleu_score did not improve from 0.00027
Epoch 4/15
 34809/179110 [====>.........................] - ETA: 3:16:24 - loss: 0.3984 - bleu_score: 1.8283e-04

# Inference

In [None]:
class OneStep(tf.keras.Model):
  def __init__(self, model, tokenizer, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.tokenizer = tokenizer

  #@tf.function
  def generate_one_step(self, input_chars, states=None):
    #import pdb; pdb.set_trace()
    # Convert strings to token IDs.
    # input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.tokenizer.texts_to_sequences(input_chars)
    input_ids = tf.keras.preprocessing.sequence.pad_sequences(input_ids, padding='post')

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.tokenizer.sequences_to_texts([predicted_ids.numpy()])

    # Return the characters and model state.
    return predicted_chars, states

In [None]:
one_step_model = OneStep(model, tokenizer)

In [None]:
start = time.time()
states = None
next_char = ['w']
result = [next_char]

for n in range(2):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

In [None]:
reference = [['pass']]
candidate = ['dass']
score = sentence_bleu(reference, candidate)
print(score)

# Attempts calculation

In [None]:
def find_attempts(given_pwd):
  pass_len = len(given_pwd)
  attempts = 1
  # iterate through each starting letter. 
  for l in tokenizer.word_index.keys():
    next_char = [l]
    result = [next_char]

    for n in range(pass_len-1):
      next_char, states = one_step_model.generate_one_step(next_char, states=states)
      result.append(next_char)
    
    result = tf.strings.join(result)
    pred = result[0].numpy().decode('utf-8')
    if pred == given_pwd:
      print(f"Model predicted password in {attempts}")
    else:
      attempts += 1

In [None]:
find_attempts('war')