<a href="https://colab.research.google.com/github/rajashekar/CharityML/blob/master/train_password_pytorch_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torchtext==0.12.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import re
from torchtext.data.functional import numericalize_tokens_from_iterator
from torch.utils.data import DataLoader
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import train_test_split


In [3]:
%cd /content/drive/MyDrive/Colab/password/

/content/drive/MyDrive/Colab/password


In [4]:
# Read the whole password dump into memory. The context manager closes the
# file handle as soon as the read finishes, and the explicit encoding keeps the
# decode independent of the platform default.
with open('data/passwords_db.txt', encoding='utf-8') as passwords_file:
  data = passwords_file.read()

In [5]:
len(data)

178313552

In [6]:
# One password per line. NOTE(review): if the file ends with a newline this
# split leaves a trailing empty string in the list — confirm whether that
# entry should be dropped before training.
passwds = data.split("\n")

In [7]:
len(passwds)

18308617

# Build vocab

In [8]:
# Character vocabulary: every distinct character that occurs in any password,
# in sorted order (sorted() accepts the set directly).
vocab = sorted(set(''.join(passwds)))

In [9]:
len(vocab)

95

In [10]:
# Two lookup tables built from the same enumeration of the vocabulary:
# character -> index, and index -> character.
char_indices = {char: idx for idx, char in enumerate(vocab)}
indices_char = dict(enumerate(vocab))

In [11]:
indices_char[0]

' '

In [12]:
char_indices[' ']

0

In [13]:
# Free up index 0 for a padding token: the character currently stored at
# index 0 is re-registered under the next unused index, len(vocab), in both
# lookup tables ...
char_indices[indices_char[0]] = len(vocab)
indices_char[len(vocab)] = indices_char[0]

# ... and index 0 is then bound to the '<<pad>>' token.
# NOTE: both dicts are mutated in place, so this cell is not idempotent —
# running it a second time would move '<<pad>>' itself to index len(vocab)
# and overwrite the character that was placed there by the first run.
indices_char[0] = '<<pad>>'
char_indices['<<pad>>'] = 0

In [14]:
char_indices[' ']

95

In [15]:
# NOTE: despite its name this holds the longest password *string* — max() with
# key=len returns the element itself, not its length. Take len(max_len) to get
# the actual maximum length.
max_len = max(passwds, key=len)

In [16]:
print(f"Total number of passwords {len(passwds)}")
print(f"Passwords vocab size {len(char_indices)}")
print(f"Max passwords length {len(max_len)}")

Total number of passwords 18308617
Passwords vocab size 96
Max passwords length 50


In [17]:
# 70/30 split: training set vs. a remainder that is later divided into
# validation and test sets. A fixed random_state makes the shuffled split
# reproducible across kernel restarts.
passwds_train, passwds_rem = train_test_split(passwds, test_size=0.3, shuffle=True, random_state=42)


In [18]:
print(len(passwds_train), len(passwds_rem))

12816031 5492586


In [19]:
# Halve the remainder into validation and test sets. A fixed random_state
# makes the shuffled split reproducible across kernel restarts.
passwds_val, passwds_test = train_test_split(passwds_rem, test_size=0.5, shuffle=True, random_state=42)

In [20]:
print(len(passwds_val), len(passwds_test))

2746293 2746293


In [21]:
from torch.utils.data import TensorDataset, DataLoader

def batch_data(samples, vocab, seq_len, batch_size):
  """
  Build a shuffled DataLoader of fixed-length character windows.

  Every sample is split into characters and mapped to ids through `vocab`.
  Each run of `seq_len` consecutive ids becomes one input row and the id that
  immediately follows it becomes the target. Samples with `seq_len` or fewer
  characters therefore contribute no rows.

  :param samples: iterable of strings
  :param vocab: lookup table from a single character to its integer id
  :param seq_len: number of ids in every input window
  :param batch_size: batch size handed to the DataLoader
  :return: DataLoader (shuffle=True) over a TensorDataset of (inputs, targets)
  """
  # Tokenize lazily: a generator avoids holding a per-character token list for
  # every sample in memory at the same time. Note that '.' does not match
  # '\n', so newline characters are dropped.
  char_lists = (re.findall('.', sample) for sample in samples)
  ids_iter = numericalize_tokens_from_iterator(vocab, char_lists)

  inputs = []
  targets = []
  for ids in ids_iter:
    vector = list(ids)

    # A window needs one extra id after it to serve as the target, which
    # leaves len(vector) - seq_len usable start positions (none when the
    # sample is too short, because range() of a non-positive number is empty).
    for start in range(len(vector) - seq_len):
      end = start + seq_len
      inputs.append(vector[start:end])
      targets.append(vector[end])

  tensor_data = TensorDataset(torch.LongTensor(inputs), torch.LongTensor(targets))
  return DataLoader(tensor_data, batch_size=batch_size, shuffle=True)

In [22]:
def index_to_chars(input):
  """
  Decode a sequence of id scalars into a list of characters.

  Decoding stops at the first id equal to 0 (the padding index); that id and
  everything after it are left out of the result.

  :param input: iterable whose elements expose `.item()` (e.g. a 1-D tensor)
  :return: list of characters looked up in `indices_char`
  """
  ids = [element.item() for element in input]
  # Cut the sequence just before the first padding id, if there is one.
  stop = ids.index(0) if 0 in ids else len(ids)
  return [indices_char[idx] for idx in ids[:stop]]

In [23]:
def index_to_chars_bleu(input):
  """
  Decode a sequence of id scalars into tokens for BLEU scoring.

  Unlike `index_to_chars` nothing is truncated: every id yields one token, and
  the id of the space character is emitted as the placeholder '<s>'.

  :param input: iterable whose elements expose `.item()` (e.g. a 1-D tensor)
  :return: list with one token per input element
  """
  return [
    '<s>' if idx == char_indices[' '] else indices_char[idx]
    for idx in (element.item() for element in input)
  ]

In [24]:
# Mini-batch size: used when building the DataLoaders and when sizing the
# initial hidden state in the training / evaluation cells.
batch_size=32
# Number of consecutive characters in each input window; the character that
# follows the window is the prediction target.
seq_len=3

In [25]:
# One shuffled DataLoader of (seq_len-character window, next character) pairs
# per split, all encoded with the same char_indices table.
train_loader = batch_data(passwds_train, char_indices, seq_len, batch_size)
val_loader = batch_data(passwds_val, char_indices, seq_len, batch_size)
test_loader = batch_data(passwds_test, char_indices, seq_len, batch_size)

In [26]:
# Pull a single batch for the sanity checks that follow. The loader is built
# with shuffle=True, so which samples end up in this batch varies between runs.
input_test_batch, target_test_batch = next(iter(test_loader))

In [27]:
input_test_batch.shape

torch.Size([32, 3])

In [28]:
input_test_batch[0].shape

torch.Size([3])

In [29]:
target_test_batch.shape

torch.Size([32])

In [30]:
# Check for a GPU
# train_on_gpu is a notebook-wide flag: the model's init_hidden, the training
# loop and generate() all read it to decide whether tensors go to CUDA.
# The message below is only a warning; execution continues without a GPU.
train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
    print('No GPU found. Please use a GPU to train your neural network.')

# Model

In [39]:
import torch.nn as nn

class RNN(nn.Module):
    """
    Next-character model: embedding -> stacked LSTM -> linear projection.

    The forward pass scores only the final time step, so it produces one row
    of `output_size` logits per input sequence.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5):
        """
        Initialize the PyTorch RNN Module
        :param vocab_size: The number of input dimensions of the neural network (the size of the vocabulary)
        :param output_size: The number of output dimensions of the neural network
        :param embedding_dim: The size of the embedding vectors
        :param hidden_dim: The size of the hidden layer outputs
        :param n_layers: The number of stacked LSTM layers
        :param dropout: dropout to add in between LSTM layers
        """
        super().__init__()
        # set class variables
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # define model layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=dropout, batch_first=True)

        # Projection from the LSTM output to per-class scores. No dropout is
        # applied between the LSTM and this layer.
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, nn_input, hidden):
        """
        Forward propagation of the neural network
        :param nn_input: The input to the neural network, ids of shape (batch, seq_len)
        :param hidden: The hidden state, a (h, c) tuple as returned by init_hidden
        :return: Two Tensors, the scores for the last time step with shape
                 (batch, output_size) and the latest hidden state
        """
        embeds = self.embedding(nn_input)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Only the last time step is returned, so only that step is projected
        # (batch_first=True puts time on dimension 1).
        out = self.fc(lstm_out[:, -1])
        return out, hidden

    def init_hidden(self, batch_size):
        '''
        Initialize the hidden state of the LSTM
        :param batch_size: The batch_size of the hidden state
        :return: tuple (h, c) of zero tensors, each of dims (n_layers, batch_size, hidden_dim)
        '''
        # new_zeros() creates the state with the dtype and device of the model's
        # own parameters, so it always matches wherever the model currently
        # lives instead of relying on a notebook-level GPU flag.
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [40]:
# Hyperparameters
learning_rate = 0.001
# indices_char holds the character vocabulary plus the '<<pad>>' entry
vocab_size = len(indices_char)
# the model predicts one id out of the same vocabulary it reads
output_size = vocab_size
embedding_dim = 256
hidden_dim = 256
# number of RNN layers
n_layers = 3

rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)

# move the model to the GPU when one was detected
if train_on_gpu:
    rnn.cuda()

# defining loss and optimization functions for training
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [41]:
rnn

RNN(
  (embedding): Embedding(96, 256)
  (lstm): LSTM(256, 256, num_layers=3, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=256, out_features=96, bias=True)
)

In [97]:
# Smoke test: one forward pass on a single test batch with a fresh hidden state.
hidden = rnn.init_hidden(batch_size)
hidden = tuple([each.data for each in hidden])

# Move the batch to CUDA only when a GPU is in use; an unconditional .cuda()
# raises on CPU-only machines.
y_pred, hidden = rnn(input_test_batch.cuda() if train_on_gpu else input_test_batch, hidden)

In [98]:
y_pred.shape

torch.Size([32, 96])

In [99]:
# Decode the target ids and the arg-max predictions of the batch to characters.
# NOTE: index_to_chars stops at the first id equal to 0, so either list can be
# shorter than the batch (and the two can differ in length) if a 0 occurs.
actual = index_to_chars(target_test_batch)
predictions = index_to_chars(torch.argmax(y_pred, dim=1))

In [100]:
actual

['e',
 'e',
 'g',
 't',
 '3',
 'm',
 'a',
 '0',
 'v',
 '6',
 'Z',
 'R',
 '0',
 's',
 'L',
 '-',
 '7',
 'm',
 'a',
 'e',
 'i',
 '1',
 '5',
 't',
 'r',
 'b',
 'n',
 '7',
 'c',
 'v',
 'g',
 '4']

In [101]:
predictions

['=',
 '=',
 'G',
 '=',
 '=',
 '=',
 '=',
 '=',
 '=',
 '=',
 'G',
 'G',
 '=',
 'G',
 'G',
 'G',
 '=',
 '=',
 '=',
 '6',
 'G',
 '=',
 '=',
 '=',
 '=',
 '=',
 '=',
 '=',
 '=',
 '=',
 '=',
 '=']

# BLEU Score

In [47]:
from torchtext.data.metrics import bleu_score

# Sanity check 1: candidate and reference tokens are all equal, which gives
# the maximum score (1.0 in the recorded output).
candidate_corpus = ['a', 'b', 'c']
references_corpus = ['a', 'b', 'c']
bleu_score(candidate_corpus, references_corpus, max_n=1, weights=[0.25])

1.0

In [127]:
# Sanity check 2: candidate and reference share no token, which gives the
# minimum score (0.0 in the recorded output).
candidate_corpus = ['a', 'b', 'c']
references_corpus = ['d', 'e', 'd']
bleu_score(candidate_corpus, references_corpus, max_n=1, weights=[0.25])

0.0

In [129]:
# Sanity check 3: two of the three tokens match.
# NOTE: the recorded score (~0.9036) equals (2/3) ** 0.25 — with max_n=1 and
# weights=[0.25] the result is the unigram precision raised to the power 0.25,
# not the precision itself, so scores are inflated towards 1.
candidate_corpus = ['a', 'b', 'c']
references_corpus = ['a', 'b', 'd']
bleu_score(candidate_corpus, references_corpus, max_n=1, weights=[0.25])

0.9036020040512085

In [141]:
# Sanity check 4: a token standing in for the space character.
# An earlier note here reported that a space makes bleu_score fail with
#   IndexError: index 1 is out of bounds for dimension 0 with size 1
# With the '<s>' placeholder used below the call returns a score instead.
# NOTE(review): the recorded value (~0.795) equals (2/5) ** 0.25, which
# suggests '<s>' is counted as three one-character tokens — confirm.
candidate_corpus = ['a', 'b', '<s>']
references_corpus = ['a', 'b', 'd']
bleu_score(candidate_corpus, references_corpus, max_n=1, weights=[0.25])

0.7952707409858704

In [130]:
# bleu score on first batch of test dataset
# NOTE(review): the ground truth is passed in the candidate position and the
# model output in the reference position, the reverse of the demo cells above —
# confirm that this ordering is intended.

bleu_score(actual, predictions, max_n=1, weights=[0.25])

0.0

In [45]:
def calculate_bleu_score(rnn, loader):
  """
  Average the per-batch BLEU score of `rnn` over the full batches of `loader`.

  For every batch a fresh zero hidden state is used, the arg-max prediction is
  decoded with `index_to_chars_bleu`, and the batch is scored against the
  decoded targets with `bleu_score(..., max_n=1, weights=[0.25])`. A trailing
  batch smaller than the loader's batch size is skipped.

  The model is switched to eval mode and left in that mode.

  :param rnn: model exposing init_hidden(batch_size) and forward(input, hidden)
  :param loader: DataLoader yielding (inputs, targets) batches
  :return: mean of the per-batch scores
  """
  rnn.eval()
  scores = []

  # Run on whatever device the model lives on instead of assuming CUDA.
  device = next(rnn.parameters()).device
  # Loop invariant: number of completely full batches.
  n_batches = len(loader.dataset) // loader.batch_size

  # Evaluation only: no autograd graph is needed.
  with torch.no_grad():
    for batch_i, (inp_batch, target_batch) in enumerate(loader, 1):
      # make sure you iterate over completely full batches, only
      if batch_i > n_batches:
        break

      hidden = rnn.init_hidden(inp_batch.size(0))
      y_pred, _ = rnn(inp_batch.to(device), hidden)

      actual = index_to_chars_bleu(target_batch)
      # Decode on the CPU so the per-element .item() calls avoid device syncs.
      predictions = index_to_chars_bleu(torch.argmax(y_pred, dim=1).cpu())

      scores.append(bleu_score(actual, predictions, max_n=1, weights=[0.25]))

  return np.average(scores)

In [151]:
# average bleu score on complete test dataset
calculate_bleu_score(rnn, test_loader)

0.005894420136069319

# Training

In [42]:
batch_losses = []
n_epochs = 5
clip_norm = 5

# put model in train mode
rnn.train()

# Only completely full batches are used; the count does not change between
# iterations, so it is computed once up front.
n_batches = len(train_loader.dataset)//batch_size

for epoch_i in range(1, n_epochs + 1):
  hidden = rnn.init_hidden(batch_size)
  hidden = tuple([each.data for each in hidden])
  # NOTE(review): the hidden state is carried from one batch to the next even
  # though the loader shuffles independent windows, while evaluation and
  # generation start from a fresh zero state on every call — confirm intended.
  for batch_i, (input_batch, target_batch) in enumerate(train_loader, 1):
    if batch_i > n_batches:
      break

    # Bind inp/target on both paths; previously they were only assigned when a
    # GPU was present, which raised NameError on CPU-only runs.
    if train_on_gpu:
      inp, target = input_batch.cuda(), target_batch.cuda()
    else:
      inp, target = input_batch, target_batch

    rnn.zero_grad()

    output, hidden = rnn(inp, hidden)
    loss = criterion(output, target)
    loss.backward()
    # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
    nn.utils.clip_grad_norm_(rnn.parameters(), clip_norm)
    # Cut the graph so the next batch does not backpropagate through this one.
    hidden = tuple([each.detach() for each in hidden])

    optimizer.step()

    # batch_losses is never reset, so the values printed below are the running
    # mean over every batch seen since training started (across epochs).
    batch_losses.append(loss.item())
    if batch_i % 100000 == 0:
      print({ 'batch': batch_i, 'loss': np.average(batch_losses) })

  print("============================================")
  print({ 'epoch': epoch_i, 'loss': np.average(batch_losses) })
  print("============================================")

{'batch': 100000, 'loss': 2.874912832775116}
{'batch': 200000, 'loss': 2.849412460744381}
{'batch': 300000, 'loss': 2.8363398210398354}
{'batch': 400000, 'loss': 2.828697928137183}
{'batch': 500000, 'loss': 2.8237013783507345}
{'batch': 600000, 'loss': 2.820523172910611}
{'batch': 700000, 'loss': 2.818390861775875}
{'batch': 800000, 'loss': 2.8169036285007}
{'batch': 900000, 'loss': 2.815867653551499}
{'batch': 1000000, 'loss': 2.815198055323601}
{'batch': 1100000, 'loss': 2.81477873716311}
{'batch': 1200000, 'loss': 2.8144712843035657}
{'batch': 1300000, 'loss': 2.815755772085465}
{'batch': 1400000, 'loss': 2.815711309582421}
{'batch': 1500000, 'loss': 2.8159097863376936}
{'batch': 1600000, 'loss': 2.8160758887846025}
{'batch': 1700000, 'loss': 2.8164108416267704}
{'batch': 1800000, 'loss': 2.8166396071907545}
{'batch': 1900000, 'loss': 2.8170534612492513}
{'batch': 2000000, 'loss': 2.817533056913197}
{'batch': 2100000, 'loss': 2.818049958797523}
{'batch': 2200000, 'loss': 2.818530801

KeyboardInterrupt: ignored

In [43]:
# Persist the trained model to the current working directory. The module
# object itself is passed, not rnn.state_dict().
# NOTE(review): per the PyTorch serialization notes, saving the whole module
# pickles it and ties the file to this class definition — consider saving the
# state_dict instead.
torch.save(rnn, 'model.pt')

In [48]:
calculate_bleu_score(rnn, test_loader)

0.6569461258165805

In [64]:
def generate(rnn, prime_id, seq_len, pad_value, predict_len, top_k=5):
  """
  Sample a string from the model one character at a time.

  The model is fed a rolling window of `seq_len` ids that starts out as padding
  followed by the prime id. At every step a fresh zero hidden state is used,
  the next id is drawn at random from the `top_k` most probable ids (weighted
  by their probabilities), and the window is shifted left to append it.

  The model is switched to eval mode and left in that mode.

  :param rnn: model exposing init_hidden(batch_size) and forward(input, hidden)
  :param prime_id: id of the first character of the generated string
  :param seq_len: length of the input window the model expects
  :param pad_value: id used to fill the window positions before the prime id
  :param predict_len: number of characters to sample after the prime character
  :param top_k: how many of the most probable ids to sample from (clamped to
                the number of output classes)
  :return: the prime character followed by `predict_len` sampled characters
  """
  rnn.eval()
  # Run on whatever device the model lives on instead of assuming CUDA.
  device = next(rnn.parameters()).device

  # Rolling input window kept as a NumPy array: padding, then the prime id.
  current_seq = np.full((1, seq_len), pad_value)
  current_seq[-1][-1] = prime_id
  predicted = [indices_char[prime_id]]

  # Inference only: no autograd graph is needed.
  with torch.no_grad():
    for _ in range(predict_len):
      window = torch.tensor(current_seq, dtype=torch.long, device=device)

      # initialize the hidden state
      hidden = rnn.init_hidden(window.size(0))

      # get the output of the rnn
      output, _ = rnn(window, hidden)

      # get the next char probabilities
      p = F.softmax(output, dim=1).cpu()

      # use top_k sampling to get the index of the next char
      k = min(top_k, p.size(1))
      p, top_i = p.topk(k)
      # reshape(-1) keeps both arrays 1-D even when k == 1
      top_i = top_i.numpy().reshape(-1)
      p = p.numpy().reshape(-1).astype(np.float64)

      # select the likely next char index with some element of randomness
      char_i = int(np.random.choice(top_i, p=p/p.sum()))

      # retrieve that char from the dictionary
      predicted.append(indices_char[char_i])

      # the generated char becomes the last element of the next window
      current_seq = np.roll(current_seq, -1, 1)
      current_seq[-1][-1] = char_i

  return ''.join(predicted)

In [66]:
generate(rnn, char_indices['P'], seq_len, char_indices['<<pad>>'], 5)

'PAN143'

In [71]:
given_pass = 'pass'

attempts = 1

# Nothing about the generate() call changes between attempts, so the first
# character is looked up once before the loop.
start_char = given_pass[0]

# Keep sampling strings of the same length that start with the same character
# until one equals the target. NOTE: there is no upper bound on the number of
# attempts; a target the model never produces keeps this loop running forever.
while True:
  predicted_pass = generate(rnn, char_indices[start_char], seq_len, char_indices['<<pad>>'], len(given_pass) - 1)
  print(predicted_pass)
  if predicted_pass == given_pass:
    # f-string: without the prefix the placeholders were printed literally
    print(f"Took {attempts} attempts to find password {given_pass}")
    break
  attempts += 1

patr
pers
peni
peed
pang
pelo
pand
prad
pee1
prop
pite
pand
popo
pooh
pong
patr
pack
piem
ping
pete
pelo
perm
prin
pate
peng
pand
pass
Took {attempts} attempts to find password {given_pass}
