In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/rockyouwith-similarity-model-info/lengthCounts.pickle
/kaggle/input/rockyouwith-similarity-model-info/negative_passwords.dic
/kaggle/input/rockyouwith-similarity-model-info/rockyou-test.txt
/kaggle/input/rockyouwith-similarity-model-info/stringCounts.pickle
/kaggle/input/rockyouwith-similarity-model-info/rockyou-train.txt


In [2]:
from __future__ import print_function
from tqdm import tqdm
import sys
import torch, torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import train_test_split
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib.patheffects as path_effects
import mmap
import pickle as pic
from time import *
from functools import reduce
import random
import time
import logging
from timeit import default_timer as timer
%matplotlib inline

In [3]:
directory = '/kaggle/input/rockyouwith-similarity-model-info/'

# **Similarity Model-Trie**

In [None]:
class TrieNode:
    # Trie node class
    def __init__(self):
        self.children = {}
        self.count = 0

class Trie:      
    # Trie data structure class
    def __init__(self):
        self.root = TrieNode()
        self.stringCounts = {}
        self.lengthCounts = {}
  
    def insert(self,key):
        cur = self.root
        length = len(key)
        for idx,c in enumerate(key):
            if idx+1 not in self.lengthCounts:
                self.lengthCounts[idx+1] = 0
            if c not in cur.children:
                cur.children[c] = TrieNode()
            self.lengthCounts[idx+1] += 1
            cur.count += 1
            cur = cur.children[c]
        cur.count += 1
    
    def recurseStringCounts(self,cur,string):
        if string!='':
            self.stringCounts[string] = cur.count
        if cur.children:
            for key in cur.children:
                self.recurseStringCounts(cur.children[key], string+chr(key))

    def constructStringCounts(self):
        cur = self.root
        string = ''
        self.stringCounts = {}
        self.recurseStringCounts(cur,string)
        
    def similarity(self, key, keyLength ):  
        # Gets similarity score of a string
        score = 0
        string = ''
        for idx, c in enumerate(key):
            string += key[idx]
            if string not in self.stringCounts:
                break
            length = len(string)
            lengthRatio = length/keyLength
            stringLengthCountRatio = self.stringCounts[string]/self.lengthCounts[length]
            score = max(score, stringLengthCountRatio*lengthRatio+self.similarity(key[idx+1:],keyLength))
        return score
similarityModel = Trie()

In [None]:
def get_num_lines(file_path):
    fp = open(file_path, "r+")
    buf = mmap.mmap(fp.fileno(), 0)
    lines = 0
    while buf.readline():
        lines += 1
    return lines
    fp.close()

In [None]:
if os.path.exists(directory+"lengthCounts.pickle"):
    similarityModel.lengthCounts = pic.load(open(directory+"lengthCounts.pickle", "rb" ))
else:
    file = open(directory+'rockyou-train.txt','rb')
    for line in tqdm(file, total=get_num_lines(directory+'rockyou-train.txt')):
        password = line
        for start in range(len(password)):
            for end in range(start+1,len(password)):
                similarityModel.insert(password[start:end])
    file.close()
    file = open(directory+"lengthCounts.pickle", "wb" )
    pic.dump(similarityModel.lengthCounts,file)
    file.close()
    
if os.path.exists(directory+"stringCounts.pickle"):
    similarityModel.stringCounts = pic.load(open(directory+"stringCounts.pickle", "rb" ))
else:
    similarityModel.constructStringCounts()
    file = open(directory+"stringCounts.pickle", "wb" )
    pic.dump(similarityModel.stringCounts,file)
    file.close()

# **Negative Samples Generator**

In [None]:
def getRandomSample(passSizeRange):
    passSize = random.randint(passSizeRange[0],passSizeRange[1])
    password = ''
    symbolicCharacter = list(range(32,ord('0')))+list(range(ord('9')+1,ord('A')))+list(range(ord('Z')+1,ord('a')))+list(range(ord('z')+1,128))
    symCharCt = len(symbolicCharacter)
    for __ in range(passSize):
        charType = random.randint(1,4)
        if charType==1: # lower case alphabet
            password+=chr(random.randint(ord('a'),ord('z')))
        elif charType==2:
            password+=chr(random.randint(ord('A'),ord('Z')))
        elif charType==3:
            password+=chr(random.randint(ord('0'),ord('9')))
        else:
            
            password+=chr(random.choice(symbolicCharacter))
    return password

In [None]:
def negativeSamples(threshold, samplesize,passSizeRange, chr_to_idx):
    passwords = set()
    for __ in range(samplesize):
        testPass = getRandomSample(passSizeRange)
        while testPass not in passwords and similarityModel.similarity(testPass, len(testPass))>=threshold:
            testPass = getRandomSample(passSizeRange)
        passwords.add(testPass)
    return torch.LongTensor([[chr_to_idx[c] for c in password]+[chr_to_idx[' ']]*(passSizeRange[1]-len(password)) for password in passwords])

# **Getting Pre-Generated Passwords**

In [None]:
passwords = list(pic.load(open( directory+"negative_passwords.dic", "rb" )))
print('Password length:',len(passwords))

# **CPC Model**

In [4]:
class CDCK2(nn.Module):
    def __init__(self, passwords, vocab_size, embedding_dim, seq_len, threshold, negativeCounts, chr_to_idx, device):

        super(CDCK2, self).__init__()
        # Pre generated Negative Passwords
        self.passwords = passwords
        
        # Device
        self.device = device
        
        # Embedding parameters
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.chr_to_idx = chr_to_idx
        
        # negative sample parameters
        self.seq_len = seq_len # =16 
        self.threshold = threshold        
        self.negativeCounts = negativeCounts
        
        # timestep calculation
        self.timestep = self.seq_len*self.embedding_dim//8 # 16*8/8 = 16, here 8 is a downsampling factor = 8
        self.t_samples = ((self.negativeCounts+1)*self.seq_len*self.embedding_dim)//8 - self.timestep-1
        
        # Hidden State
        self.hidden_state = torch.zeros(1,1, 256).to(device)
        
        # Layers
        self.embedding = nn.Embedding(self.vocab_size,self.embedding_dim)
        self.encoder = nn.Sequential( # downsampling factor = 8
            nn.Conv1d(1, 512, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Conv1d(512, 512, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Conv1d(512, 512, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True)
        )
        self.gru = nn.GRU(512, 256, num_layers=1, bidirectional=False, batch_first=True)
        self.Wk  = nn.ModuleList([nn.Linear(256, 512) for i in range(self.timestep)])
        self.softmax  = nn.Softmax()
        self.lsoftmax = nn.LogSoftmax()

        def _weights_init(m):
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            if isinstance(m, nn.Conv1d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm1d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # initialize gru
        for layer_p in self.gru._all_weights:
            for p in layer_p:
                if 'weight' in p:
                    nn.init.kaiming_normal_(self.gru.__getattr__(p), mode='fan_out', nonlinearity='relu')

        self.apply(_weights_init)

    def init_hidden(self, batch_size, use_gpu=True):
        if use_gpu: return torch.zeros(1,1, 256).cuda()
        else: return torch.zeros(1, 1, 256)
    
    def forward(self, x):        
        batch = x.size()[0]
        # input 512 x 1 x 16
        emb = self.embedding(x) # 512 x 1 x 16 x 8
        emb = emb.reshape(batch,1, 1*self.seq_len*self.embedding_dim) # 512 x 1 x 128
        z = self.encoder(emb) # 512 x 512 x 16
        forward_seq = z.transpose(1,2) # 512 x 16 x 512
        forward_seq = forward_seq.reshape(1,batch*self.seq_len,512) # 1 x(512*16) x 512
        
        output, hidden = self.gru(forward_seq, self.hidden_state) # 512 x 16 x 256, 1 x(512*16) x 256
        
        output = output.reshape(batch, self.seq_len, 256) # 512 x 16 x 256
        c_t = output[:,-1,:].view(batch, 256) # 512 x 256
        pred = torch.empty((self.timestep,batch,512)).float().to(self.device) # e.g. size 16 x 512 x 512
        for i in np.arange(0, self.timestep):
            linear = self.Wk[i]
            pred[i] = linear(c_t) # Wk*c_t e.g. size 512 x 512
        nce = 0 # average over timestep and batch
        for i in np.arange(0, self.timestep):
            total = torch.mm(z[:,:,i], torch.transpose(pred[i],0,1)) # e.g. (512 x 512) x ( 512 x 512) = 512 x 512,  
            correct = torch.sum(torch.eq(torch.argmax(self.softmax(total), dim=0), torch.arange(0, batch).to(self.device))) # correct is a tensor
            nce += torch.sum(torch.diag(self.lsoftmax(total))) # nce is a tensor
        
        nce /= -1.*batch*self.timestep
        accuracy = 1.*correct.item()/batch
        return accuracy, nce, hidden
        
        

    def predict(self, x, hidden):
        batch = x.size()[0]
        # input sequence is N*C*L, e.g. 64 x 1 x 16
        
        emb = self.embedding(x) # 64 x 1 x 16 x 8
        emb = emb.reshape(batch,1, 1*self.seq_len*self.embedding_dim) # 64 x 1 x 128
        # sequence is N*C*L, e.g. 32*1*128
        
        z = self.encoder(x) # 64 x 512 x 16
        
        return z.reshape(batch, 512*self.timestep)

# **Training and validation functions**

In [25]:
def trainCPC(model, device, train_loader, optimizer, epoch, batch_size):
    model.train()
    data_len = len(train_loader.dataset)
    n_batches = data_len//batch_size
    log_interval = n_batches//20
    for batch_idx, data in enumerate(train_loader):
        if len(data['xVect'])==batch_size:
            data = data['xVect'].long().unsqueeze(1).to(device) # add channel dimension
            optimizer.zero_grad()
            acc, loss, hidden = model(data)
            loss.backward()
            optimizer.step()
#             lr = optimizer.update_learning_rate()
            if batch_idx % log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tAccuracy: {:.4f}\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader),  acc, loss.item()))
            model.hidden_state = hidden.detach()

In [32]:
def validationCPC(model, device, data_loader, batch_size):
    model.eval()
    total_loss = 0
    total_acc  = 0 

    with torch.no_grad():
        for batch_idx, data in enumerate(data_loader):
            if len(data['xVect'])==batch_size:
                data = data['xVect'].long().unsqueeze(1).to(device) # add channel dimension
                #hidden = model.init_hidden(len(data), use_gpu=True)
                acc, loss, __ = model(data)
                total_loss += len(data) * loss 
                total_acc  += len(data) * acc

    total_loss /= len(data_loader.dataset) # average loss
    total_acc  /= len(data_loader.dataset) # average acc

    print('===> Validation set: Average loss: {:.4f}\tAccuracy: {:.4f}\n'.format(
                total_loss, total_acc))

    return total_acc, total_loss

# **Logging function**

# Model training scheduling Optimizer

In [7]:
def setup_logs(save_dir, run_name):
    # initialize logger
    logger = logging.getLogger("cdc")
    logger.setLevel(logging.INFO)
    # create the logging file handler
    log_file = os.path.join(save_dir, run_name + ".log")
    fh = logging.FileHandler(log_file)
    # create the logging console handler
    ch = logging.StreamHandler()
    # format
    formatter = logging.Formatter("%(asctime)s - %(message)s")
    fh.setFormatter(formatter)
    # add handlers to logger object
    logger.addHandler(fh)
    logger.addHandler(ch)
    return logger

In [8]:
class ScheduledOptim(object):
    """A simple wrapper class for learning rate scheduling"""

    def __init__(self, optimizer, n_warmup_steps):
        self.optimizer = optimizer
        self.d_model = 128 
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0 
        self.delta = 1

    def state_dict(self):
        self.optimizer.state_dict()

    def step(self):
        """Step by the inner optimizer"""
        self.optimizer.step()

    def zero_grad(self):
        """Zero out the gradients by the inner optimizer"""
        self.optimizer.zero_grad()

    def increase_delta(self):
        self.delta *= 2

    def update_learning_rate(self):
        """Learning rate scheduling per step"""

        self.n_current_steps += self.delta
        new_lr = np.power(self.d_model, -0.5) * np.min([
            np.power(self.n_current_steps, -0.5),
            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])

        for param_group in self.optimizer.param_groups:
            param_group['lr'] = new_lr
        return new_lr

# **Snapshot**

In [9]:
def snapshot(dir_path, run_name, state):
    snapshot_file = os.path.join(dir_path,
                    run_name + '-model_best.pth')
    
    torch.save(state, snapshot_file)
#     logger.info("Snapshot saved to {}\n".format(snapshot_file))

# **Dataset**

In [10]:
class PasswordDataset(Dataset):
    """Password dataset."""

    def __init__(self,  source='./Dataset/', dataset_name='rockyou', dataset_type='train', maxlength=16, transform=None):
        """
        Args:
            source (string): Path to the benign dataset.
            epsilons (list): List of all epsilon values for FGSM attack
            transform (callable, optional): Optional transform to be applied
                on a sample.
            init_model (DLS_Model) : The model 
        """

        # character to index map, considering all printable ASCII - (32-127) 
        
        self.chr_to_idx = {chr(x):(x-32) for x in range(32,128)}
        self.idx_to_chr = {(x-32):chr(x) for x in range(32,128)}
        self.path = source+dataset_name+'-'+dataset_type+".txt"
        with open(self.path, 'r',encoding='latin1') as f:
            self.passwords = tuple(filter(lambda y: reduce(lambda a,b:a&b, [32<=ord(c)<=127 for c in y]) ,map(lambda x:x.strip()[:maxlength],f.readlines())))

    def __len__(self):
        return len(self.passwords)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return {'passString':self.passwords[idx],'xVect':torch.LongTensor([self.chr_to_idx[c] for c in (self.passwords[idx]+' '*(maxlength-len(self.passwords[idx])))])}

# **CPC Variables**

In [22]:
batch_size = 256
logging_dir = '/kaggle/working' #+'logging/CPC/'
vocab_size = 127-32+1
embedding_dim = 8
seq_len = 16
threshold = 0.1
negativeCounts = 7
epochs = 25
maxlength=16
run_name = "cdc" + time.strftime("-%Y-%m-%d_%H_%M_%S")
print(run_name)

cdc-2021-05-13_06_13_27


# **getting train and test Dataset**

In [12]:
trainDataset = PasswordDataset(source=directory, dataset_type='train', maxlength=maxlength)
trainDataset[0]

{'passString': 'antonio25',
 'xVect': tensor([65, 78, 84, 79, 78, 73, 79, 18, 21,  0,  0,  0,  0,  0,  0,  0])}

In [None]:
testDataset = PasswordDataset(source=direcotry, dataset_type='test',maxlength=maxlength)
testDataset[0]

# **Split by train-validation**

In [13]:
train_val_size = 0.15
train_val_idx, remaining_idx = train_test_split(list(range(len(trainDataset))), test_size=1-train_val_size)
train_val_set = Subset(trainDataset, train_val_idx)
val_split = 0.33 
train_idx, val_idx = train_test_split(list(range(len(train_val_set))), test_size=val_split)
training_set = Subset(train_val_set, train_idx)
validation_set = Subset(train_val_set, val_idx)
len(training_set),len(validation_set),training_set[0],validation_set[0]

(2141698,
 1054867,
 {'passString': 'mikehe',
  'xVect': tensor([77, 73, 75, 69, 72, 69,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])},
 {'passString': 'ADRIANNA',
  'xVect': tensor([33, 36, 50, 41, 33, 46, 46, 33,  0,  0,  0,  0,  0,  0,  0,  0])})

# **Setting GPU/CPU informtion**

In [14]:
use_cuda = torch.cuda.is_available()
print('use_cuda is', use_cuda)
global_timer = timer() # global timer
#logger = setup_logs(logging_dir, run_name) # setup logs
device = torch.device("cuda" if use_cuda else "cpu")

## Loading the dataset
params = {'num_workers': 0, 'pin_memory': False} if use_cuda else {}

use_cuda is True


# **Getting Loaders for training and validation**

In [26]:
print('===> loading train, validation and eval dataset')
train_loader = DataLoader(training_set, batch_size=batch_size, shuffle=True, **params) # set shuffle to True
validation_loader = DataLoader(validation_set, batch_size=batch_size, shuffle=False, **params) # set shuffle to False

===> loading train, validation and eval dataset


# **Training and Validation process**

In [33]:
torch.cuda.empty_cache()

In [None]:
torch.autograd.set_detect_anomaly(True)

In [34]:
model = CDCK2([], vocab_size, embedding_dim, seq_len, threshold, negativeCounts, trainDataset.chr_to_idx, device).to(device)
optimizer = optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()), lr = 1e-5, 
            betas=(0.9, 0.98), eps=1e-09, weight_decay=1e-4, amsgrad=True)
model_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('### Model summary below###\n {}\n'.format(str(model)))
print('===> Model total parameter: {}\n'.format(model_params))
## Start training
best_acc = 0
best_loss = np.inf
best_epoch = -1 

### Model summary below###
 CDCK2(
  (embedding): Embedding(96, 8)
  (encoder): Sequential(
    (0): Conv1d(1, 512, kernel_size=(3,), stride=(2,), padding=(1,), bias=False)
    (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv1d(512, 512, kernel_size=(3,), stride=(2,), padding=(1,), bias=False)
    (4): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): Conv1d(512, 512, kernel_size=(3,), stride=(2,), padding=(1,), bias=False)
    (7): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU(inplace=True)
  )
  (gru): GRU(512, 256, batch_first=True)
  (Wk): ModuleList(
    (0): Linear(in_features=256, out_features=512, bias=True)
    (1): Linear(in_features=256, out_features=512, bias=True)
    (2): Linear(in_features=256, out_features=512, bias=True)
    (3): Linear(in_features=256, out_features=512, bias=

In [40]:
%%bash
pip install --no-index --find-links ../input/torchsummary torchsummary

Looking in links: ../input/torchsummary


ERROR: Could not find a version that satisfies the requirement torchsummary
ERROR: No matching distribution found for torchsummary


CalledProcessError: Command 'b'pip install --no-index --find-links ../input/torchsummary torchsummary\n'' returned non-zero exit status 1.

In [39]:
from torchsummary import summary
print(summary(model, (1, 1, 16), device=device))

ModuleNotFoundError: No module named 'torchsummary'

In [41]:
for epoch in tqdm(range(1, epochs + 1)):
    epoch_timer = timer()

    # Train and validate
    #trainXXreverse(args, model, device, train_loader, optimizer, epoch, args.batch_size)
    #val_acc, val_loss = validationXXreverse(args, model, device, validation_loader, args.batch_size)
    trainCPC(model, device, train_loader, optimizer, epoch, batch_size)
    val_acc, val_loss = validationCPC(model, device, validation_loader, batch_size)

    # Save
    if val_acc > best_acc: 
        best_acc = max(val_acc, best_acc)
        snapshot(logging_dir, run_name, {
            'epoch': epoch + 1,
            'validation_acc': val_acc, 
            'state_dict': model.state_dict(),
            'validation_loss': val_loss,
            'optimizer': optimizer.state_dict(),
        })
        best_epoch = epoch + 1
    elif epoch - best_epoch > 2:
        optimizer.increase_delta()
        best_epoch = epoch + 1

    end_epoch_timer = timer()
    print("#### End epoch {}/{}, elapsed time: {}".format(epoch, epochs, end_epoch_timer - epoch_timer))

## end 
end_global_timer = timer()
print("################## Success #########################")
print("Total elapsed time: %s" % (end_global_timer - global_timer))





  4%|▍         | 1/25 [28:20<11:20:17, 1700.72s/it]

===> Validation set: Average loss: 3.4228	Accuracy: 0.0039

#### End epoch 1/25, elapsed time: 1700.714957487


  8%|▊         | 2/25 [56:49<10:53:44, 1705.42s/it]

===> Validation set: Average loss: 3.2233	Accuracy: 0.0039

#### End epoch 2/25, elapsed time: 1708.7146701870006


 12%|█▏        | 3/25 [1:24:27<10:17:19, 1683.61s/it]

===> Validation set: Average loss: 3.1672	Accuracy: 0.0039

#### End epoch 3/25, elapsed time: 1657.6569914549991


 16%|█▌        | 4/25 [1:51:35<9:41:39, 1661.87s/it] 

===> Validation set: Average loss: 3.1430	Accuracy: 0.0039

#### End epoch 4/25, elapsed time: 1628.530848773


 16%|█▌        | 4/25 [2:18:43<12:08:17, 2080.82s/it]

===> Validation set: Average loss: 3.1297	Accuracy: 0.0039






AttributeError: 'Adam' object has no attribute 'increase_delta'

In [None]:
snapshot('/kaggle/working', run_name, {
            'epoch': 1,
            'validation_acc': 1, 
            'state_dict': {},
            'validation_loss': 5,
            'optimizer': {},
        })