# Introduction

Metric Learning for Clustering Discrete Sequences

* https://stackoverflow.com/questions/38260113/implementing-contrastive-loss-and-triplet-loss-in-tensorflow
* http://scikit-learn.org/stable/modules/manifold.html


## Main idea
* use jaccard distance for rough distinction
* use labels for fine tuning 


## Preparation:
* define experiment X in config/all_experiments.py
* execute 010_generate_vocabulary.py -en X
* execute 020_generate_training_sequences.py -en X
* execute 025_extract_signatures.py -en X

## Papers and resources
 
* [1] FaceNet https://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Schroff_FaceNet_A_Unified_2015_CVPR_paper.pdf
* [2] Siamese Network: http://yann.lecun.com/exdb/publis/pdf/chopra-05.pdf
* [3] Triplet Network: https://www.cv-foundation.org/openaccess/content_cvpr_2014/papers/Wang_Learning_Fine-grained_Image_2014_CVPR_paper.pdf




# Setup notebook and environment


In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


## Imports




In [2]:
import tensorflow as tf
assert tf.__version__.startswith("1.4") # the version we used
import numpy as np
import os
from os.path import join as jp
import logging 
import library.helpers as h
import library.tensorflow_helpers as tfh
import time
from library.vocabulary import *
from tensorflow.contrib.tensorboard.plugins import projector # for visualizing embeddings
import re
import numpy as np
from sklearn.model_selection import train_test_split
from itertools import chain # chaining labeled examples
import testing
from sklearn.metrics.pairwise import pairwise_distances


import matplotlib # plotting stuff
matplotlib.use('Agg') # for displaying plots in console without display

## Configurations

In [3]:
# Fix the random seeds of numpy and tensorflow so that runs are repeatable.
RANDOM_SEED = 0 
# configure numpy: print floats with 3 decimals, suppress scientific notation, seed the global RNG
np.set_printoptions(precision=3, suppress=True)
np.random.seed(RANDOM_SEED)

# configure tensorflow: graph-level random seed
tf.set_random_seed(RANDOM_SEED)

# configure logger: root logging at INFO level; `logger` is the logger of this module
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# configure ipython display
def show(img_file):
    """Display an image file inline when running inside an IPython notebook.

    This is a best-effort helper: if IPython is not available or the image
    cannot be loaded, nothing is displayed and no exception is raised.

    Args:
        img_file: path of the image file to display.
    """
    try:  # only works where IPython is installed
        # imported here because `Image` is not imported anywhere at module level
        from IPython.display import Image, display
    except ImportError:
        return
    try:
        display(Image(filename=img_file))
    except (OSError, ValueError) as err:
        # stay best-effort, but report why the image was not shown
        logging.getLogger(__name__).warning("Could not display %s: %s", img_file, err)

## Hyper Parameters 

In [4]:
# --- data / batching ---
BATCH_SIZE = 100 # 73 62 151 # for jaccard distances
NUM_EPOCHS = 30
# NOTE(review): presumably the jaccard-distance thresholds below/above which a pair counts
# as positive/negative; their use is not visible in this part of the notebook - confirm.
JD_POS_THRESHOLD=0.30
JD_NEG_THRESHOLD=0.70
NUM_MAX_EXAMPLES_PER_CLASS = -1 # -1 for all; cap of labeled examples per signature (see select_labeled_examples)
NUM_LABELED_EXAMPLES = 1000 # -1 for all [1000, 2500, 5000]
MAX_LABELED_LINE_TOKENS = 50 # -1 for no restriction; longer lines are not selected as labeled examples
test_fraction = 0.1 # fraction of the (permuted) sequences held out as test data

# --- optimisation / network ---
MAX_GRADIENT_NORM = 0.5
STATE_SIZE = 32 #  32; used as LSTM size and as word embedding size
EARLY_STOPPING_THRESHOLD = 0.05

NUM_LSTM_LAYERS = 1
ALPHA_JACCARD = 0.8 # distance margin 
DTYPE = tf.float32 # datatype for network parameters
INVERTED = True 
LEARNING_RATE_DECAY_FACTOR = 0.95

# Non-trainable tensorflow variables.
TF_LEARNING_RATE = tf.Variable(0.01, trainable=False, name="Learning_rate") # alpha of our training step
TF_KEEP_PROBABILTIY = tf.Variable(0.90, trainable=False, name="Dropout_keep_probability") # can be added to feeddict (e.g. 1.0 to disable dropout)
TF_GLOBAL_STEP = tf.Variable(0, trainable=False, name="Global_step") # keeps track of the current training step


# number of extra same-class examples appended per labeled example of a batch
# (NOTE(review): presumably passed to get_batch(add_same_class_examples=...) - confirm)
ADD_EXTRA_POSITIVE_EXAMPLES_PER_CLASS = 2 # 

# --- experiment selection ---
LOG_NAME = "bgl2" # [unix_forensic, bgl2, spirit2, synthetic_10, synthetic_reverse_10]
# NOTE(review): the code below treats every TAG_NUM >= 0 as "use this tag and skip training",
# not only values > 1.
TAG_NUM = -1 # set >1 to use a specific tag
# must divide the number of test sequences exactly (checked further below)
SHARD_SIZE=9460  #spirit2:7150 bgl2: 9460 unix_forensic: 1050

## Create directories and define file names

In [5]:
def fla(num):
    """Format a hyper parameter value as a short token for the model name.

    Negative values become "all", zero becomes "non" and every other number
    is rendered as a zero padded, five digit integer.
    """
    if num < 0:
        return "all"
    return "non" if num == 0 else "%0.5d" % num
# Every run stores its tensorflow graph in its own sub directory of "graphs".
h.create_dir("graphs") 
if TAG_NUM < 0:
    # no tag given: derive a new running number from the entries already in "graphs" and train
    TAG = "%0.3d"%(len(os.listdir("graphs"))+1)
    DO_TRAINING = True
else:
    # tag given: refer to that run and skip training
    TAG = "%0.3d"%(TAG_NUM)
    DO_TRAINING = False
    
    
# The model name encodes the hyper parameters of the experiment.
MODEL_NAME = "jd-la-x%s-pt%02d-nt%02d-ll%s-lc%s-ee%s-ep%0.2d"%( fla(NUM_LABELED_EXAMPLES), 
                                                       int(JD_POS_THRESHOLD*100), 
                                                       int(JD_NEG_THRESHOLD*100), 
                                                       fla(MAX_LABELED_LINE_TOKENS), 
                                                       fla(NUM_MAX_EXAMPLES_PER_CLASS),
                                                       fla(ADD_EXTRA_POSITIVE_EXAMPLES_PER_CLASS), 
                                                       NUM_EPOCHS
                                                      )


DATA_DIR = "data"
RESULTS_DIR = jp("results", LOG_NAME, MODEL_NAME)
h.create_dir(RESULTS_DIR)
# template: still contains a "%0.5d" placeholder that has to be filled in by the user of RESULTS_FILE
RESULTS_FILE=jp(RESULTS_DIR, "%0.5d-results.csv")

# NOTE(review): RESULTS_DIR is rebound to the top-level directory here, after RESULTS_FILE
# was derived from the experiment specific directory - confirm this is intended.
RESULTS_DIR = "results"
VIZUALIZATIONS_DIR = "visualizations"
INPUTS_DIR = jp(DATA_DIR, "inputs")
ENCODER_INPUTS_PATH = jp(DATA_DIR, "encoder_inputs", "%s.idx"%LOG_NAME)
ENC_SEQUENCE_LENGTH_PATH = jp(DATA_DIR, "sequence_lengths", "%s_enc.idx"%LOG_NAME)
SIGNATURE_FILE =jp(DATA_DIR, "signatures","%s.sig"%LOG_NAME)
# raw lines of the signature file (strings, including the line break); converted to int32 in a later cell
# NOTE(review): the file objects opened here and below are never closed explicitly.
SIGNATURES = np.array(list(open(SIGNATURE_FILE)))

RAW_LOG = jp(DATA_DIR, "raw", "%s.log"%LOG_NAME)
# raw log lines with the last character (the line break) removed
LOGLINES = np.array([l[:-1] for l in list(open(RAW_LOG))])

# NOTE(review): DATA_DIR is created only after files were already read from it above;
# the "power traces" remark looks like a leftover from another project - confirm.
h.create_dir(DATA_DIR)  # power traces go here
h.create_dir(INPUTS_DIR)
h.create_dir(VIZUALIZATIONS_DIR) # charts we generate
h.create_dir(RESULTS_DIR)

GRAPH_DIR = jp("graphs", "%s-%s"%(MODEL_NAME, TAG))
h.create_dir(GRAPH_DIR) # store tensorflow calc graph here 

#from library.parse_arguments import *

#h.import_all("signature_extraction.extract_%s"%LOG_NAME , glob=globals()) # load hyperparameters
# NOTE(review): the import above is commented out, so this message is logged although no module is loaded.
logger.info("signature_extraction.extract_%s module loaded"%LOG_NAME)


INFO:library.helpers:Created directory: results/bgl2/jd-la-x01000-pt30-nt70-ll00050-lcall-ee00002-ep30
INFO:library.helpers:Created directory: graphs/jd-la-x01000-pt30-nt70-ll00050-lcall-ee00002-ep30-204
INFO:__main__:signature_extraction.extract_bgl2 module loaded


## Load vocabulary, get input statistics

In [6]:
# One signature (label) per sequence, so the number of signatures is the dataset size.
NUM_SEQUENCES = len(SIGNATURES)
# longest encoder sequence; one integer length per line in the sequence length file
MAX_ENC_SEQ_LENGTH =  max([int(s) for s in list(open(ENC_SEQUENCE_LENGTH_PATH,"r"))])
VOCABULARY = Vocabulary.load(LOG_NAME, "")
SIGNATURE_FILE = jp(DATA_DIR, "signatures","%s.sig"%LOG_NAME)
# reload the signatures, this time converted from text lines to int32 ids
SIGNATURES = np.array(list(open(SIGNATURE_FILE))).astype("int32")
LOGLINES_TO_ENCODE = NUM_SEQUENCES
# signature id => list of example (row) indices that carry this signature
EXAMPLES_BY_SIGNATURE_ID = {}
# note: despite its name, `sig_id` is the example index and `sig` is the signature id
for sig_id, sig in enumerate(SIGNATURES):
    if not sig in EXAMPLES_BY_SIGNATURE_ID:
        EXAMPLES_BY_SIGNATURE_ID[sig]=[]
    EXAMPLES_BY_SIGNATURE_ID[sig].append(sig_id)
    
logger.info(" %i Sequences in dataset "%NUM_SEQUENCES)    
logger.info(" Vocabulary loaded, %i tokensS"%VOCABULARY.size())  
logger.info(" Max. Encoder Sequence Length: %s"%MAX_ENC_SEQ_LENGTH)

INFO:__main__: 474796 Sequences in dataset 
INFO:__main__: Vocabulary loaded, 101872 tokensS
INFO:__main__: Max. Encoder Sequence Length: 176


# Data

* Sequence of tokens $T$ 
* We build a vocabulary, which maps each unique token to an integer

* To generate your training / test sequences, execute scripts: 010, 020, and 025. 




## Parse data to Memmap

* split line to integers
* add padding
* save into memmap if it does not exist


In [7]:
def parse_input_line(line, max_seq_length):
    """Convert one line of whitespace separated token ids into a zero padded array.

    Splitting on arbitrary whitespace makes the parser independent of the line
    ending: a final line without a trailing newline no longer loses its last
    character, and trailing blanks or "\\r\\n" endings are tolerated.

    Args:
        line: text line holding integer token ids separated by whitespace.
        max_seq_length: length of the returned array.

    Returns:
        1d integer numpy array of length max_seq_length; positions after the
        last token are filled with 0 (the padding id).

    Raises:
        ValueError: if a token is not an integer or the line holds more than
            max_seq_length tokens.
    """
    token_ids = [int(token) for token in line.split()]
    if len(token_ids) > max_seq_length:
        # a longer array could not be stored in a row of the inputs matrix anyway
        raise ValueError("line has %i tokens, but max_seq_length is %i" % (len(token_ids), max_seq_length))
    padding = [0] * (max_seq_length - len(token_ids))  # pad sequence with zeros
    return np.array(token_ids + padding, dtype=int)

def parse_input_file(input_file, output_file,  max_seq_length, force_regeneration=False, dtype="int32"):
    """Parse a file of token id lines into a memory mapped [NUM_SEQUENCES, max_seq_length] matrix.

    The matrix is stored as a memmap file inside INPUTS_DIR. It is (re)built if
    it does not exist yet or if force_regeneration is set; otherwise the
    existing file is opened read-only.

    Args:
        input_file: path of the text file with one sequence of token ids per line.
        output_file: file name of the memmap inside INPUTS_DIR.
        max_seq_length: number of columns; shorter sequences are zero padded.
        force_regeneration: rebuild the memmap even if it already exists.
        dtype: dtype of the memmap.

    Returns:
        numpy.memmap of shape (NUM_SEQUENCES, max_seq_length).
    """
    output_path = jp(INPUTS_DIR, output_file)
    memmap_shape = (NUM_SEQUENCES, max_seq_length)
    if not h.file_exists(output_path) or force_regeneration:
        fp = np.memmap(output_path, dtype=dtype, mode='w+', shape=memmap_shape)
        # save inputs to memmap; read line by line and close the file afterwards
        with open(input_file, "r") as input_lines:
            for line_id, line in enumerate(input_lines):
                fp[line_id, :] = parse_input_line(line, max_seq_length)
        fp.flush()  # make sure the parsed inputs are written to disk
    else:
        logger.info(output_path +" already exists, delete it for regeneration.")
        fp = np.memmap(output_path, dtype=dtype, mode='r', shape=memmap_shape)
    return fp


# load memmap of the encoder inputs and the list of encoder sequence lengths
# NOTE(review): force_regeneration=True rebuilds the memmap on every run, even if it already exists.
ENCODER_INPUTS  = parse_input_file(ENCODER_INPUTS_PATH, "enc_input-%s.mm"%LOG_NAME ,  MAX_ENC_SEQ_LENGTH, force_regeneration=True)
ENCODER_SEQLENGTH = np.array([int(s) for s in list(open(ENC_SEQUENCE_LENGTH_PATH,"r"))])

# sanity check: show the first sequence as tokens and as integers
print("Encoder inputs shape: ",ENCODER_INPUTS.shape)
print("Encoder inputs(tok): ", VOCABULARY.index_seq_to_line(ENCODER_INPUTS[0:1,:].flatten()))
print("Encoder inputs(int):", ENCODER_INPUTS[0:1,:], "Length:",  ENCODER_SEQLENGTH[0])
print("")

Encoder inputs shape:  (474796, 176)
Encoder inputs(tok):  - time _ stamp short _ date node _ id _ 01 date _ time node _ id _ 01 ras kernel info instruction cache parity error corrected PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_TOKEN PAD_

## Select labeled examples

In [8]:
# SIGNATURES = labels by example id

def select_labeled_examples(log_lines, signatures, enc_seq_lengths, max_linelinegth= MAX_LABELED_LINE_TOKENS, max_num_classes=NUM_MAX_EXAMPLES_PER_CLASS, max_num_labeled_examples=NUM_LABELED_EXAMPLES ):
    """Pick the examples whose labels (signatures) may be used for training.

    The log lines are scanned in order. An example is selected unless its
    sequence is too long or its signature already has enough examples; the
    scan stops as soon as the requested number of examples has been selected.

    Args:
        log_lines: sequence of raw log lines, one per example.
        signatures: signature (label) id per example, indexed like log_lines.
        enc_seq_lengths: encoded sequence length per example, indexed like log_lines.
        max_linelinegth: skip examples with more tokens than this; values <= 0
            disable the restriction.
        max_num_classes: maximum number of selected examples per signature
            (despite its name); values <= 0 disable the restriction.
        max_num_labeled_examples: maximum number of selected examples in total;
            negative values disable the restriction.

    Returns:
        Tuple (labeled_examples_by_sig_id, labeled_signatures_by_example_id,
        labeled_logs): a dict signature id => list of example ids, a dict
        example id => signature id, and the list of the selected log lines.
    """
    labeled_logs = []
    labeled_examples_by_sig_id = {}
    labeled_signatures_by_example_id = {}

    for example_id, l in enumerate(log_lines):
        if max_num_labeled_examples>=0 and len(labeled_logs) >= max_num_labeled_examples:
            break # because we have enough; nothing could be added after this point

        if max_linelinegth>0 and enc_seq_lengths[example_id]>max_linelinegth:
            continue # because we are interested in short loglines

        sig_id = signatures[example_id]
        # the per signature list doubles as a counter, avoiding a linear scan per example
        num_examples_of_sig = len(labeled_examples_by_sig_id.get(sig_id, ()))
        if max_num_classes>0 and num_examples_of_sig>=max_num_classes:
            continue # because we already have enough examples of this signature

        # add example
        labeled_logs.append(l)
        labeled_examples_by_sig_id.setdefault(sig_id, []).append(example_id)
        labeled_signatures_by_example_id[example_id]=sig_id

    return labeled_examples_by_sig_id, labeled_signatures_by_example_id, labeled_logs
    



## Split data to train / test data


In [9]:
import math

def roundup(x, to=100):
    """Round x up to the next multiple of `to`."""
    num_steps = math.ceil(x / to)
    return int(num_steps) * to

def rounddown(x,to=100):
    """Round x down to the previous multiple of `to`."""
    num_steps = math.floor(x / to)
    return int(num_steps) * to

# Re-seed numpy so that the permutation below is the same in every run.
np.random.seed(0)
random_permutation = np.random.permutation(ENCODER_INPUTS.shape[0])
#if not np.array_equal(random_permutation[0:10], [3114,5863,467,3232,1328,7450,3889,2458,4216,7636]):
#    print("Random permutation have to be same to be able to compare it to other methods")
#    assert False



# Both boundaries are rounded to multiples of 100, so that train and test set sizes are
# multiples of 100; the remainder of the dataset after TEST_END_INDEX is dropped.
TEST_START_INDEX = roundup(int(NUM_SEQUENCES*(1-test_fraction)))
TEST_END_INDEX = rounddown(NUM_SEQUENCES)
# reload the raw log lines (here including the trailing line break)
LOGLINES = np.array(list(open(jp(DATA_DIR, "raw", "%s.log"%LOG_NAME) )))

if test_fraction>0: # if a test / train fraction is defined 
    # Train and test data are disjoint slices of the same random permutation.
    # Each `[random_permutation]` lookup is a fancy index and creates a permuted copy.
    ENCODER_INPUTS_TRAIN  = ENCODER_INPUTS[random_permutation][0:TEST_START_INDEX] # all input sequences are allowed 
    ENCODER_SEQLENGTH_TRAIN = ENCODER_SEQLENGTH[random_permutation][0:TEST_START_INDEX]
    LABELS_TRAIN = SIGNATURES[random_permutation][0:TEST_START_INDEX] # labels of the training slice only
    LOGLINES_TRAIN = LOGLINES[random_permutation][0:TEST_START_INDEX]

    ENCODER_INPUTS_TEST  = ENCODER_INPUTS[random_permutation][TEST_START_INDEX:TEST_END_INDEX]
    ENCODER_SEQLENGTH_TEST = ENCODER_SEQLENGTH[random_permutation][TEST_START_INDEX:TEST_END_INDEX]
    LABELS_TEST = SIGNATURES[random_permutation][TEST_START_INDEX:TEST_END_INDEX]
    #LOGLINES_TEST = LOGLINES[random_permutation][TEST_START_INDEX:TEST_END_INDEX] 
else: # otherwise use whole dataset for test / train
    # NOTE(review): not implemented - this deliberately aborts the cell with a ZeroDivisionError.
    0/0
   
# select labeled examples (from the training slice only)
labeled_examples_by_sig_id_TRAIN, labeled_signatures_by_example_id_TRAIN, labeled_logs_TRAIN = select_labeled_examples(LOGLINES_TRAIN, LABELS_TRAIN, ENCODER_SEQLENGTH_TRAIN)


# NOTE(review): the message mentions "0.0=all", but test_fraction == 0 aborts above.
print("Using %0.2f test fraction (0.0=all)"%test_fraction)
NUM_TRAININGS_SEQUENCES = ENCODER_INPUTS_TRAIN.shape[0]
NUM_TEST_SEQUENCES = ENCODER_INPUTS_TEST.shape[0] 
print("num training sequences:",NUM_TRAININGS_SEQUENCES)
print("num test sequences:", NUM_TEST_SEQUENCES)
print("using %i labeled examples"%len(labeled_logs_TRAIN))
STEPS_PER_EPOCH = int(NUM_TRAININGS_SEQUENCES / BATCH_SIZE)
DECAY_EVERY_X_STEPS = STEPS_PER_EPOCH # once per epoch decay learning rate 

# list all batch sizes between 10 and 499 that divide both the train and the test set size
for i in range(10, 500):
    if NUM_TRAININGS_SEQUENCES%i==0 and NUM_TEST_SEQUENCES%i==0:
        print("Batch sizes:", i)
        
print("\nNOTE: Trainings data / test data gets randomly permuted, don't use SIGNATURES / LOGLINES / ENCODER_INPUTS / ENCODER_SEQLENGTH variables directly")

Using 0.10 test fraction (0.0=all)
num training sequences: 427400
num test sequences: 47300
using 1000 labeled examples
Batch sizes: 10
Batch sizes: 20
Batch sizes: 25
Batch sizes: 50
Batch sizes: 100

NOTE: Trainings data / test data gets randomly permuted, don't use SIGNATURES / LOGLINES / ENCODER_INPUTS / ENCODER_SEQLENGTH variables directly


In [10]:
 # Guard: the number of test sequences has to be an exact multiple of SHARD_SIZE.
 # Otherwise all valid shard sizes (>= 100) are printed and the cell is aborted
 # on purpose with a ZeroDivisionError.
 if not NUM_TEST_SEQUENCES%SHARD_SIZE==0 :# need to be a square matrix
    print("Allowed shard sizes")
    for i in range(100, NUM_TEST_SEQUENCES):
        if NUM_TEST_SEQUENCES%i==0:
            print(i)
    0/0
    
    

# Dataset statistics 

In [11]:
# Print the class (signature) statistics of the training and the test labels.
print(LOG_NAME)

# train: number of classes and distribution of the class sizes
train_sigs, train_counts = np.unique(LABELS_TRAIN, return_counts=True)
print("\nTRAIN")
print("train_classes", train_sigs.size)
for _caption, _statistic in (
        ("train_class_avg_members", np.average),
        ("train_class_std_members", np.std),
        ("train_class_med_members", np.median),
        ("train_class_min", np.min),
        ("train_class_max", np.max),
):
    print(_caption, _statistic(train_counts))

# test: same statistics, preceded by the number of test examples
test_sigs, test_counts = np.unique(LABELS_TEST, return_counts=True)
print("\nTEST")
print("test examples", LABELS_TEST.size)
print("test_classes", test_sigs.size)
for _caption, _statistic in (
        ("test_class_avg_members", np.average),
        ("test_class_std_members", np.std),
        ("test_class_med_members", np.median),
        ("test_class_min", np.min),
        ("test_class_max", np.max),
):
    print(_caption, _statistic(test_counts))

    

bgl2

TRAIN
train_classes 352
train_class_avg_members 1214.20454545
train_class_std_members 8791.54340333
train_class_med_members 14.5
train_class_min 1
train_class_max 153554

TEST
test examples 47300
test_classes 245
test_class_avg_members 193.06122449
test_class_std_members 1162.35267415
test_class_med_members 10.0
test_class_min 1
test_class_max 16989


# Graph helper methods 

## Pairwise Label Equality

In [12]:
# %load ./includes/pairwise_label_equality.py
import tensorflow as tf

def pairwise_label_equality(labels):
    """Build a boolean matrix that tells which pairs of examples share a label.

    Args:
        labels: 1d tensor of dtype tf.int32 or tf.int64, one label per example.

    Returns:
        Boolean tensor of shape [batch_size, batch_size]; the entry [i, j] is
        True iff labels[i] equals labels[j].
    """
    # check if labels are of correct size and type
    static_shape = labels.shape
    static_shape[0]  # result unused; kept so that a rank-0 input fails up front as before
    num_dims = len(static_shape.as_list())
    assert num_dims == 1, "expect labels to be a 1d tensor of ints of batch_size"
    is_int_tensor = labels.dtype == tf.int32 or labels.dtype==tf.int64
    assert is_int_tensor, "expect labels to be a 1d tensor of ints of length batch_size"

    labels_as_row = tf.expand_dims(labels, 0) # [1, batch_size]
    labels_as_column = tf.expand_dims(labels, 1) # [batch_size, 1]
    # broadcasting compares every label with every other label => [batch_size, batch_size]
    return tf.equal(labels_as_row, labels_as_column)


In [13]:
testing.run_tests_on(pairwise_label_equality)

...
----------------------------------------------------------------------
Ran 3 tests in 0.099s

OK


## Pairwise euclidean distances

Calculates the euclidean distances for each row vector of one matrix to each other row vector of a second matrix.

In [14]:
# %load ./includes/pairwise_euclidean_distance.py
def pairwise_euclidean_distances(
    x1,  # x1 is a 2d tensor of dimension [nr1,b] h_t
    x2,  # x2 is a 2d tensor of dimension [nr2,d] h_s
    result_dtype=tf.float32
):
    """Compute the euclidean distance between every row of x1 and every row of x2.

    Uses the expansion |a - b|^2 = |a|^2 - 2 * a.b + |b|^2, evaluated in
    float64. Both inputs need the same number of columns (they are multiplied
    as x1 * x2^T).

    Args:
        x1: 2d tensor, one vector per row.
        x2: 2d tensor, one vector per row.
        result_dtype: dtype both results are cast to.

    Returns:
        Tuple (distances, squared_distances), each of shape [nr1, nr2].
    """
    # the distances are calculated in float64 and cast to result_dtype at the end
    x1 = tf.cast(x1, tf.float64) # perhaps cast to 
    x2 = tf.cast(x2, tf.float64)
    
    with tf.variable_scope("PairwiseEuclideanDistance"):
                
        # squared norm of every row vector
        x1_row_norm = tf.reduce_sum(tf.pow(x1,2), axis=1, keep_dims=True) # [n_x1_rows, 1]
        x2_row_norm = tf.reduce_sum(tf.pow(x2,2), axis=1, keep_dims=True) # [n_x2_rows, 1]

        # dot product of every row of x1 with every row of x2
        squared_distances=tf.matmul(
            a=x1,
            b=x2,
            transpose_a=False,
            transpose_b=True,
        ) # => [n_x1_rows, n_x2_rows]
        squared_distances = -2 * squared_distances 
        squared_distances = squared_distances + x1_row_norm # => [n_x1_rows, 1] is broadcast over the columns
        pairwise_sqrd_euclidean_distances = tf.abs(squared_distances + tf.transpose(x2_row_norm)) # => [1, n_x2_rows] is broadcast over the rows
        # use tf abs, because pairwise sqrd can get small negative zero values
        #pairwise_sqrd_euclidean_distances = tf.abs(pairwise_sqrd_euclidean_distances) # because tensorflow knows -0 for very small numerical values
        # NOTE(review): the derivative of sqrt is unbounded at 0; if gradients flow through this
        # result, zero distances (e.g. a vector compared with itself) may produce NaN gradients - confirm.
        pairwise_euclidean_distances = tf.sqrt(pairwise_sqrd_euclidean_distances)        
       
        return tf.cast(pairwise_euclidean_distances, result_dtype), tf.cast(pairwise_sqrd_euclidean_distances, result_dtype)

## Pairwise Jaccard Indices

Calculates the pairwise Jaccard indices for each row vector of one matrix to each row vector of another matrix. Each row represents a set of integers.


In [15]:
# %load ./includes/pairwise_jaccard_index.py
# https://en.wikipedia.org/wiki/Jaccard_index


# returns a 2d tensor of dimension [nr1,nr2]
# where each element contains the pairwise jaccard index of the respective row vectors of x1 and x2
# so the element result[0,0] is the pairwise jaccard index of x1[0,:] and x2[0,:], that is the first row vector of x1 and x2

def pairwise_jaccard_indices(
    x1,  # x1 is a 2d tensor of dimension [nr1,b]
    x2,  # x2 is a 2d tensor of dimension [nr2,d]
):
    """Compare every row of x1 with every row of x2, treating rows as sets of integers.

    Zeros are treated as padding and are not counted as set elements.

    Note that, despite the function name, the returned value is
    1 - |intersection| / |union|, i.e. the Jaccard *distance*: 0 for equal
    sets and 1 for disjoint sets.

    Args:
        x1: 2d integer tensor, one zero padded set per row.
        x2: 2d integer tensor, one zero padded set per row.

    Returns:
        tf.float32 tensor of shape [nr1, nr2].
    """
    with tf.variable_scope("PairwiseJaccardIndices"):
        n_x1_rows = tf.shape(x1)[0] # [nr1, seq_len]
        n_x2_rows = tf.shape(x2)[0] # [nr2, seq_len]
        # first we create a copy for each element row of x1 for each row of x2
        #
        # Example: 
        # x1 has two rows, rx1_1 and rx1_2 
        # x2 has three rows, rx2_1, rx2_2, rx2_3
        #
        # we want two 3d tensors, x1_tiled and x2_tiled that contain:
        # x1_tiled: [rx1_1, rx1_1,rx1_1, rx1_2, rx1_2, rx1_2] 
        # x2_tiled: [rx2_1, rx2_2,rx2_3, rx2_1, rx2_2, rx2_3] 
        # so that we can calculate the pairwise intersection  / union between each of these elements
        x1_expanded = tf.expand_dims(x1,1) # => [nr1,1,b] 
        x1_tiled = tf.tile( 
            input=x1_expanded, 
            multiples=[1, n_x2_rows, 1], 
        ) # => [nr1, nr2, b ]
        # 
        x2_expanded = tf.expand_dims(x2,0) # => [1, nr2, d] 
        x2_tiled = tf.tile( 
            input=x2_expanded, 
            multiples=[n_x1_rows,1, 1]
        )  # => => [nr1, nr2, d ]
        
        # new_shape = tf.shape(tf.transpose(y_row)) # [batch_size, 1]
        # y_row_ary = tf.tile(input=y_row, multiples=new_shape ) # => [batch_size, batchtsize]

        # calculate intersection 
        # we ignore zeros, because they are the padding elements   
        sparse_intersection = tf.sets.set_intersection(x1_tiled,x2_tiled) # 
        dense_intersection = tf.sparse_tensor_to_dense(sparse_intersection) 
        # counting the non-zero entries yields the set size without the padding element
        len_intersection = tf.count_nonzero( 
            input_tensor=dense_intersection,
            axis=2,
            keep_dims=False,
            dtype=tf.int32,
        ) # =>  [nr1, nr2]
        
       
        # calculate union
        sparse_union = tf.sets.set_union(x1_tiled, x2_tiled ) # sparse_tensor
        dense_union = tf.sparse_tensor_to_dense(sparse_union) # [nr1, nr2,  _ ]
        len_union = tf.count_nonzero( 
            input_tensor=dense_union,
            axis=2,
            keep_dims=False,
            dtype=tf.int32,
        ) # => [nr1, nr2]

        # ratio of intersection size to union size; the variable is called "dice", but
        # intersection / union is the Jaccard index
        # NOTE(review): if both rows contain only padding, len_union is 0 and this divides
        # by zero - confirm that such rows cannot occur.
        pairwise_dice_index = (len_intersection) / (len_union) # => [nr1, nr2]
        # 1 - index turns the similarity into a distance
        pairwise_jaccard_indices =  tf.cast(1 - pairwise_dice_index, tf.float32) # => [nr1, nr2]
        return pairwise_jaccard_indices
logger.info("Done")

INFO:__main__:Done


## Pairwise Labels in Batch

In [16]:
# %load ./includes/pairwise_labels_in_batch.py
# checks for each combination of an 1d array which labels are in the batch and which ones are not
def pairwise_labels_in_batch(labels):
    """Build a boolean matrix that marks the pairs in which both examples are labeled.

    An example counts as labeled if its label is >= 0 (unlabeled examples
    carry a negative label).

    Args:
        labels: 1d tensor with one label per example; it is cast to tf.int64.

    Returns:
        Boolean tensor of shape [batch_size, batch_size]; the entry [i, j] is
        True iff labels[i] >= 0 and labels[j] >= 0.
    """
    labels = tf.cast(labels, tf.int64)
    tf.shape(labels)[0]  # result unused; kept so that invalid input fails at the same point as before

    assert len(labels.shape.as_list()) == 1, "expect labels to be a 1d tensor of ints of batch_size"

    has_label = tf.greater_equal(x=labels, y=tf.zeros_like(labels, dtype=labels.dtype)) # [batch_size]
    has_label_as_row = tf.expand_dims(has_label, 0) # [1, batch_size]
    has_label_as_column = tf.expand_dims(has_label, 1) # [batch_size, 1]
    # broadcasting combines every example with every other example => [batch_size, batch_size]
    return tf.logical_and(has_label_as_row, has_label_as_column)

In [17]:
testing.run_tests_on(pairwise_labels_in_batch)

....
----------------------------------------------------------------------
Ran 4 tests in 0.101s

OK


## get batch 

Fetches a batch of sequences from the input data and creates a feed dict. 

In [18]:
def get_batch(graph_nodes, # names of the graph nodes
              x_permutation, # index permutations (all xes)  
              enc_inputs, # input sequences (all training sequences)
              enc_seq_length, # input sequence lengths 
              labeled_sig_by_example_id, # training examples that are labeled example_id => signature_id 
              labeled_examples_by_signature_id, # signature_id => [example_id1, example_id2] 
              batch_size, # batch size to fetch               
              batch_index=0, # which batch sto get
              add_same_class_examples=0, 
              print_shapes=False):
    """Assemble the feed dict for one batch.

    The batch consists of the `batch_size` examples selected by the slice
    `batch_index` of `x_permutation`. Examples without a label get the label
    -1. For every labeled example of the batch, `add_same_class_examples`
    randomly drawn labeled examples of the same signature are appended, to
    provide additional positive pairs.

    Returns:
        Dict that maps graph_nodes["x_jd"], graph_nodes["x_jd_seq_lengths"]
        and graph_nodes["y_lb_labels"] to int32 arrays with
        batch_size + add_same_class_examples * <labeled examples in batch> rows.
    """
    seq_width = enc_inputs.shape[1]

    first_row = batch_index * batch_size
    batch_row_ids = x_permutation[first_row:first_row + batch_size]

    # signatures of the labeled examples of this batch, in batch order
    batch_signatures = [labeled_sig_by_example_id[row_id]
                        for row_id in batch_row_ids
                        if row_id in labeled_sig_by_example_id]

    # containers: primary batch rows first, extra same-class rows behind them
    num_rows = batch_size + add_same_class_examples * len(batch_signatures)
    all_inputs = np.zeros(shape=(num_rows, seq_width), dtype="int32")
    all_labels = np.zeros(shape=(num_rows), dtype="int32")
    all_seq_length = np.zeros(shape=(num_rows), dtype="int32")

    # primary batch examples; -1 marks unlabeled examples, which are ignored
    # for the anchor - positive - negative calculation
    all_inputs[0:batch_size, :] = enc_inputs[batch_row_ids, :]
    all_seq_length[0:batch_size] = enc_seq_length[batch_row_ids]
    all_labels[0:batch_size] = [
        labeled_sig_by_example_id[row_id] if row_id in labeled_sig_by_example_id else -1
        for row_id in batch_row_ids
    ]

    # extra examples of the same class to create more positive examples
    next_row = batch_size
    for _ in range(add_same_class_examples):
        for sig_id in batch_signatures:
            drawn_example_id = np.random.choice(labeled_examples_by_signature_id[sig_id])
            all_inputs[next_row, :] = enc_inputs[drawn_example_id]
            all_labels[next_row] = sig_id
            all_seq_length[next_row] = enc_seq_length[drawn_example_id]
            next_row += 1

    batch_dict = {
        graph_nodes["x_jd"]: all_inputs,
        graph_nodes["x_jd_seq_lengths"]: all_seq_length,
        graph_nodes["y_lb_labels"]: all_labels,
    }

    if print_shapes:
        for node, values in batch_dict.items():
            logger.info("%s's shape: %s"%(node.name, values.shape))

    return batch_dict

In [19]:
# test get_batch
# Plain strings stand in for the graph nodes, so the returned feed dict can be
# inspected by name without building a tensorflow graph.
tb_graph_nodes = {
    "x_jd":"x_jd",
    "x_jd_seq_lengths":"x_jd_seq_lengths",
    "y_lb_labels":"y_lb_labels",
}

# fixture: six sequences of length two; only the examples 0 and 2 are labeled
tb_x_permutation_1 = [0,1,2,3,4,5]
tb_enc_sequences = np.array([[1,3], [1, 2], [3,4], [5,5], [6,3], [7,4]])
tb_enc_seqlength = np.array([1,2,2,1,1,1])
tb_batch_size=2
tb_labeled_sigs_by_row_id = {0:3, 2:5}
tb_labeled_examples_by_sig_id = {3:[0], 5:[2]}


# first batch of the identity permutation: examples 0 (labeled) and 1 (unlabeled => -1)
tb_b = get_batch(
    graph_nodes=tb_graph_nodes, 
    x_permutation=tb_x_permutation_1, 
    enc_inputs = tb_enc_sequences, 
    enc_seq_length= tb_enc_seqlength,
    labeled_sig_by_example_id=tb_labeled_sigs_by_row_id,
    labeled_examples_by_signature_id=tb_labeled_examples_by_sig_id,
    batch_size = tb_batch_size, 
    batch_index = 0
    )
assert tb_b['y_lb_labels'][0]==3
assert tb_b['y_lb_labels'][1]==-1
assert tb_b['x_jd_seq_lengths'][0]==1
assert tb_b['x_jd_seq_lengths'][1]==2
assert np.array_equal(tb_b['x_jd'][0], [1,3])
assert np.array_equal(tb_b['x_jd'][1], [1,2])
assert len(tb_b['y_lb_labels'])==2

# second batch of the identity permutation: examples 2 (labeled) and 3 (unlabeled)
tb_b = get_batch(
    graph_nodes=tb_graph_nodes, 
    x_permutation=tb_x_permutation_1, 
    enc_inputs = tb_enc_sequences, 
    enc_seq_length= tb_enc_seqlength,
    labeled_sig_by_example_id=tb_labeled_sigs_by_row_id,
    labeled_examples_by_signature_id=tb_labeled_examples_by_sig_id,
    batch_size = tb_batch_size, 
    batch_index = 1
    )
assert tb_b['y_lb_labels'][0]==5
assert tb_b['y_lb_labels'][1]==-1
assert tb_b['x_jd_seq_lengths'][0]==2
assert tb_b['x_jd_seq_lengths'][1]==1
assert np.array_equal(tb_b['x_jd'][0], [3,4])
assert np.array_equal(tb_b['x_jd'][1], [5,5])
assert len(tb_b['y_lb_labels'])==2

# test for different permutation
tb_b = get_batch(
    graph_nodes=tb_graph_nodes, 
    x_permutation=[5,1,4,2,3,0], 
    enc_inputs = tb_enc_sequences, 
    enc_seq_length= tb_enc_seqlength,
    labeled_sig_by_example_id=tb_labeled_sigs_by_row_id,
    labeled_examples_by_signature_id=tb_labeled_examples_by_sig_id,
    batch_size = tb_batch_size, 
    batch_index = 0
    )
assert tb_b['y_lb_labels'][0]==-1
assert tb_b['y_lb_labels'][1]==-1
assert tb_b['x_jd_seq_lengths'][0]==1
assert tb_b['x_jd_seq_lengths'][1]==2
assert np.array_equal(tb_b['x_jd'][0], [7,4])
assert np.array_equal(tb_b['x_jd'][1], [1,2])
assert len(tb_b['y_lb_labels'])==2

# test for different permutation + add class labels 
tb_b = get_batch(
    graph_nodes=tb_graph_nodes, 
    x_permutation=[5,1,4,2,3,0], # examples 
    enc_inputs = tb_enc_sequences, 
    enc_seq_length= tb_enc_seqlength,
    labeled_sig_by_example_id=tb_labeled_sigs_by_row_id,
    labeled_examples_by_signature_id=tb_labeled_examples_by_sig_id,
    batch_size = tb_batch_size, 
    add_same_class_examples = 1, 
    batch_index = 0
    )
assert len(tb_b['y_lb_labels'])==2 # should be two because none of the examples in the batch are labeled

# identity permutation + extra examples: one row is appended for the labeled example 0
tb_b = get_batch(
    graph_nodes=tb_graph_nodes, 
    x_permutation=tb_x_permutation_1, 
    enc_inputs = tb_enc_sequences, 
    enc_seq_length= tb_enc_seqlength,
    labeled_sig_by_example_id=tb_labeled_sigs_by_row_id,
    labeled_examples_by_signature_id=tb_labeled_examples_by_sig_id,
    batch_size = tb_batch_size,
    add_same_class_examples = 1, 
    batch_index = 0
    )

assert len(tb_b['y_lb_labels'])==3
assert tb_b['y_lb_labels'][0]==3
assert tb_b['y_lb_labels'][1]==-1
assert tb_b['y_lb_labels'][2]==3
# signature 3 has example 0 as its only member, so the random draw is deterministic here
assert np.array_equal(tb_b['x_jd'][2], [1,3])
assert tb_b['x_jd_seq_lengths'][2]==1

print(tb_b)



{'y_lb_labels': array([ 3, -1,  3], dtype=int32), 'x_jd': array([[1, 3],
       [1, 2],
       [1, 3]], dtype=int32), 'x_jd_seq_lengths': array([1, 2, 1], dtype=int32)}


# Model


## Input, Targets 

The inputs to our model are sequences of integers. Each integer denotes the id of a certain word in the vocabulary. The sequences of integers are zero-padded for training; for that reason, we also feed the length of each sequence to the model.

We embed these integers using a dense embedding matrix. 

In [20]:
graph_input_nodes = {} # store nodes that we want to use in our batch (key => placeholder, used by get_batch)

# inputs
with tf.variable_scope("Word_embeddings"):
    # trainable embedding matrix of shape [vocabulary size, STATE_SIZE]
    TOKEN_EMBEDDINGS = tf.get_variable(
        name="word_embeddings",
        shape=[VOCABULARY.size(), STATE_SIZE],
        dtype=DTYPE,
        initializer=tf.truncated_normal_initializer(stddev=0.5),
        regularizer=None,
        trainable=True,
    ) # each row is a dense vector for each word.

# Encoder Inputs
with tf.variable_scope("Encoder_Inputs"):
    x_jd = tf.placeholder(tf.int64, [None, None], "jd_x") # encoder inputs loglines [BATCH_SIZE, num_steps]
    x_jd_seq_lengths = tf.placeholder(tf.int64, [None], "sequence_lengths") # [BATCH_SIZE]
    graph_input_nodes["x_jd"] = x_jd
    graph_input_nodes["x_jd_seq_lengths"] = x_jd_seq_lengths
    
    # get_batch feeds -1 for examples without a label
    y_lb_labels = tf.placeholder(tf.int64, [None], "y") # signature for each loglines
    graph_input_nodes["y_lb_labels"] = y_lb_labels


## LSTM Encoder

Our encoding network. Just a plain LSTM. 

In [21]:
# %load ./includes/lstm_encode.py
# Encoder
def LSTMEncode(input_x, input_sequences_length, scope, name=""):
    """Encode zero padded token id sequences into L2 normalized vectors.

    The token ids are looked up in TOKEN_EMBEDDINGS and fed through a stack of
    NUM_LSTM_LAYERS LSTM layers with dropout. The final memory state (c) of the
    top layer, normalized to unit length, is the encoding.

    Every layer gets its own cell object; reusing one cell instance for
    several layers does not create independent layers. The encoding is read
    from the top layer ([-1]), which equals the previously used [0] for a
    single layer.

    Args:
        input_x: integer tensor of token ids, [batch_size, max_time].
        input_sequences_length: integer tensor with the true length of each
            sequence, [batch_size]; steps beyond it are not processed.
        scope: variable scope passed on to tf.nn.dynamic_rnn.
        name: unused; kept for backward compatibility.

    Returns:
        Tensor of shape [batch_size, STATE_SIZE] with unit length rows.
    """
    input_sequences = tf.nn.embedding_lookup(TOKEN_EMBEDDINGS, input_x) # [BATCH_SIZE, max_time, embedding_size]

    def build_layer_cell():
        # one LSTM layer with dropout on inputs, outputs and state
        lstm_cell = tf.contrib.rnn.LSTMCell(num_units=STATE_SIZE, state_is_tuple=True)
        return tf.contrib.rnn.DropoutWrapper(cell=lstm_cell,
                                             output_keep_prob=TF_KEEP_PROBABILTIY,
                                             input_keep_prob=TF_KEEP_PROBABILTIY,
                                             state_keep_prob=TF_KEEP_PROBABILTIY,
                                             dtype=DTYPE)

    encoder_cell = tf.contrib.rnn.MultiRNNCell(
        cells=[build_layer_cell() for _ in range(NUM_LSTM_LAYERS)],
        state_is_tuple=True)

    encoder_outputs, last_encoder_state = tf.nn.dynamic_rnn(
        cell=encoder_cell,
        dtype=DTYPE,
        sequence_length=input_sequences_length,
        inputs=input_sequences,
        scope=scope
        )
    # state of the top layer; h is hidden state, c = memory state https://arxiv.org/pdf/1409.2329.pdf
    last_c, last_h = last_encoder_state[-1]
    # NOTE(review): the memory state c (not the hidden state h) is used as encoding - confirm this is intended.
    z = tf.nn.l2_normalize(x=last_c, dim=1, epsilon=1e-12, name="OutputNormalization")
    return z

# Build the encoder on the input placeholders; z_jd holds the encodings of the fed batch.
with tf.variable_scope("Encode_Inputs") as encode_scope:
    z_jd = LSTMEncode(x_jd, x_jd_seq_lengths, encode_scope)

## Encode loglines

Operation for encoding all log lines into an embedding. We use this embedding to visualize the log lines in TensorBoard. 

In [22]:
def encode_loglines(enc_loglines, enc_seq_length, loglines, signatures, x_permutation, name="Batch1", batch_size=BATCH_SIZE):
    """Encode log lines batch by batch into a graph variable and export projector metadata.

    Only complete batches are encoded: the number of processed log lines is
    truncated to a multiple of `batch_size`.

    Side effects: runs ops on the module-level `session`, increments
    TF_GLOBAL_STEP once per encoded batch, saves a checkpoint to GRAPH_DIR,
    writes a tab-separated metadata file to GRAPH_DIR and links it to the
    encoding variable via link_embedding_to_metadata.

    Args:
        enc_loglines: encoder inputs, forwarded to get_batch as `enc_inputs`.
        enc_seq_length: sequence lengths, forwarded to get_batch.
        loglines: raw log lines; must provide `.shape[0]` and yield strings
            when iterated.
        signatures: signature id per log line, indexed by line position.
        x_permutation: index permutation forwarded to get_batch.
        name: name of the variable that stores the encodings; also part of
            the metadata file name.
        batch_size: number of log lines encoded per step.

    Returns:
        Tuple `(labels, encoded_loglines)`: the signature ids (as ints) of the
        encoded lines and the variable holding their encodings.
    """
    num_batches = loglines.shape[0] // batch_size
    num_log_lines = num_batches * batch_size # truncate to a whole number of batches

    # define graph
    with tf.variable_scope("LogLineEncodings"):
        encoded_loglines = tf.get_variable(name=name, shape=[num_log_lines, STATE_SIZE], dtype=tf.float32, initializer=tf.zeros_initializer(), trainable=False )
        tf_batch_idx = tf.Variable(0, trainable=False)
        update_indices = tf.range(tf_batch_idx*batch_size, tf_batch_idx*batch_size+batch_size)
        update_indices = tf.reshape(update_indices, shape = [-1, 1])
        encode_loglines_op = tf.scatter_nd_update(ref=encoded_loglines, indices=update_indices, updates=z_jd, name="encode_loglines_op")

    # Build the step increment once; creating it inside the loop would add a
    # new node to the graph for every batch.
    increment_global_step_op = TF_GLOBAL_STEP.assign_add(1)

    tfh.initialize_unitialized_variables(session)

    for current_batch in range(0,num_batches):
        if current_batch%10==0:
            print("(%i/%i) batches encoded"%(current_batch, num_batches))
        # get batch
        feed_dict  = get_batch(
            graph_input_nodes,
            x_permutation,
            batch_index=current_batch,
            enc_inputs=enc_loglines,
            enc_seq_length=enc_seq_length,
            batch_size=batch_size,
            labeled_sig_by_example_id={},
            labeled_examples_by_signature_id={},
            add_same_class_examples=0
        )
        feed_dict[tf_batch_idx]=current_batch # selects the rows of encoded_loglines to overwrite
        feed_dict[TF_KEEP_PROBABILTIY]=1.0 # keep probability 1.0: no dropout while encoding
        # update slice in encoded lines
        session.run(encode_loglines_op, feed_dict=feed_dict)
        session.run(increment_global_step_op)

    saver = tf.train.Saver(tf.global_variables()) # Saver
    saver.save(session, jp(GRAPH_DIR, MODEL_NAME), global_step=TF_GLOBAL_STEP)

    logger.info("%i log lines embedded into %i dimensional space (%s)."%(num_log_lines, STATE_SIZE, name))

    # write metadata
    LOGLINE_EMBEDDINGS_METADATA_FN = '%s-%s-%s.tsv'%(LOG_NAME, "logline_embeddings",name)

    labels = []

    with open( jp(GRAPH_DIR, LOGLINE_EMBEDDINGS_METADATA_FN) , "w") as mdf:
        mdf.write("%s\t%s\t%s\n"%("ProcessId", "SignatureId", "LogLine"))
        for line_id, line in enumerate(loglines):
            if line_id>=num_log_lines:
                break # lines beyond the last complete batch were not encoded

            # text between the first character and the first "]", up to the first space
            process_id = line.split("]")[0][1:].split(" ")[0]
            # drop the last character (presumably the trailing newline) and
            # replace all non ascii characters with a space
            sanitized_line = re.sub(r'[^\x00-\x7F]+',' ', line[:-1])
            signature_id = int(signatures[line_id])

            mdf.write("%s\t%s\t%s"%(process_id, signature_id, sanitized_line)+"\n")
            labels.append(signature_id)

    link_embedding_to_metadata(
        embedding_var=encoded_loglines,
        metadata_file=LOGLINE_EMBEDDINGS_METADATA_FN
    )
    return labels, encoded_loglines # labels are signature ids
    

# Cost function / objective

Intuitively, we minimize the difference between the anchor-positive distance and the anchor-negative distance. That is, we want positive examples to be closer to the anchor and negative examples to be farther away from it. 

Given our encoding function $e$, an anchor example $x_i^a$, a positive example $x_i^p$, a negative example $x_i^n$ and a margin $\alpha$, we minimize the following loss per training example $i$ of our batch:

$$
l_i = max\left(0,\|e(x_i^a)-e(x_i^p)\|_2^2 - \|e(x_i^a)-e(x_i^n)\|_2^2 + \alpha\right)
$$



*  Reference: [1], equation (3)


## get anchor positive examples

Each row in the batch tensor is treated as an anchor example. We want to determine which of the other examples in the batch are positive with respect to each anchor example. 

To determine whether another example is positive, we use two criteria: label equality if both examples are labeled, and the Jaccard distance of the sequences if one of the two examples is not labeled. 

To calculate the positive examples we use a pairwise comparison of each row with every other element in the batch. This usually leads to matrices of dimensions [batch_size, batch_size], where an element $x_{ij}$ of such a matrix is the value of $x_i$ compared to $x_j$.

In [23]:
# %load ./includes/anchor_positive_examples.py
# returns all euclidean distances for which either the labels are the same or the jaccard distance is below the positive threshold
def anchor_positive_examples(pw_label_equality, labels_in_batch, pw_jaccard_distances, pw_euclidean_distances, jd_pos_threshold):
    """Select the pairwise euclidean distances that belong to anchor-positive pairs.

    A pair counts as positive either through its labels (where
    `labels_in_batch` is True) or through its jaccard distance being below
    `jd_pos_threshold` (where `labels_in_batch` is False).

    Args:
        pw_label_equality: int32 pairwise matrix; the caller passes a label
            equality matrix cast to int32 (assumed to contain 0/1).
        labels_in_batch: boolean pairwise mask choosing between the label
            criterion (True) and the jaccard criterion (False).
        pw_jaccard_distances: float pairwise matrix of jaccard distances.
        pw_euclidean_distances: pairwise matrix of distances to select from.
        jd_pos_threshold: pairs with a jaccard distance strictly below this
            value are positive under the jaccard criterion.

    Returns:
        Tuple `(positive_ed, pos_because_of_labels, pos_because_of_jd)`, where
        `positive_ed` equals `pw_euclidean_distances` at positive pairs and -1
        everywhere else, and the two masks tell which criterion fired.
    """
    n = tf.shape(pw_label_equality)[0]

    # Label criterion: True where the equality matrix differs from the
    # identity matrix, which removes an example's match with itself.
    differs_from_identity = tf.not_equal(pw_label_equality, tf.eye(n, dtype=tf.int32))

    # Jaccard criterion: jaccard distances lie between 0 and 1, so adding 1.5
    # on the diagonal keeps an example from being its own positive.
    jd_diagonal_masked = tf.add(pw_jaccard_distances, tf.eye(n) * 1.5)
    jd_below_threshold = tf.less(x=jd_diagonal_masked, y=jd_pos_threshold, name="jd_pos_cond")

    pos_because_of_labels = tf.logical_and(labels_in_batch, differs_from_identity)
    pos_because_of_jd = tf.logical_and(tf.logical_not(labels_in_batch), jd_below_threshold)
    is_positive = tf.logical_or(pos_because_of_labels, pos_because_of_jd)

    # -1 marks "not an anchor-positive pair"
    not_positive = tf.ones_like(pw_euclidean_distances) * -1
    positive_ed = tf.where(condition=is_positive, x=pw_euclidean_distances, y=not_positive)
    return positive_ed, pos_because_of_labels, pos_because_of_jd
logger.info("anchor_positive_examples defined.")

INFO:__main__:anchor_positive_examples defined.


In [24]:
# Run the test suite from the project's `testing` module against the function defined above.
testing.run_tests_on(anchor_positive_examples)   

..........
----------------------------------------------------------------------
Ran 10 tests in 0.796s

OK


## get anchor negative examples

Same as anchor positive examples, only the other case. 

In [25]:
# %load ./includes/anchor_negative_examples.py
def anchor_negative_examples(pw_label_equality, labels_in_batch, pw_jaccard_distances, pw_euclidean_distances, jd_neg_threshold):
    """Select the pairwise euclidean distances that belong to anchor-negative pairs.

    A pair counts as negative either through its labels (where
    `labels_in_batch` is True and the label equality entry is 0) or through
    its jaccard distance being at least `jd_neg_threshold` (where
    `labels_in_batch` is False).

    Args:
        pw_label_equality: int32 pairwise matrix; the caller passes a label
            equality matrix cast to int32 (assumed to contain 0/1).
        labels_in_batch: boolean pairwise mask choosing between the label
            criterion (True) and the jaccard criterion (False).
        pw_jaccard_distances: float pairwise matrix of jaccard distances.
        pw_euclidean_distances: pairwise matrix of distances to select from.
        jd_neg_threshold: pairs with a jaccard distance greater than or equal
            to this value are negative under the jaccard criterion.

    Returns:
        Tuple `(negative_ed, neg_because_of_labels, neg_because_of_jd)`, where
        `negative_ed` equals `pw_euclidean_distances` at negative pairs and -1
        everywhere else, and the two masks tell which criterion fired.
    """
    n = tf.shape(pw_label_equality)[0]

    # Label criterion: the equality entry is 0.
    label_equality_is_zero = tf.equal(pw_label_equality, tf.zeros_like(pw_label_equality, dtype=tf.int32), name="la_neg_cond")

    # Jaccard criterion: subtracting 1 on the diagonal lowers an example's
    # distance to itself so that it is not treated as its own negative.
    jd_diagonal_masked = tf.add(pw_jaccard_distances, tf.eye(n) * -1.0)
    jd_at_least_threshold = tf.greater_equal(x=jd_diagonal_masked, y=jd_neg_threshold, name="jd_neg_cond")

    neg_because_of_labels = tf.logical_and(labels_in_batch, label_equality_is_zero)
    neg_because_of_jd = tf.logical_and(tf.logical_not(labels_in_batch), jd_at_least_threshold)
    is_negative = tf.logical_or(neg_because_of_labels, neg_because_of_jd)

    # -1 marks "not an anchor-negative pair"
    not_negative = tf.ones_like(pw_euclidean_distances) * -1
    negative_ed = tf.where(condition=is_negative, x=pw_euclidean_distances, y=not_negative)
    return negative_ed, neg_because_of_labels, neg_because_of_jd

logger.info("anchor_negative_examples defined.")

INFO:__main__:anchor_negative_examples defined.


In [26]:
# Run the test suite from the project's `testing` module against the function defined above.
testing.run_tests_on(anchor_negative_examples)

............
----------------------------------------------------------------------
Ran 12 tests in 1.275s

OK


## Get pairwise Jaccard indices for training sequences

Sørensen's original formula:
$$ 
    Q_{sørensen} = \frac{2 \cdot | X \cap Y |}{| X |+| Y |}
$$ 
with $ |X| $ being the number of elements in the set. In our situation, we treat each token of our sequence as an element of the set. 

We use the Jaccard distance because we want a quantity that we can minimize. The Dice coefficient ranges from 0 for totally non-overlapping strings to 1 for identical strings, whereas the Jaccard distance is 0 if two sequences are identical and 1 if they are completely different. (Strictly speaking, $1 - Q_{sørensen}$ is the Sørensen–Dice distance; this notebook refers to it as the Jaccard distance.) 

$$
    Q_{jaccard} = 1 - Q_{sørensen}
$$

In [27]:
# %load ./includes/get_positive_and_anchor_examples.py
# https://stackoverflow.com/questions/41806689/tensorflow-get-indices-of-array-rows-which-are-zero
# get pairwise jaccard indizes for all samples in the batch


"""
    Get all anchor-positive / anchor-negative example permuations in  batch
"""
def get_positive_and_anchor_examples(
                                    input_x, # input sequences 
                                    input_z, # encoded input sequences in embedding space z
                                    labels_y, # labels for input sequences
                                    jd_pos_threshold=JD_POS_THRESHOLD, jd_neg_threshold=JD_NEG_THRESHOLD):
    """Build all anchor-positive / anchor-negative distance combinations of a batch.

    Args:
        input_x: batch of input sequences; its first dimension is the batch size.
        input_z: encodings of `input_x` in the embedding space.
        labels_y: labels of the input sequences.
        jd_pos_threshold: jaccard threshold forwarded to anchor_positive_examples.
        jd_neg_threshold: jaccard threshold forwarded to anchor_negative_examples.

    Returns:
        Tuple `(pos_row, neg_col, stats)`. `pos_row` and `neg_col` are both
        reshaped to `[batch_size * batch_size, batch_size]`; invalid pairs
        carry the -1 marker set by the anchor_* helpers. `stats` is the tuple
        `(num_neg_la, num_neg_jd, num_pos_la, num_pos_jd)` counting how many
        pairs were selected by label and by jaccard distance.
    """
    def _count_true(mask):
        # number of True entries of a boolean tensor
        return tf.reduce_sum(tf.cast(mask, dtype=tf.int32))

    n = tf.shape(input_x)[0]

    # pairwise quantities, each [BATCH_SIZE, BATCH_SIZE]
    jaccard = pairwise_jaccard_indices(x1=input_x, x2=input_x)
    _, squared_ed = pairwise_euclidean_distances(x1=input_z, x2=input_z)
    label_eq = tf.cast(pairwise_label_equality(labels_y), tf.int32)
    labeled_mask = pairwise_labels_in_batch(labels_y)

    # distances of the valid anchor-positive / anchor-negative pairs
    positive_ed, pos_by_label, pos_by_jd = anchor_positive_examples(label_eq, labeled_mask, jaccard, squared_ed, jd_pos_threshold)
    negative_ed, neg_by_label, neg_by_jd = anchor_negative_examples(label_eq, labeled_mask, jaccard, squared_ed, jd_neg_threshold)

    # Lay out every a=>p / a=>n combination of an anchor row side by side.
    # With positive_ed = [[a,b],[c,d]] and negative_ed = [[e,f],[g,h]]:
    #
    #   pos_row = [[a,a],      neg_col = [[e,f],
    #              [b,b],                 [e,f],
    #              [c,c],                 [g,h],
    #              [d,d]]                 [g,h]]
    #
    # so pos_row - neg_col = [[a-e, a-f], [b-e, b-f], [c-g, c-h], [d-g, d-h]],
    # i.e. each positive of an anchor is paired with each negative of that anchor.
    flat_positive = tf.reshape(positive_ed, [-1, 1])
    pos_row = tf.tile(flat_positive, [1, n])
    repeated_negative = tf.tile(negative_ed, [1, n])
    neg_col = tf.reshape(repeated_negative, [-1, n])

    # statistics on how many pairs were selected by which criterion
    stats = (
        _count_true(neg_by_label),
        _count_true(neg_by_jd),
        _count_true(pos_by_label),
        _count_true(pos_by_jd),
    )

    return pos_row, neg_col, stats

In [28]:
import tensorflow as tf
import numpy as np
import unittest
from unittest import *

# Fixtures for the tests below: a batch of three examples.

# token-id sequences, three sequences of four tokens each
x1 = np.array([
    [1,2,3,4],
    [4,5,6,1],
    [8,2,3,4]
])

# two-dimensional encodings, one row per sequence in x1
z1 = np.array([
    [0.2,0.4],
    [0.8,0.2],
    [0.3,0.3],
])

# one label per sequence; the first and the third example share label 1
y1 = np.array([1,2,1])


def run_tests_on(get_positive_and_anchor_examples):
    """Run shape and layout tests against `get_positive_and_anchor_examples`.

    The tests feed the module-level fixtures x1, z1 and y1 and check the
    shapes of `pos_row` / `neg_col` as well as their tiling pattern. Results
    are printed by a unittest TextTestRunner; nothing is returned.

    Args:
        get_positive_and_anchor_examples: the function under test.
    """

    class TestGetPositiveAndAnchorExamplesJDLA(TestCase):
        def setUp(self):
            self.input_x = tf.placeholder(shape=[None, None], dtype=tf.int32) # [batch_size, max_seq_length]
            # The encodings are floats (see z1); an int32 placeholder would
            # not represent the fixture values faithfully.
            self.input_z = tf.placeholder(shape=[None, None], dtype=tf.float32) # [batch_size, embedding_dim]
            self.labels_y = tf.placeholder(shape=[None], dtype=tf.int32) # [batch_size]
            self.jd_pos_threshold = tf.placeholder(shape=(), dtype=tf.float32)
            self.jd_neg_threshold = tf.placeholder(shape=(), dtype=tf.float32)

            self.op = get_positive_and_anchor_examples(
                input_x=self.input_x,
                input_z=self.input_z,
                labels_y=self.labels_y,
                jd_pos_threshold=self.jd_pos_threshold,
                jd_neg_threshold=self.jd_neg_threshold
            )

            self.session = tf.Session()
            self.session.run(tf.global_variables_initializer())

        def run_op(self, x,z,y,pt,nt):#shortcut
            """Evaluate the op under test for the given inputs and thresholds."""
            return self.session.run(self.op,feed_dict={
                self.input_x:x,
                self.input_z:z,
                self.labels_y:y,
                self.jd_pos_threshold:pt,
                self.jd_neg_threshold:nt
            })

        def test_shapes(self):
            # both outputs are [batch_size * batch_size, batch_size]
            batch_size = x1.shape[0]
            pos_row, neg_col, stats =  self.run_op(x1,z1,y1, 0.2, 0.8)

            self.assertEqual(pos_row.shape[0], batch_size * batch_size)
            self.assertEqual(pos_row.shape[1], batch_size)
            self.assertEqual(neg_col.shape[0], batch_size * batch_size)
            self.assertEqual(neg_col.shape[1], batch_size)

        def test_pos_row_value_equality(self):
            # every row of pos_row repeats one value across all columns
            batch_size = x1.shape[0]
            pos_row, neg_col, stats =  self.run_op(x1,z1,y1, 0.2, 0.8)

            for row_id in range(batch_size*batch_size):
                for col_id in range(batch_size):
                    self.assertEqual(pos_row[row_id][0], pos_row[row_id][col_id])

        def test_neg_col_value_equality(self):
            # within each group of batch_size consecutive rows, all rows of
            # neg_col equal the first row of that group
            batch_size = x1.shape[0]
            pos_row, neg_col, stats =  self.run_op(x1,z1,y1, 0.2, 0.8)
            for row_id in range(batch_size*batch_size):
                first_row_of_group = (row_id // batch_size) * batch_size
                for col_id in range(batch_size):
                    self.assertEqual(neg_col[first_row_of_group][col_id], neg_col[row_id][col_id])

        def tearDown(self):
            self.session.close()

    # Load the tests from the TestCase class itself. Passing an instance to
    # loadTestsFromModule only found the tests via the instance's __class__
    # attribute.
    suite = unittest.TestSuite()
    suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGetPositiveAndAnchorExamplesJDLA))
    unittest.TextTestRunner().run(suite)

# 

In [29]:
# TODO move to tests
# Runs the notebook-local run_tests_on defined in the previous cell (not testing.run_tests_on).
run_tests_on(get_positive_and_anchor_examples)


...
----------------------------------------------------------------------
Ran 3 tests in 0.493s

OK


## permutation hinge loss

In [30]:
def permutation_hinge_loss(pos_anchor_dist, neg_anchor_dist, alpha=ALPHA_JACCARD, use_mean_loss=False):
    """Triplet hinge loss over all anchor-positive / anchor-negative combinations.

    Args:
        pos_anchor_dist: anchor-positive distances laid out as
            [batch_size * batch_size, batch_size]; negative values mark
            invalid entries.
        neg_anchor_dist: anchor-negative distances in the same layout;
            negative values mark invalid entries.
        alpha: margin of the hinge loss.
        use_mean_loss: if True, the summed loss is divided by the number of
            entries of the per-example loss matrix that are greater than zero.

    Returns:
        Tuple `(loss, num_non_zero_perms)`: the scalar loss and the number of
        combinations that contribute a loss greater than zero.
    """
    # pos_anchor_dist = [batch_size * batchsize, batch_size]
    batch_size=tf.shape(pos_anchor_dist)[1]

    # condition: exclude all invalid examples
    # we want: distance a=>n should be larger than the distance a=>p+margin
    # we want to catch examples where the distance a=>n-margin is smaller than the distance of the positive anchors
    neg_greater_zero = tf.greater_equal(neg_anchor_dist, tf.zeros_like(neg_anchor_dist)) # all the valid negative examples
    pos_greater_zero = tf.greater_equal(pos_anchor_dist, tf.zeros_like(pos_anchor_dist)) # permuted with all valid positive ones
    # True where the margin constraint is violated, i.e. d(a,n) - alpha < d(a,p)
    d_pos_less_than_d_neg = tf.less(x=neg_anchor_dist-alpha, y=pos_anchor_dist, name="constraint_cond")

    # loss calculation for all permutations
    loss = tf.maximum(pos_anchor_dist-neg_anchor_dist+alpha, 0, "hinge_loss") # loss is large if pos_anchor_dist is large and neg_anchor_dist is small
    is_valid_and_violating = tf.logical_and(tf.logical_and(neg_greater_zero,pos_greater_zero),d_pos_less_than_d_neg)
    permuations_loss  = tf.where(is_valid_and_violating, loss, tf.zeros_like(pos_anchor_dist))
    # => shape [BATCH_SIZE*BATCH_SIZE, BATCH_SIZE]. This shape is because we only want all possible combination per row of the batch

    # num_non_zero_perms is a scalar over the whole batch, so the row sums
    # below are divided by the batch-wide count; summing them afterwards
    # yields the mean loss over all contributing combinations.
    num_non_zero_perms = tf.reduce_sum(tf.cast(tf.greater(x=permuations_loss, y=tf.zeros_like(permuations_loss)), tf.float32))
    mean_permutation_loss = tf.reduce_sum(permuations_loss , axis=1) / tf.maximum(num_non_zero_perms, 1)  # guard against division by zero
    # => shape [BATCH_SIZE*BATCH_SIZE]

    per_example_loss = tf.reshape(mean_permutation_loss, [batch_size,batch_size]) # all valid permutations per example
    num_exp_loss_greater_zero = tf.reduce_sum(tf.cast(tf.greater(x=per_example_loss, y=tf.zeros_like(per_example_loss)), tf.float32))

    if use_mean_loss:
        # Same guard as above: a batch without any violating combination would
        # otherwise produce 0/0 = NaN.
        return tf.reduce_sum(per_example_loss) / tf.maximum(num_exp_loss_greater_zero, 1), num_non_zero_perms
    else:
        return tf.reduce_sum(per_example_loss), num_non_zero_perms
logger.info("Done.")

INFO:__main__:Done.


## Loss calculation

In [31]:
# Wire the loss into the graph: build all anchor-positive / anchor-negative
# distance combinations for the input placeholders and their encodings, then
# compute the hinge loss (and the number of contributing combinations) on them.
with tf.variable_scope("jd_la_ma_hinge_loss"):
    jd_pos_anchor_dist, jd_neg_anchor_dist, loss_stats = get_positive_and_anchor_examples(input_x=x_jd, input_z=z_jd, labels_y=y_lb_labels)
    jd_la_ma_hinge_loss, num_non_zero_perms_op = permutation_hinge_loss(jd_pos_anchor_dist, jd_neg_anchor_dist, alpha=ALPHA_JACCARD)
logger.info("done")

INFO:__main__:done


In [32]:
# Project helper; judging by its log output it verifies that the loss has a
# gradient with respect to at least one parameter.
tfh.ensure_gradient_flow(operations=[jd_la_ma_hinge_loss]) 
# the hinge loss is the only term of the total loss
total_loss_op = jd_la_ma_hinge_loss


INFO:library.tensorflow_helpers:Operations [['jd_la_ma_hinge_loss/Sum_7:0']] have at least 1 gradient for at least 1 parameter


# Optimization procedure

The gradient descent algorithm. Get the gradients, multiply them with the learning rate and apply them to the parameters. 

In [33]:
# Training step: exponentially decayed learning rate, RMSProp, gradients
# clipped by their global norm.
with tf.variable_scope("Training_Step"):
    # learning rate schedule: staircase exponential decay driven by the global step
    tf_learning_rate = tf.train.exponential_decay(
      learning_rate= TF_LEARNING_RATE,
      global_step=TF_GLOBAL_STEP,
      decay_steps=DECAY_EVERY_X_STEPS, 
      decay_rate= LEARNING_RATE_DECAY_FACTOR,
      staircase=True)

    # get gradients for all trainable parameters with respect to our loss function
    tf_params = tf.trainable_variables()
    optimizer = tf.train.RMSPropOptimizer(learning_rate=tf_learning_rate)
    gradients = tf.gradients(total_loss_op, tf_params)
    
    # clip the gradients so that their global norm does not exceed MAX_GRADIENT_NORM
    clipped_gradients, gradient_norm = tf.clip_by_global_norm(gradients,MAX_GRADIENT_NORM)

    # Update operation; it also increments TF_GLOBAL_STEP, which drives the learning rate decay above
    training_step_op  = optimizer.apply_gradients(zip(clipped_gradients, tf_params), global_step=TF_GLOBAL_STEP)

# Training 




## Summaries 

In [34]:
# keep track of loss over time
# keep track of loss over time
tf.summary.scalar("hinge_loss",tf.cast(jd_la_ma_hinge_loss , tf.float32)) # summary for loss
tf.summary.scalar('total_loss', tf.cast(total_loss_op, tf.float32))

# number of combinations per batch that contribute a loss greater than zero
tf.summary.scalar("num_non_zero_perms", tf.cast(num_non_zero_perms_op, tf.float32))

# how many pairs were selected by label (la) and by jaccard distance (jd);
# loss_stats is (num_neg_la, num_neg_jd, num_pos_la, num_pos_jd)
tf.summary.scalar("num_la_neg_examples",tf.cast(loss_stats[0], tf.float32)) # negatives chosen by label
tf.summary.scalar('num_jd_neg_examples', tf.cast(loss_stats[1], tf.float32)) # negatives chosen by jaccard distance
tf.summary.scalar("num_la_pos_examples",tf.cast(loss_stats[2], tf.float32)) # positives chosen by label
tf.summary.scalar('num_jd_pos_examples', tf.cast(loss_stats[3], tf.float32)) # positives chosen by jaccard distance


# histograms for all trainable variables
for tv in tf.trainable_variables():
    tf.summary.histogram(tv.name.replace(":","_"),tv)  # ":" in the variable name is replaced by "_" for the summary name
  
 
# metadata file for the token embeddings: one token per line, ordered by vocabulary index
WORD_EMBEDDINGS_METADATA_FN = '%s-%s.tsv'%(LOG_NAME, "token_embeddings")
with open( jp(GRAPH_DIR, WORD_EMBEDDINGS_METADATA_FN) , "w") as mdf:
    for idx in sorted([int(k) for k in VOCABULARY.index_to_token.keys()]):
         mdf.write("%s\n"%(VOCABULARY.idx_to_token(idx)))
logger.info("Done")

INFO:__main__:Done


## Write metadata for token embeddings

In [35]:
def link_embedding_to_metadata(embedding_var, metadata_file):
    """Register `metadata_file` as projector metadata for `embedding_var`.

    Args:
        embedding_var: the variable to visualize; its name without the
            trailing ":0" is used as the tensor name.
        metadata_file: path of the metadata file, stored as the embedding's
            metadata path.
    """
    # NOTE(review): every call builds a config that contains only this one
    # embedding; presumably a later call replaces the projector config of an
    # earlier one in GRAPH_DIR. Verify if several embeddings must stay linked.
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = embedding_var.name.replace(":0","")
    embedding.metadata_path = metadata_file
    summary_writer = tf.summary.FileWriter(GRAPH_DIR)
    try:
        projector.visualize_embeddings(summary_writer, config)
    finally:
        # the writer is only needed for this call; close it so that repeated
        # calls do not accumulate open writers
        summary_writer.close()

link_embedding_to_metadata(TOKEN_EMBEDDINGS, WORD_EMBEDDINGS_METADATA_FN )


## Initialize Tensorflow session

In [None]:
# Create the session, the summary writer and the saver, and initialize all variables.
session = tf.Session()
#logger.info("Random Seed: %0.3f"%session.run(tf.random_normal([1], mean=-1, stddev=4, seed=RANDOM_SEED))[0])

# summaries for training
all_summaries_op = tf.summary.merge_all()
summary_writer = tf.summary.FileWriter(GRAPH_DIR)
summary_writer.add_graph(session.graph)

# saver used for checkpoints; nothing is restored here, all variables are freshly initialized
saver = tf.train.Saver(tf.global_variables()) # Saver
session.run([
    tf.local_variables_initializer(),
    tf.global_variables_initializer(),
])

logger.info("Vocabulary Size: %i, max. sequence length(enc): %i"%(VOCABULARY.size(), MAX_ENC_SEQ_LENGTH))
logger.info("Trainings examples: %i,  batch size: %i"%(NUM_TRAININGS_SEQUENCES, BATCH_SIZE))
logger.info("Epochs: %i, batches per epoch:%i\n"%(NUM_EPOCHS, STEPS_PER_EPOCH ))

#session.graph.finalize() # prevent nodes being added to graph
st = time.time() # start time of the training, used for the ETA and the total duration
current_step=0
TOTAL_STEPS = NUM_EPOCHS*STEPS_PER_EPOCH # total number of training steps over all epochs


INFO:__main__:Vocabulary Size: 101872, max. sequence length(enc): 176
INFO:__main__:Trainings examples: 427400,  batch size: 100
INFO:__main__:Epochs: 30, batches per epoch:4274



## Trainingsloop

In [None]:
# Training loop: NUM_EPOCHS epochs of STEPS_PER_EPOCH batches each. Every step
# runs the training op, writes the summaries and records the loss and the
# number of contributing combinations for the per-epoch statistics.
logger.info("Started training... ")
early_stopping=0 # number of epochs whose mean count of contributing combinations was below 10
for current_epoch in range(0,NUM_EPOCHS):
    # shuffle trainings data
    x_permutation = np.random.permutation(ENCODER_INPUTS_TRAIN.shape[0])
    epoch_losses = []
    epoch_valid_perms = []
    
    for current_batch in range(0, STEPS_PER_EPOCH):
        current_step = session.run(TF_GLOBAL_STEP) # incremented by training_step_op (apply_gradients)

        # get current batch
        feed_dict  = get_batch( # expects unshuffled inputs
            graph_input_nodes,
            x_permutation,
            batch_index=current_batch, 
            enc_inputs=ENCODER_INPUTS_TRAIN, enc_seq_length=ENCODER_SEQLENGTH_TRAIN, 
            batch_size=BATCH_SIZE, 
            labeled_sig_by_example_id=labeled_signatures_by_example_id_TRAIN, labeled_examples_by_signature_id=labeled_examples_by_sig_id_TRAIN,  # this needs to be for trainings data as well
            add_same_class_examples=ADD_EXTRA_POSITIVE_EXAMPLES_PER_CLASS, 
            print_shapes=False
        )
        
        # run trainings operation
        _, summaries, total_loss, valid_perms = session.run(fetches=[
                training_step_op,
                all_summaries_op,
                total_loss_op,
                num_non_zero_perms_op, 
            ],
            feed_dict=feed_dict
        )
        # write summaries and metadata info to graph
        summary_writer.add_summary(summaries, current_step)
        
        epoch_valid_perms.append(valid_perms)
        
        # print training progress every now and then; despite its name, the
        # flag fires every STEPS_PER_EPOCH/2 global steps
        at_tenth_of_an_epoch = (current_step+1)%( max(1,int(STEPS_PER_EPOCH/2)) )==0
        if at_tenth_of_an_epoch:
            time_dif_in_s =  (time.time()-st)
            # current_step is 0 at the very first step, which would divide by zero
            time_per_step = time_dif_in_s/max(current_step, 1)
            steps_left = TOTAL_STEPS-current_step
            eta = steps_left*time_per_step/60 # in minutes
            
            logger.info("[Train]: Epoch (%0.2d/%0.2d), Batch: (%0.3d/%0.3d)"%(current_epoch+1, NUM_EPOCHS, current_batch+1,STEPS_PER_EPOCH))
            logger.info("[Train]: Total loss: %0.5f, ETA: ~%0.3fm"%(total_loss, eta))

        if total_loss>0.01: # only consider losses where valid triplets have been found
            epoch_losses.append(total_loss)
    
    # an epoch without any recorded loss is reported as nan instead of taking
    # the mean of an empty list
    mean_epoch_loss = np.mean(epoch_losses) if epoch_losses else float("nan")
    mean_valid_perms = np.mean(epoch_valid_perms) if epoch_valid_perms else float("nan")

    logger.info("###################################")        
    logger.info("[EPOCH %0.3d TRAIN]: Mean Epoch Loss: %0.5f, Mean Valid Perms: %0.1f\n\n"%(current_epoch+1, mean_epoch_loss, mean_valid_perms ) )
    
    if current_epoch%5==0:
        logger.info ("Saving checkpoint, current learningrate: %0.5f"%session.run(tf_learning_rate))
        saver.save(session, jp(GRAPH_DIR, MODEL_NAME), global_step=TF_GLOBAL_STEP)

    if mean_valid_perms<10:
        early_stopping+=1
        
    if early_stopping>=10:
        # NOTE(review): the break is commented out, so this only logs and
        # training continues - confirm that this is intended
        logger.info("Early stopping reached.")
        #break
    
et = time.time() # end time of the training

INFO:__main__:Started training... 
INFO:__main__:[Train]: Epoch (01/30), Batch: (2137/4274)
INFO:__main__:[Train]: Total loss: 0.00000, ETA: ~521.506m
INFO:__main__:[Train]: Epoch (01/30), Batch: (4274/4274)
INFO:__main__:[Train]: Total loss: 0.00000, ETA: ~516.037m
INFO:__main__:###################################
INFO:__main__:[EPOCH 001 TRAIN]: Mean Epoch Loss: 0.17917, Mean Valid Perms: 13.2


INFO:__main__:Saving checkpoint, current learningrate: 0.00950
INFO:__main__:[Train]: Epoch (02/30), Batch: (2137/4274)
INFO:__main__:[Train]: Total loss: 0.00000, ETA: ~509.409m
INFO:__main__:[Train]: Epoch (02/30), Batch: (4274/4274)
INFO:__main__:[Train]: Total loss: 0.00000, ETA: ~501.679m
INFO:__main__:###################################
INFO:__main__:[EPOCH 002 TRAIN]: Mean Epoch Loss: 0.13491, Mean Valid Perms: 1.3


INFO:__main__:[Train]: Epoch (03/30), Batch: (2137/4274)
INFO:__main__:[Train]: Total loss: 0.00000, ETA: ~493.652m
INFO:__main__:[Train]: Epoch (03/30), Batch: (4274/4274

INFO:__main__:[Train]: Epoch (23/30), Batch: (4274/4274)
INFO:__main__:[Train]: Total loss: 0.00000, ETA: ~128.659m
INFO:__main__:###################################
INFO:__main__:[EPOCH 023 TRAIN]: Mean Epoch Loss: 0.12464, Mean Valid Perms: 0.3


INFO:__main__:Early stopping reached.
INFO:__main__:[Train]: Epoch (24/30), Batch: (2137/4274)
INFO:__main__:[Train]: Total loss: 0.00000, ETA: ~119.481m
INFO:__main__:[Train]: Epoch (24/30), Batch: (4274/4274)
INFO:__main__:[Train]: Total loss: 0.00000, ETA: ~110.306m
INFO:__main__:###################################
INFO:__main__:[EPOCH 024 TRAIN]: Mean Epoch Loss: 0.12434, Mean Valid Perms: 0.4


INFO:__main__:Early stopping reached.
INFO:__main__:[Train]: Epoch (25/30), Batch: (2137/4274)
INFO:__main__:[Train]: Total loss: 0.00000, ETA: ~101.126m
INFO:__main__:[Train]: Epoch (25/30), Batch: (4274/4274)
INFO:__main__:[Train]: Total loss: 0.00000, ETA: ~91.940m
INFO:__main__:###################################
INFO:__main__:[EPOCH 025 TRAI

In [None]:
# wall-clock duration between the time stamps st (before training) and et (after training), in minutes
print("Time: %0.3f"%( (et-st)/60 ))


## Encode log lines

* Encode all log lines with the final learned model

In [None]:
# Encode all test examples with the trained model, batch by batch, into a
# memory-mapped array stored in DATA_DIR.
n_test_examples = ENCODER_INPUTS_TEST.shape[0]
batch_size = 100
# ceiling division: a trailing partial batch gets its own step, but no empty
# step is run when n_test_examples is a multiple of batch_size
num_embedding_steps = (n_test_examples + batch_size - 1) // batch_size
EMBEDDED_TEST_INPUT = np.memmap(filename=jp(DATA_DIR, LOG_NAME+"_test_embedded.mm"), shape=(n_test_examples,STATE_SIZE), dtype="float32", mode="w+")

logger.info("encoding %i test examples"%n_test_examples)
for i in range(num_embedding_steps):
    if (i+1)%10==0:
        print("Step %i/%i"%(i,num_embedding_steps))
    start_index = i * batch_size
    end_index = min((i+1)*batch_size, n_test_examples )
    
    batch_enc_inp = ENCODER_INPUTS_TEST[start_index:end_index, :]
    batch_seq_len = ENCODER_SEQLENGTH_TEST[start_index:end_index]
    
    # keep probability 1.0: no dropout while encoding
    feed_dict = {x_jd:batch_enc_inp, x_jd_seq_lengths:batch_seq_len, TF_KEEP_PROBABILTIY:1.0}
    EMBEDDED_TEST_INPUT[start_index:end_index,:] = session.run(z_jd, feed_dict=feed_dict ) 

# write the encodings through to the file on disk
EMBEDDED_TEST_INPUT.flush()

logger.info("Done.")

# Evaluation  

* http://www.johnmyleswhite.com/notebook/2014/03/24/a-note-on-the-johnson-lindenstrauss-lemma/
* https://stackoverflow.com/questions/25104733/how-to-efficiently-calculate-huge-matrix-multiplication-tfidf-features-in-pyth

## Validation rate (d)

In [None]:
# %load ./includes/validation_rate.py
def evaluate_shard(out_csv_name, pw_ji, labels_x, labels_y, d = 0.00, d_step = 0.005, d_max=1.0):
    """Write the confusion counts of one shard for a range of distance thresholds.

    A pair (i, j) is predicted "same" when `pw_ji[i, j] <= d`, and is truly
    "same" when `labels_x[i] == labels_y[j]`. A header row and then one row
    per threshold are written to `out_csv_name` through `h.save_to_csv`.

    Args:
        out_csv_name: name of the output file; it is overwritten.
        pw_ji: pairwise distance matrix of shape
            [len(labels_x), len(labels_y)].
        labels_x: labels of the row examples.
        labels_y: labels of the column examples; may differ in length from
            `labels_x`.
        d: first distance threshold.
        d_step: increment between thresholds; must be positive.
        d_max: thresholds are evaluated until `d` exceeds this value.

    Raises:
        ValueError: if `d_step` is not positive (the loop would never end).
    """
    if d_step <= 0:
        raise ValueError("d_step must be positive, got %r" % (d_step,))

    h.save_to_csv(data_rows=[[
        "Distance Threshhold",
        "True Positives", 
        "False Positives", 
        "True Negative", 
        "False Negative", 
        "Num True Same", 
        "Num True Diff", 
    ]], outfile_name=out_csv_name, mode="w")

    # ground truth from the labels: broadcasting a column against a row
    # compares every row label with every column label
    row_labels = np.asarray(labels_x).reshape((-1, 1))
    col_labels = np.asarray(labels_y).reshape((1, -1))
    p_same = np.equal(row_labels, col_labels)
    p_diff = np.logical_not(p_same)
    num_true_same = p_same.sum()
    num_true_diff = p_diff.sum()

    # The threshold is advanced by repeated addition on purpose: the number of
    # rows written depends on it, and run_evaluation sizes its arrays with
    # np.arange(d_start, d_max, d_step).
    while True:
        calc_same = pw_ji <= d
        calc_diff = np.logical_not(calc_same)

        tp = np.sum(np.logical_and(calc_same, p_same))
        fp = np.sum(np.logical_and(calc_same, p_diff))
        tn = np.sum(np.logical_and(calc_diff, p_diff))
        fn = np.sum(np.logical_and(calc_diff, p_same))

        h.save_to_csv(data_rows=[[d, tp, fp, tn, fn,num_true_same,num_true_diff]], outfile_name=out_csv_name, mode="a")

        d+=d_step
        if d>d_max:
            break

def evaluate_all_shards(inputs, labels, shard_size,shard_indizes,  results_fn, d_start=0.0, d_step=0.005, d_max=1.0 ):
    """Evaluate every requested pair of shards and write one result file per pair.

    For each `(shard_x, shard_y)` in `shard_indizes`, the euclidean distances
    between the examples of shard x and those of shard y are computed and
    passed to evaluate_shard together with the matching label slices.

    Args:
        inputs: 2-D array of examples, one per row.
        labels: labels, indexed like the rows of `inputs`.
        shard_size: number of examples per shard; the last shard is cut off at
            the end of `inputs`.
        shard_indizes: iterable of `(shard_x, shard_y)` index pairs.
        results_fn: prefix of the output file names; the shard indices are
            appended as `_XX-YY`.
        d_start: first distance threshold, forwarded to evaluate_shard.
        d_step: threshold increment, forwarded to evaluate_shard.
        d_max: last distance threshold, forwarded to evaluate_shard.
    """
    total_examples = inputs.shape[0]

    def _shard_slice(shard_no):
        # row range covered by a shard, clipped to the number of examples
        first = shard_no * shard_size
        return slice(first, min(first + shard_size, total_examples))

    for shard_index in shard_indizes:
        row_shard, col_shard = shard_index
        print("Current shard", shard_index)
        rows = _shard_slice(row_shard)
        cols = _shard_slice(col_shard)

        # pairwise distances between the two shards
        distances = pairwise_distances(inputs[rows, :], inputs[cols, :], metric="euclidean", n_jobs=8)

        # evaluate pairwise distances
        shard_csv_name = results_fn + "_%0.2d-%0.2d" % (row_shard, col_shard)
        evaluate_shard(shard_csv_name, distances, labels[rows], labels[cols], d=d_start, d_step=d_step, d_max=d_max)
            
def run_evaluation(inputs, labels, shard_size, results_fn, d_start=0.0, d_step=0.005, d_max=1.0):
    """Evaluate distance thresholds over all shard pairs and aggregate the counts.

    The rows of ``inputs`` are split into consecutive shards of ``shard_size``
    rows. Every ordered pair of shards is evaluated by ``evaluate_all_shards``
    (one CSV file per pair), the per-pair files are loaded again, summed per
    distance threshold and the aggregated table is written to ``results_fn``.

    Args:
        inputs: array providing ``shape[0]``; forwarded to ``evaluate_all_shards``.
        labels: labels belonging to ``inputs``; forwarded unchanged.
        shard_size: rows per shard; must divide the number of rows evenly.
        results_fn: output file name containing one ``%`` placeholder that is
            filled with ``shard_size``.
        d_start: first distance threshold.
        d_step: increment between thresholds.
        d_max: upper bound of the threshold sweep.

    Returns:
        float32 array with one row per distance threshold and 10 columns:
        distance, TP, FP, TN, FN, num true same, num true diff,
        validation rate, false acceptance rate, accuracy.

    Raises:
        ValueError: if there are no test examples, if ``shard_size`` does not
            evenly divide the number of test examples, or if the shard files
            do not all have the same shape.
    """
    results_fn = results_fn % shard_size

    num_test_examples = inputs.shape[0]
    if num_test_examples % shard_size != 0:  # shards must tile the data exactly
        allowed_sizes = [i for i in range(100, num_test_examples) if num_test_examples % i == 0]
        raise ValueError(
            "shard_size %s does not evenly divide the %s test examples; "
            "allowed shard sizes (>= 100): %s" % (shard_size, num_test_examples, allowed_sizes)
        )
    num_x = num_test_examples // shard_size
    if num_x == 0:
        raise ValueError("cannot run evaluation without test examples")

    # every ordered (x, y) shard combination, x-major order
    shard_indizes = [(shard_x, shard_y) for shard_x in range(num_x) for shard_y in range(num_x)]

    evaluate_all_shards(inputs, labels, shard_size, shard_indizes, results_fn, d_start, d_step, d_max)

    # Load the per-shard tables. The number of threshold rows is taken from the
    # files themselves: evaluate_shard keeps writing rows while d <= d_max with
    # an accumulated float d, so np.arange(d_start, d_max, d_step) is not a
    # reliable prediction of the row count.
    shard_tables = []
    for shard_x, shard_y in shard_indizes:
        out_csv_name = results_fn + "_%0.2d-%0.2d" % (shard_x, shard_y)
        shard_data = h.load_from_csv(out_csv_name)
        shard_data = shard_data[1:]  # cut header row
        shard_tables.append(np.array(shard_data).astype("float32"))
    # np.stack raises ValueError when the shard tables differ in shape
    all_data = np.stack(shard_tables)
    num_distances = all_data.shape[1]

    # NOTE(review): float32 represents integers exactly only up to 2**24;
    # larger counts are rounded.
    totals = all_data.sum(axis=0)  # per-threshold sums over all shard pairs

    final_data = np.empty(shape=(num_distances, 10), dtype="float32")

    final_data[:, 0] = all_data[0, :, 0]  # all distances (are same over all shards)

    final_data[:, 1] = totals[:, 1]  # True Positives
    final_data[:, 2] = totals[:, 2]  # False Positives
    final_data[:, 3] = totals[:, 3]  # True Negatives
    final_data[:, 4] = totals[:, 4]  # False Negatives
    final_data[:, 5] = totals[:, 5]  # Num true same (summed over all shards)
    final_data[:, 6] = totals[:, 6]  # Num true diff (summed over all shards)

    final_data[:, 7] = final_data[:, 1] / final_data[:, 5]  # validation rate
    final_data[:, 8] = final_data[:, 2] / final_data[:, 6]  # false acceptance rate

    # accuracy = (TP + TN) / (TP + FP + TN + FN)
    final_data[:, 9] = (final_data[:, 1] + final_data[:, 3]) / (final_data[:, 1:1 + 4].sum(axis=1))

    h.save_to_csv(data_rows=[[
            "Distance Threshhold",
            "True Positives", 
            "False Positives", 
            "True Negative", 
            "False Negative", 
            "Num true same", 
            "Num true diff", 
            "Validation Rate",
            "False Acceptance Rate",
            "Accuracy"
        ]], outfile_name=results_fn, mode="w", convert_float=False)
    h.save_to_csv(data_rows=final_data, outfile_name=results_fn, mode="a", convert_float=True)

    logger.info("Evaluation done, saved to '%s'", results_fn)
    return final_data

In [None]:
# Run the sharded evaluation with thresholds from the default d_start (0.0)
# up to 2.0 in steps of 0.01.
shards_9460 = run_evaluation(
    inputs=EMBEDDED_TEST_INPUT,
    labels=LABELS_TEST,
    shard_size=SHARD_SIZE,
    results_fn=RESULTS_FILE,
    d_step=0.01,
    d_max=2.0,
)
# Notes kept from the original cell (presumably shard sizes per dataset; confirm):
# bgl2 9460
# spirit2 
