# Hashing Encoder Difference: Differences In Tokenized Strings

Hashing Encoder Difference: Differences In Tokenized Strings

## Stage 0 - import libraries
In stage 0 we import every library that the code in the following stages depends on.

In [None]:
# this definition exposes all python module imports that should be available in all subsequent commands
import json
import numpy as np
import pandas as pd
import os
import re
#from hashlib import md5
import hashlib
from base64 import encode
# ...
# global constants
MODEL_DIRECTORY = "/srv/app/model/data/"

In [None]:
# THIS CELL IS NOT EXPORTED - free notebook cell for testing or development purposes
print("numpy version: " + np.__version__)
print("pandas version: " + pd.__version__)

## Stage 1 - get a data sample from Splunk


In [None]:
# this cell is not executed from MLTK and should only be used for staging data into the notebook environment
def stage(name):
    """Load a staged sample from the local ``data`` folder.

    Reads ``data/<name>.csv`` into a pandas DataFrame and parses
    ``data/<name>.json`` with the json module.

    Returns:
        A ``(df, param)`` tuple: the DataFrame followed by the parsed JSON.
    """
    # both files share the same path stem and differ only by extension
    path_stem = "data/" + name

    with open(path_stem + ".csv", 'r') as csv_file:
        frame = pd.read_csv(csv_file)

    with open(path_stem + ".json", 'r') as json_file:
        settings = json.load(json_file)

    return frame, settings

In [None]:
# THIS CELL IS NOT EXPORTED - free notebook cell for testing or development purposes
df, param = stage("hashing_encoder_difference")

In [None]:
df

In [None]:
param

## Stage 2 - create and initialize a model

In [None]:
def init(df, param):
    """Create the initial model.

    Neither ``df`` nor ``param`` is read; the model always starts out as a
    new, empty dictionary.
    """
    return {}

In [None]:
# THIS CELL IS NOT EXPORTED - free notebook cell for testing or development purposes
model = init(df,param)
print(model)

## Stage 3 - fit the model

In [None]:
def fit(model, df, param):
    """Pass-through fit step.

    ``model`` and ``param`` are not read; the ``df`` argument is handed
    back to the caller as-is (the same object, not a copy).
    """
    return df

In [None]:
# THIS CELL IS NOT EXPORTED - free notebook cell for testing or development purposes
#print(fit(model,df,param))
returns = fit(model,df,param)
print(returns)

## Stage 4 - apply the model

In [None]:
def _split_tokens(text, pattern):
    """Split ``text`` on ``pattern``; return the stripped, non-empty tokens in order."""
    stripped = (piece.strip() for piece in pattern.split(text))
    return [token for token in stripped if token]


def apply(model, df, param):
    """Compare the tokens of every event against the tokens of a reference string.

    The first entry of ``param['feature_variables']`` names the column to
    tokenise. Each value of that column (converted to ``str``) and the
    reference string ``param['options']['params']['comparison_string']`` are
    split with the same regular expression; the resulting pieces are stripped
    of surrounding whitespace and empty pieces are dropped.

    Args:
        model: Not used by this function.
        df: DataFrame that contains the column to tokenise.
        param: Nested dictionary with the keys described above. The optional
            ``param['options']['params']['tokenizing_regex']`` overrides the
            default delimiter pattern ``[^\\w\\s]`` (any single character that
            is neither a word character nor whitespace).

    Returns:
        A DataFrame with one row per row of ``df`` (in the same order, on a
        fresh 0..n-1 index) and two columns of token lists:

        * ``reference_tokens_missing`` - reference tokens that do not occur
          in the event, in reference order.
        * ``event_tokens_missing`` - event tokens that do not occur in the
          reference, in event order (repeated tokens are kept).

    Raises:
        KeyError: If a required entry of ``param`` is missing or ``df`` has
            no column with the requested name.
        re.error: If the tokenising regex does not compile.
    """
    field_name = param['feature_variables'][0]

    params = param['options']['params']
    comparison_string = params['comparison_string']

    # only the regex is optional; a plain lookup with a default avoids the
    # catch-all handler that would also hide unrelated errors
    regex = params.get('tokenizing_regex', r'[^\w\s]')

    # define the regex pattern used for tokenisation
    pattern = re.compile(regex)

    # tokenise the reference directly instead of appending it to the frame,
    # which would copy every column just to process one extra string
    reference_tokens = _split_tokens(str(comparison_string), pattern)
    reference_lookup = set(reference_tokens)

    reference_missing = []
    event_missing = []
    for text in df[field_name].astype(str):
        event_tokens = _split_tokens(text, pattern)
        event_lookup = set(event_tokens)
        # iterate the ordered lists so order and duplicates are preserved;
        # the sets are only used for fast membership tests
        reference_missing.append(
            [token for token in reference_tokens if token not in event_lookup]
        )
        event_missing.append(
            [token for token in event_tokens if token not in reference_lookup]
        )

    # dtype=object keeps the columns as lists even when there are no rows
    return pd.DataFrame(
        {
            'reference_tokens_missing': reference_missing,
            'event_tokens_missing': event_missing,
        },
        dtype=object,
    )
    

In [None]:
# THIS CELL IS NOT EXPORTED - free notebook cell for testing or development purposes
result = apply(model,df,param)
print(result)

## Stage 7 - provide a summary of the model

In [None]:
# return a model summary
def summary(model=None):
    """Report the library versions in use.

    ``model`` is accepted but not read. The returned dictionary maps
    ``"version"`` to a dictionary holding the numpy and pandas version
    strings.
    """
    versions = {}
    versions["numpy"] = np.__version__
    versions["pandas"] = pd.__version__
    return {"version": versions}