This workbook generates sample data for the app and writes to the sample directory. 

In [50]:
# Set up the global variables for the script

import json
import os
import pandas
import math
import random
import datetime
import uuid
import string
from os import listdir
from os.path import isfile, join


# Root directory that every generator in this notebook reads from and writes to.
data_directory="appserver/static/sample_data"

# For event text we use a selection of Splunk T-shirt slogans. The list was obtained by
# searching the web; it is not a definitive list and I suspect many were never printed :-)
# The context manager closes the slogans file as soon as it has been read.
with open(data_directory+"/splunk_slogans.txt","r") as slogans_file :
    log_lines = slogans_file.read().splitlines()

# Default look-back window (days) and event count. The generator functions below take
# both values as explicit arguments, so these are reference defaults only.
date_range_days=2
sample_readings=1000

# Get a random list of datetime objects going backwards in time, sorted oldest first
def get_dates(sample_readings : int, max_days_ago : int) :
    """Return `sample_readings` random datetimes from the recent past, oldest first.

    Every value lies strictly before the moment of the call and no further
    back than `max_days_ago` days.

    Args:
        sample_readings: number of datetimes to generate.
        max_days_ago: size of the look-back window in days.

    Returns:
        A list of naive local `datetime.datetime` objects in ascending order,
        i.e. the order in which they would appear in a log file.

    Raises:
        ValueError: from `random.randrange` when the window is not longer
            than one second (e.g. `max_days_ago` <= 0) and at least one
            reading is requested.
    """
    window_seconds = max_days_ago*24*60*60
    # read the clock once so every sample is measured from the same instant
    now = datetime.datetime.now()

    datetimes = []
    for _ in range(sample_readings) :
        random_seconds = random.randrange(1, window_seconds)
        # sub-second jitter only: at most 999 ms, so it can never outweigh the
        # offset of at least one second and push the result into the future
        jitter = datetime.timedelta(milliseconds=random.randint(0,999))
        datetimes.append(now - datetime.timedelta(seconds=random_seconds) + jitter)

    # oldest first
    datetimes.sort()
    return datetimes

"""
This generates a list of events where the timestamp switches between 3 different formats.

This is a common problem in badly designed Splunk instances. People open a TCP port and then fire all sorts of different data in there. 

Ideally we would create multiple sourcetypes and then assign a TCP port for each sourcetype. However the example shows you how to patch the problem during ingestion.
"""
def conflicting_datetime_formats(sample_readings : int, max_days_ago : int) :
    """Write a log file whose events use one of three timestamp formats at random.

    Each line is "<timestamp> <slogan>" and is written to
    conflicting_datetime_formats/mutliplexed_datetime_formats.log under
    `data_directory`; an existing file is overwritten.

    Args:
        sample_readings: number of events to write.
        max_days_ago: how many days back the event times may reach.
    """
    # our three different date time formats
    datetime_formats = ["%Y-%m-%d %H:%M:%S", "%H:%M:%S %y-%m-%d", "%c"]

    output_path = data_directory+"/conflicting_datetime_formats/mutliplexed_datetime_formats.log"

    # the context manager flushes and closes the file even if a write fails
    with open(output_path,"w") as multiplexed_log :
        for my_datetime in get_dates(sample_readings, max_days_ago) :
            # select a time format at random and use it
            timestamp = my_datetime.strftime(random.choice(datetime_formats))
            # pick a random log line to use
            message = random.choice(log_lines)
            multiplexed_log.write(timestamp+" "+message+"\n")


"""
This script generates events where the date is embedded in the file name, but the timestamp is per event in the contents of the file.

We are going to create a map of dates to times, so that we can iterate through each day, create a file and fill it with events for that day.

To work around any weird rounding errors due to timezones we will generate the day, and the seconds separately
"""
def compound_datetimes(sample_readings : int, max_days_ago : int) :
    """Write one log file per day, with the date in the file name only.

    Files are named "<YYYY-MM-DD>.log" inside compound_datetimes/ under
    `data_directory`. Each line holds a time of day ("%H:%M:%S.%f") followed
    by a random slogan. Regular files already in that directory are deleted
    first so that old runs do not build up.

    Args:
        sample_readings: total number of events across all files.
        max_days_ago: how many days back the event times may reach.
    """
    path = data_directory+"/compound_datetimes/"

    # delete the files left by a previous run, or they will build up
    for stale_name in listdir(path) :
        stale_path = join(path, stale_name)
        if isfile(stale_path) :
            os.remove(stale_path)

    # map each day ("2020-02-12") to the list of event times on that day;
    # the day and the time of day are formatted separately
    date_map = {}
    for my_datetime in get_dates(sample_readings, max_days_ago) :
        day = my_datetime.strftime("%Y-%m-%d")
        date_map.setdefault(day, []).append(my_datetime.strftime("%H:%M:%S.%f"))

    # one file per day, named after the day, e.g. "2020-02-12.log"
    for my_day, times_of_day in date_map.items() :
        # closing each file before opening the next avoids leaking one handle per day
        with open(path+my_day+".log","w") as file_for_day :
            for my_time in times_of_day :
                # write out the timestamp with a random log message
                file_for_day.write(my_time + " " + random.choice(log_lines)+ "\n")


"""
This creates log lines which follow an attribute=value pattern, with values that are unquoted, single quoted or double quoted

Specify the minimum and maximum number of av pairs per log line
"""
def auto_extract_indexed_fields(sample_readings : int, max_days_ago : int,  min_values : int, max_values : int) :
    """Write log lines made of a timestamp followed by attribute=value pairs.

    Each line carries between `min_values` and `max_values` pairs. A variable
    name is never repeated within a line, because that would result in
    multivalue fields. String values are drawn from unquoted, single-quoted
    and double-quoted samples. Output goes to
    auto_extract_indexed_fields/indexed.log under `data_directory`.

    Args:
        sample_readings: number of log lines to write.
        max_days_ago: how many days back the event times may reach.
        min_values: minimum number of pairs per line.
        max_values: maximum number of pairs per line; capped at the number
            of available variable names.
    """
    # a list of variable names for us to pull from, complete with a type field
    variable_names= [('stdev_kbps',float),('average_kbps',float), ('sum_kbps',int), ('label',str), ('name',str), ('group',str), ('value',int)]
    # a list of string values for us to pull from when building events
    labels = ['no_quotes',"'single quotes'",'"double quotes"']

    # We can't emit more pairs than we have names for, and the lower bound
    # must not exceed the (possibly capped) upper bound or randint would fail.
    max_values = max(0, min(max_values, len(variable_names)))
    min_values = max(0, min(min_values, max_values))

    with open(data_directory+"/auto_extract_indexed_fields/indexed.log","w") as indexed_log :
        for my_datetime in get_dates(sample_readings, max_days_ago) :
            pair_count = random.randint(min_values, max_values)
            # random.sample returns distinct entries in random order, so every
            # name (including the last one left) can be used without repeats
            chosen = random.sample(variable_names, pair_count)

            # the timestamp goes at the front of the line
            parts = [my_datetime.strftime("%Y-%m-%d %H:%M:%S")]
            for (variable_name, variable_type) in chosen :
                # select a value depending on the declared type
                if variable_type is str :
                    value = random.choice(labels)
                elif variable_type is float :
                    value = str(random.random())
                else :
                    value = str(random.randint(0,9999))
                parts.append(variable_name + "=" + value)

            indexed_log.write(" ".join(parts)+"\n")

"""
This script generates a data set for importing directly into Splunk. We create sourcetype, source, host and index values, and then use INGEST_EVAL + REGEX to extract them and copy them into the relevant fields.

The format aims to replicate the output of the following splunk search:
"""
def load_into_indexes(sample_readings : int, max_days_ago : int) :
    """Write a one-column CSV of encoded events for direct import into Splunk.

    The file load_into_indexes/encoded_splunk_events.csv under
    `data_directory` gets a "_raw" header row and then one quoted row per
    event of the form

        time%%%index%%%host%%%source%%%sourcetype%%%raw^^^END^^^

    where `time` is the epoch timestamp and `raw` is the event text prefixed
    by the timestamp in the format paired with the chosen sourcetype.

    Args:
        sample_readings: number of events to write.
        max_days_ago: how many days back the event times may reach.
    """
    # for this demo we need some target indexes, sourcetypes, sources and hosts

    # these indexes have been created in indexes.conf
    indexes=['ingest_eval_examples_1', 'ingest_eval_examples_2']
    # sourcetypes should have only one date format, so pair each with its format; these don't need to be defined in props.conf as the timestamp is written out in EPOCH
    sourcetypes=[('load_into_indexes:fruit', "%c"), ('load_into_indexes:beef', "%Y-%m-%d %H:%M:%S"), ('load_into_indexes:fish', "%H:%M:%S %y-%m-%d"), ('load_into_indexes:chicken', "%d %a %Y %H:%M:%S")]
    # a selection of values for source
    sources=['load_into_indexes:sea', 'load_into_indexes:ground', 'load_into_indexes:sky', 'load_into_indexes:tree']
    # a selection of values for host
    hosts=['load_into_indexes:farm_shop', 'load_into_indexes:online', 'load_into_indexes:super_market', 'load_into_indexes:market']
    # we also need something to separate the data
    sep="%%%"

    with open(data_directory+'/load_into_indexes/encoded_splunk_events.csv',"w") as import_events :
        # fake the header row; the closing quote must come before the newline,
        # otherwise it lands at the start of the first data row
        import_events.write('"_raw"\n')

        for my_datetime in get_dates(sample_readings, max_days_ago) :
            # pick a sourcetype and the timestamp format that goes with it
            (sourcetype, datetime_format) = random.choice(sourcetypes)
            # get the epoch numeric value for the datetime
            epoch = str(my_datetime.timestamp())
            # pick a random host, source and index for the log line
            host=random.choice(hosts)
            source=random.choice(sources)
            index=random.choice(indexes)
            # generate the log line prefixed by the timestamp
            raw=my_datetime.strftime(datetime_format)+" "+random.choice(log_lines)
            # NOTE(review): a double quote inside a slogan is not CSV-escaped here
            import_events.write('"' + epoch + sep + index + sep + host + sep + source + sep + sourcetype + sep + raw + '^^^END^^^"\n')

""" This function generates a text file with user names and passwords so we can demonstrate how to mask data
"""
# Comedy passwords used as sample secrets; for more bad passwords I recommend
# the excellent https://bad.pw/
terrible_passwords = [
    "password",
    "p@$$w0rd",
    "qwerty123",
    "aaaaa",
    "admin",
    "0000",
    "letmein",
    "onetimepassword",
    "assword",
    "abc123",
    "123456",
    "password1",
    "iloveyou",
    "trustno1",
    "iambatman",
]
# Sample email addresses used as user names in the generated events.
email_addresses = [
    "admin@ghcq.com",
    "007@mi5.gov.uk",
    "putin@gru.ru",
    "director@cia.us.gov",
    "julian@wikileaks.com",
    "jbourne@ucia.gov",
]

def _write_insecurity_log(log_path : str, sample_readings : int, max_days_ago : int) :
    """Write events containing an email address and a password to `log_path`.

    Each line is a "%H:%M:%S %y-%m-%d" timestamp followed by a random entry
    from `email_addresses` and a double-quoted random entry from
    `terrible_passwords`. An existing file is overwritten.
    """
    # the context manager closes the file even if a write fails
    with open(log_path, "w") as joke_security :
        for my_datetime in get_dates(sample_readings, max_days_ago) :
            joke_security.write(my_datetime.strftime("%H:%M:%S %y-%m-%d") + " my email_address=" + random.choice(email_addresses) + ' and my terrible password="' + random.choice(terrible_passwords) + '"\n')


def mask_and_clone(sample_readings : int, max_days_ago : int) :
    """Generate mask_and_clone/insecurity.log under `data_directory`.

    Args:
        sample_readings: number of events to write.
        max_days_ago: how many days back the event times may reach.
    """
    _write_insecurity_log(data_directory+"/mask_and_clone/insecurity.log", sample_readings, max_days_ago)


def mask_data_and_map(sample_readings : int, max_days_ago : int) :
    """Generate mask_data_and_map/insecurity.log under `data_directory`.

    The event format is the same as for `mask_and_clone`; only the output
    path differs.

    Args:
        sample_readings: number of events to write.
        max_days_ago: how many days back the event times may reach.
    """
    _write_insecurity_log(data_directory+"/mask_data_and_map/insecurity.log", sample_readings, max_days_ago)



"""
This script generates a csv with 'useless' columns that we don't want to add into tsidx because they will bloat the size of the bucket.

We use pandas to build the CSV file, set headers etc
"""
def drop_indexed_fields(sample_readings : int, max_days_ago : int)  :
    """Write a CSV with 'useless' columns that we would not want indexed.

    The file drop_indexed_fields/useless_columns.csv under `data_directory`
    gets a header row plus one row per event with the columns time,
    primary_key, primary_value, repeated_field, random_nonsense and
    long_payload.

    Args:
        sample_readings: number of rows to write.
        max_days_ago: how many days back the event times may reach.
    """
    column_names = ['time','primary_key', 'primary_value', 'repeated_field', 'random_nonsense', 'long_payload']

    # Collect the rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and appending row by row copies the frame each time.
    rows = [
        {
            'time': my_date,
            'primary_key': hex(random.randint(0,pow(2,16))),
            'primary_value': random.randint(0,999999),
            'repeated_field': "same silly value",
            'random_nonsense' : uuid.uuid4(),
            'long_payload' : random.choice(log_lines),
        }
        for my_date in get_dates(sample_readings, max_days_ago)
    ]
    # passing the column names keeps the header even when there are no rows
    useless_columns = pandas.DataFrame(rows, columns=column_names)

    # write out the CSV file
    useless_columns.to_csv(data_directory+'/drop_indexed_fields/useless_columns.csv', sep=',', encoding='utf-8', index=False)

"""
This generates a simple log file for split forwarding, it has no interesting attributes
"""
def split_forwarding(sample_readings : int, max_days_ago : int) :
    """Write a plain "<timestamp> <slogan>" log file for split forwarding.

    Output goes to split_forwarding/simple_events.log under `data_directory`;
    timestamps use the single format "%Y-%m-%d %H:%M:%S".

    Args:
        sample_readings: number of events to write.
        max_days_ago: how many days back the event times may reach.
    """
    # the context manager flushes and closes the file even if a write fails
    with open(data_directory+"/split_forwarding/simple_events.log","w") as simple_events :
        for my_datetime in get_dates(sample_readings, max_days_ago) :
            # get the date as a string
            timestamp = my_datetime.strftime("%Y-%m-%d %H:%M:%S")
            # pick a random log line to use
            message = random.choice(log_lines)
            simple_events.write(timestamp+" "+message+"\n")

"""
In Splunk 8.1 we introduced thread names and ids to splunkd logs. This function generates events in the splunkd format in both
styles (with and without the thread block) so that we are able to read both cleanly
"""
def enrich_splunkd_component_level_thread(sample_readings : int, max_days_ago : int) :
    """Write splunkd-style events, with and without the "[id thread]" block.

    Output goes to enrich_splunkd_component_level_thread/multi_log_formats.log
    under `data_directory`. Each event randomly uses one of the two layouts
    shown below.

    Args:
        sample_readings: number of events to write.
        max_days_ago: how many days back the event times may reach.
    """
    # target layouts:
    # 09-21-2020 09:51:06.241 +0000 INFO  TailReader [10099 batchreader1] - Starting batchreader1 thread
    # 09-21-2020 09:50:57.229 +0000 INFO  CMSlave [9659 MainThread] - starting heartbeat thread
    # 09-21-2020 09:50:57.229 +0000 INFO  CMSlave - starting heartbeat thread

    components = ["ApplicationLicense", "ApplicationManager", "ApplicationUpdater", "BucketMover", "BucketReplicator", "BundleJob", "BundlesSetup", "CMBucketId", "CMBundleMgr", "CMHeartbeatThread", "CMRepJob", "CMReplicationRegistry", "CMServiceThread", "CMSlave", "CacheManager", "CascadingReplicationManager", "CertStorageProvider", "ClientSessionsManager", "ClusterBundleValidator", "ClusterSlaveConfigReloader", "ClusterSlaveControlHandler", "ClusteringMgr", "DC:DeploymentClient", "DSManager", "DS_DC_Common", "DatabaseDirectoryManager", "DeploymentServer", "ExecProcessor", "HotBucketRoller", "HotDBManager", "IndexProcessor", "IndexWriter", "IndexerIf", "IndexerInit", "IndexerService", "IntrospectionGenerator:disk_objects", "IntrospectionGenerator:resource_usage", "KVStoreBackupRestore", "KeyManagerSearchPeers", "LMConfig", "LMSlaveInfo", "LMStackMgr", "LMTracker", "LicenseMgr", "MPool", "MetricAlertManager", "MetricSchemaProcessor", "MetricsProcessor", "ModularInputs", "MultiFactorAuthManager", "NoahHeartbeat", "PipeFlusher", "PipelineComponent", "ProxyConfig", "PubSubSvr", "RemoteQueueInputProcessor", "Rsa2FA", "S2SFileReceiver", "SHClusterMgr", "ScheduledViewsReaper", "ServerConfig", "ServerRoles", "ShutdownHandler", "SpecFiles", "StorageInterface", "StreamingBucketBuilder", "TailReader", "TailingProcessor", "TcpInputConfig", "TcpInputProc", "TcpOutputProc", "TelemetryHandler", "UDPInputProcessor", "UiHttpListener", "WatchedFile", "WorkloadManager" ]

    log_levels = ['INFO', 'WARN', 'ERROR', 'DEBUG']

    threads=["All_CMExecutorsShutdownThread", "AppLicenseThread", "AuditSearchExecutor", "BundleExecutorWorker-0", "CMExecutorWorker-0", "CMExecutorWorker-1", "CMExecutorWorker-10", "CMExecutorWorker-2", "CMExecutorWorker-3", "CMExecutorWorker-4", "CMExecutorWorker-5", "CMExecutorWorker-6", "CMExecutorWorker-7", "CMExecutorWorker-8", "CMExecutorWorker-9", "CMHealthManager", "CMHeartbeatThread", "CMNotifyThread", "CMSlaveShutdownThread", "CMSynchronousExecutorWorker-0", "CMUploadReplicatedBucketThread", "CallbackRunnerThread", "DispatchReaper", "ExecProcessorSchedulerThread", "FilesystemOpExecutorWorker-0", "FilesystemOpExecutorWorker-1", "FilesystemOpExecutorWorker-2", "FilesystemOpExecutorWorker-3", "FilesystemOpExecutorWorker-4", "FilesystemOpExecutorWorker-5", "FilesystemOpExecutorWorker-6", "FilesystemOpExecutorWorker-7", "FlusherThread", "HTTPDispatch", "HttpDedicatedIoThread-1", "HttpDedicatedIoThread-3", "HttpDedicatedIoThread-4", "IndexInitExecutorWorker-0", "IndexInitExecutorWorker-1", "IndexerService", "IndexerTPoolWorker-0", "IndexerTPoolWorker-1", "KVStoreBackupThread", "Killa", "MainTailingThread", "MainThread", "ReplicationDataReceiverThread", "SchedulerThread", "Shutdown", "SplunkdSpecificInitThread", "TcpChannelThread", "TcpListener", "TcpOutEloop", "WebuiStartup", "batchreader0", "batchreader1", "cachemanagerDownloadExecutorWorker-0", "cachemanagerDownloadExecutorWorker-1", "cachemanagerDownloadExecutorWorker-2", "cachemanagerDownloadExecutorWorker-3", "cachemanagerDownloadExecutorWorker-4", "cachemanagerDownloadExecutorWorker-5", "cachemanagerDownloadExecutorWorker-6", "cachemanagerDownloadExecutorWorker-7", "cachemanagerDownloadExecutorWorker-8", "cachemanagerUploadExecutorWorker-0", "cachemanagerUploadExecutorWorker-1", "cachemanagerUploadExecutorWorker-10", "cachemanagerUploadExecutorWorker-2", "cachemanagerUploadExecutorWorker-3", "cachemanagerUploadExecutorWorker-4", "cachemanagerUploadExecutorWorker-5", "cachemanagerUploadExecutorWorker-6", 
"cachemanagerUploadExecutorWorker-7", "cachemanagerUploadExecutorWorker-8", "cachemanagerUploadExecutorWorker-9", "exec_1", "indexerPipe_0", "indexerPipe_1", "journal-compress", "remotequeueinput_0", "remotequeueinput_1", "tailreader0", "tailreader1", "tcp_0", "tcp_1", "typing_0", "typing_1"]

    with open(data_directory+'/enrich_splunkd_component_level_thread/multi_log_formats.log','w') as enrich_splunkd :
        for my_datetime in get_dates(sample_readings, max_days_ago) :
            # keep milliseconds only: the first 23 characters of the microsecond form
            # NOTE(review): this writes day-month-year, while the sample lines above
            # look like month-day-year (09-21-2020) - confirm which one is wanted
            timestamp = my_datetime.strftime("%d-%m-%Y %H:%M:%S.%f")[0:23]
            log_level = random.choice(log_levels)
            component = random.choice(components)
            thread_info = "["+str(random.randrange(10,9999))+" "+random.choice(threads)+"]"
            # the separating space belongs to the optional thread block, so a line
            # without it reads "Component - message" with a single space
            thread_part = random.choice([""," "+thread_info])
            enrich_splunkd.write(timestamp+" +0000 "+log_level+"  "+component+thread_part+" - "+random.choice(log_lines)+'\n')

# generate some sample data for enriching splunkd_access.log
def enrich_splunkd_access_log(sample_readings : int, max_days_ago : int) :
    """Write access-log events built from the templates in the sample file.

    Lines from enrich_splunkd_access_log/splunkd_access.log.sample are used
    as templates: a random template is chosen per event and its "$date$",
    "$user$" and "$ip$" placeholders are replaced. The result is written to
    enrich_splunkd_access_log/enrich_splunkd_access.log (both paths under
    `data_directory`).

    Args:
        sample_readings: number of events to write.
        max_days_ago: how many days back the event times may reach.

    Raises:
        IndexError: from `random.choice` if the sample file has no lines
            and at least one event is requested.
    """
    # example log line
    # 127.0.0.1 - splunk-system-user [22/Sep/2020:09:51:03.999 +0000] "POST "POST /services/admin/cacheman/bid|_audit~418~35EAC499-A711-4E8C-8E33-5CB78009D53A|/close?output_mode=json&sid=remote_sh-i-066cb06a2a60e5d07.blah.splunkcloud.com_subsearch_1600768261.138727_BDC68508-A6EF-4FC4-BE64-487E7AB8EB1E_1600768262.1&miss_ms=0 HTTP/1.1" 200 1976 - - - 0ms

    # ten random 10.0.x.y addresses plus five copies of localhost, which makes
    # 127.0.0.1 the most frequent client
    ips = [ "10.0."+str(random.randrange(1,254))+"."+str(random.randrange(1,254)) for i in range(0,10) ] + 5*["127.0.0.1"]
    user = ["splunk-system-user", "internal_monitoring", "admin", "bob.smith@thesmiths.com", "tony@stark.com", "hulk@greenmachine.com"]
    utc_offsets = [" +0000"," -0000"," -0800"," +0600"," -1200"," +1100"," -0030"," +0145"]

    # read the templates and release the sample file straight away
    with open(data_directory+"/enrich_splunkd_access_log/splunkd_access.log.sample",'r') as sample_file :
        sample = sample_file.readlines()

    with open(data_directory+"/enrich_splunkd_access_log/enrich_splunkd_access.log",'w') as access_log :
        for my_datetime in get_dates(sample_readings,max_days_ago) :
            log_line = random.choice(sample)
            # [0:24] trims the microseconds down to milliseconds before the offset is appended
            timestamp = my_datetime.strftime("%d/%b/%Y:%H:%M:%S.%f")[0:24]+random.choice(utc_offsets)
            new_line = log_line.replace("$date$", timestamp).replace("$user$",random.choice(user)).replace("$ip$",random.choice(ips))
            # the last template may lack a newline; never glue two events together
            if not new_line.endswith("\n") :
                new_line = new_line + "\n"
            access_log.write(new_line)


def shard_data_with_splitbyindexkeys(sample_readings : int, max_days_ago : int, number_of_hosts : int) :
    """Write encoded splunkd events whose component/source follow a frequency table.

    shard_data_with_splitbyindexkeys/splunkd_frequency_analysis.csv is read
    from `data_directory`; this function uses its "count", "component" and
    "source" columns. For every event one (component, source) row is drawn
    with probability proportional to its count. Events are written to
    shard_data_with_splitbyindexkeys/shard_splunkd.log as quoted rows of the
    form

        time%%%shard_data_with_splitbyindexkeys%%%host%%%source%%%splunkd%%%raw^^^END^^^

    Args:
        sample_readings: number of events to write.
        max_days_ago: how many days back the event times may reach.
        number_of_hosts: how many fake host names to pick from.
    """
    # we will emulate splunk cloud customer names

    # a selection of roles to compose splunk instance names from
    roles = ["shc-i-","idx-i-", "c0m1-i-","sh-i-", "idm-i-"]
    # create our list of domain names
    domains = ['.splunkcloud.com','.stg.splunkcloud.com']
    # create a list of dummy customer names for the "stacks"
    companies_names = ['Brawndo','Parallax','Megadodo','Adventureland','PierceandPierce','acme','umbrella','Initech','Rekall','SPECTRE','tyrell','The Daily Planet','Stark Industries','Cyberdyne Systems','Wayne Enterprises']

    # create host names and roles, using the dummy names provided
    hosts=[]
    for _ in range(0,number_of_hosts) :
        # fake the amazon instance id
        instance = ''.join(random.choice('0123456789abcdef') for n in range(17))
        # pick a random role
        role= random.choice(roles)
        # pick a random domain
        domain=random.choice(domains)
        # convert the company name to a stack name
        company=random.choice(companies_names).lower().replace(' ','')
        # add our fake host to the list of hosts
        hosts.append(role+instance+'.'+company+domain)

    # load our frequency data
    df = pandas.read_csv(data_directory+"/shard_data_with_splitbyindexkeys/splunkd_frequency_analysis.csv")
    candidates = list(zip(df["component"].tolist(), df["source"].tolist()))
    weights = df["count"].tolist()

    event_dates = get_dates(sample_readings, max_days_ago)

    # One weighted draw per event. This replaces a hand-rolled running-sum scan
    # that drew a fresh threshold for every row and so skewed the distribution.
    if candidates :
        chosen = random.choices(candidates, weights=weights, k=len(event_dates))
    else :
        # an empty frequency table leaves component and source blank
        chosen = [("", "")] * len(event_dates)

    # we also need something to separate the data
    sep="%%%"

    with open(data_directory+"/shard_data_with_splitbyindexkeys/shard_splunkd.log","w") as shard_log :
        for my_datetime, (my_component, my_source) in zip(event_dates, chosen) :
            # get the epoch numeric value for the datetime
            epoch = str(my_datetime.timestamp())[0:20]
            # pick a random host for the log line
            host=random.choice(hosts)
            # generate the log line prefixed by the timestamp
            raw=my_datetime.strftime("%d-%m-%Y %H:%M:%S.%f")+" +0000 INFO  " + my_component + " - " + random.choice(log_lines)
            # write out the line to be written into the import file
            shard_log.write('"' + epoch + sep + "shard_data_with_splitbyindexkeys" + sep + host + sep + my_source + sep + "splunkd" + sep + raw + '^^^END^^^"\n')

def name_clash_zoom_data(sample_readings : int, max_days_ago : int) :
    """Write one JSON meeting record per line to name_clash/naughty_zoom_host.log.

    Every record has a "host" key holding an email address, next to the
    usual meeting attributes (topic, times, duration, participant count and
    a set of boolean feature flags). The file lives under `data_directory`.

    Args:
        sample_readings: number of records to write.
        max_days_ago: how many days back the meeting start times may reach.
    """
    user_type = ["Basic", "Basic|Webinar500", "Licensed", "Licensed|Large500", "Licensed|Webinar1000", "Licensed|Webinar10000|Large500", "Licensed|Webinar1000|Large500", "Licensed|Webinar3000", "Licensed|Webinar3000|Large500", "Licensed|Webinar500", "Licensed|Webinar5000", "Licensed|Webinar5000|Large500", "Licensed|Webinar500|Large500", "On-Prem", "On-Prem|Webinar500", "Unknown", "Unknown|Webinar500|Large500"]

    # "%T" is not a portable strftime directive; spell the time out instead
    iso_seconds = "%Y-%m-%dT%H:%M:%S"

    with open(data_directory+"/name_clash/naughty_zoom_host.log", "w") as name_clash :
        for my_datetime in get_dates(sample_readings, max_days_ago) :

            # 1 or 2 hours, shifted by -30 to +29 minutes
            meeting_duration=datetime.timedelta(hours=random.choice(range(1,3)), minutes=random.choice(range(-30,30)))

            my_json = {
                "host" : random.choice(email_addresses),
                "uuid" : ''.join(random.choice(string.ascii_letters+"+=/") for n in range(22))+'==',
                "id": random.randrange(100000000,99999999999),
                "topic" : random.choice(log_lines),
                "email" : random.choice(email_addresses),
                "user_type" : random.choice(user_type),
                "start_time" : my_datetime.strftime(iso_seconds),
                "end_time" : (my_datetime+meeting_duration).strftime(iso_seconds),
                "duration" : str(meeting_duration),
                # NOTE(review): the mode (0.1) lies below the lower bound (2) - confirm intended
                "participants": round(random.triangular(2,50,0.1)),
                "has_pstn" : random.choice([True, False]),
                "has_voip" : random.choice([True, False]),
                "has_3rd_party_audio" : random.choice([True, False]),
                "has_video" : random.choice([True, False]),
                "has_screen_share" : random.choice([True, False]),
                "has_recording" : random.choice([True, False]),
                "has_sip" : random.choice([True, False])
            }

            name_clash.write(json.dumps(my_json)+"\n")

def json_logs(sample_readings : int, max_days_ago : int) :
    """Write docker-style JSON log lines to json_docker/json_docker.log.

    Each line is a compact JSON object with the keys "log", "stream" and
    "time", where "time" carries nine fractional digits and a trailing "Z".
    The file lives under `data_directory`.

    Args:
        sample_readings: number of events to write.
        max_days_ago: how many days back the event times may reach.
    """
    # target layout:
    # {"log":"I1026 03:58:37.356178 1 binarylog.go:274] rpc: flushed binary log to \"\"\n","stream":"stderr","time":"2020-10-26T03:58:37.356327831Z"}
    # {"log":"Log line is here\n","stream":"stdout","time":"2019-01-01T11:11:11.111111111Z"}

    with open(data_directory+"/json_docker/json_docker.log", "w") as docker_log :
        for my_datetime in get_dates(sample_readings, max_days_ago) :
            log = random.choice(log_lines)
            # nanoseconds are zero-padded to nine digits and separated from the
            # seconds by a dot, as in the samples above
            nanoseconds = str(random.randrange(0,1000000000)).zfill(9)
            timestamp = my_datetime.strftime("%Y-%m-%dT%H:%M:%S")+"."+nanoseconds+"Z"
            stream = random.choice(["stdout","stderr"])
            event = {"log": log, "stream": stream, "time": timestamp}
            # json.dumps escapes quotes and backslashes in the slogan; the
            # separators keep the compact layout and non-ASCII text stays as-is
            docker_log.write(json.dumps(event, separators=(",", ":"), ensure_ascii=False)+"\n")

# Returns strings that look like IP addresses but frequently are not valid ones. This is
# used to demonstrate how rejection of IP addresses works when they are embedded in other
# messages.
def get_ip_address() :
    """Return a dotted string of numbers that may or may not be a valid IPv4 address.

    About one call in a hundred returns one of three fixed addresses, so those
    repeat across the data set. Every other call joins 3 to 6 random segments
    (4 is by far the most likely count) with dots.
    """
    if random.choice(range(100)) != 0 :
        # twenty 4s against a single 3, 5 and 6: most results have four segments
        segment_counts = [4] * 20 + [3, 5, 6]
        how_many = random.choice(segment_counts)

        segments = []
        for _ in range(how_many) :
            # random.triangular takes (low, high, mode), so as written the mode (800)
            # sits above high (100); segments are usually below 256 and sometimes above
            segments.append(str(round(random.triangular(0,100,800))))
        return ".".join(segments)

    # the rare branch: a well-known address that shows up repeatedly
    return random.choice(["1.2.3.4","8.8.8.8","10.0.0.1"])

# Create some messages that randomly contain things that look like ip addresses to demonstrate how regex can be used to lift ip addresses
def find_ip_addresses(sample_readings : int, max_days_ago : int) :
    """Write events that embed IP-like strings among other numbers.

    Each line of find_ip_addresses/may_contain_ips.log (under
    `data_directory`) is a "%d-%m-%Y %H:%M:%S" timestamp, then one to four
    prefixed values from `get_ip_address`, then a run of space-separated
    random numbers, and ends with a trailing space.

    Args:
        sample_readings: number of events to write.
        max_days_ago: how many days back the event times may reach.
    """
    # lets prefix the ip addresses with some junk
    prefix = ['ip=','dest=','http://','source=','from=','to=']

    # the context manager closes the file even if a write fails
    with open(data_directory+"/find_ip_addresses/may_contain_ips.log", "w") as may_contain_ips :
        for my_datetime in get_dates(sample_readings, max_days_ago) :
            message=""
            # 1 to 4 ip addresses per event (randrange excludes the upper bound)
            # NOTE(review): the earlier comment said "up to 5" - confirm which is wanted
            for _ in range(random.randrange(1,5)) :
                ip=get_ip_address()
                message=message+" "+random.choice(prefix)+ip

            timestamp = my_datetime.strftime("%d-%m-%Y %H:%M:%S")
            # trailing noise: between 3 and 20 plain numbers that are not addresses
            noise_count = random.choice([random.randint(4,20) for i in range(20)]+[3,5,6])
            noise = " ".join([ str(round(random.triangular(0,100,800))) for i in range(noise_count)])

            may_contain_ips.write(timestamp+message+" "+noise+' \n')

In [51]:
# Every entry pairs a generator with the positional arguments for one run; runs
# happen in list order. load_into_indexes and mask_and_clone appear twice, and each
# opens its output in "w" mode, so the later run replaces the earlier run's file.
sample_runs = [
    (conflicting_datetime_formats, (100,7)),
    (drop_indexed_fields, (100,7)),
    (compound_datetimes, (500,7)),
    (auto_extract_indexed_fields, (100,7,2,5)),
    (load_into_indexes, (1000,7)),
    (mask_and_clone, (1000,7)),
    (load_into_indexes, (100,5)),
    (enrich_splunkd_component_level_thread, (100,7)),
    (enrich_splunkd_access_log, (1000,7)),
    (shard_data_with_splitbyindexkeys, (100,2,7)),
    (mask_data_and_map, (100,7)),
    (mask_and_clone, (100,7)),
    # name_clash_zoom_data(100,6) is disabled
    (json_logs, (100,5)),
    (find_ip_addresses, (1000,300)),
]
for generate, arguments in sample_runs :
    generate(*arguments)