# Splunk App for Data Science and Deep Learning - Notebook Template 

In [1]:
# this definition exposes all python module imports that should be available in all subsequent commands
import json
import numpy as np
import pandas as pd
import os
import io
import requests
import PyPDF2
from bs4 import BeautifulSoup
# ...
# global constants
MODEL_DIRECTORY = "/srv/app/model/data/"

In [100]:
# Download documentation pdf and save as text files
# Example: "https://docs.splunk.com/index.php?title=Documentation:DSDL:User:InstallDSDL:5.1.1&action=pdfbook&version=5.1.1&product=DSDL"
def get_data_from_pdf(src_path, dest_path):
    response = requests.get(src_path)
    f = io.BytesIO(response.content)
    reader = PyPDF2.PdfReader(f)
    pages = reader.pages
    # get only textual data
    text = "".join([page.extract_text().replace('\n', '') for page in pages])
    text_file = open(dest_path, "w")
    n = text_file.write(text)
    text_file.close()
    returns = {"path": dest_path}
    return returns
# Crawl webpage and save as text files
# Example: 'https://community.splunk.com/t5/Security/500-Internal-Server-Error/m-p/477677'
def get_data_from_website(src_url, dest_path):
    response = requests.get(src_url)
    soup = BeautifulSoup(response.text,features="html.parser")
    list = [i.text.replace("\n", "") for i in soup.findAll({"div":{"class":"texts"}})]
    text = "".join(list)
    text_file = open(dest_path, "w")
    n = text_file.write(text)
    text_file.close()
    returns = {"path": dest_path}
    return returns

In [84]:
src_path = "https://docs.splunk.com/images/9/9a/Splunk-9.1.0-Forwarding_ja-JP.pdf"
dest_path = "data/splunk_doc_jp/data_ingestion_guide.txt"
get_data_from_pdf(src_path, dest_path)

{'path': 'data/splunk_doc_jp/data_ingestion_guide.txt'}

In [10]:
src_path = "https://docs.splunk.com/index.php?title=Documentation:DSDL:User:InstallDSDL:5.1.1&action=pdfbook&version=5.1.1&product=DSDL"
dest_path = "data/splunk_doc/dsdl_guide.txt"
get_data_from_pdf(src_path, dest_path)

{'path': 'data/splunk_doc/dsdl_guide.txt'}

In [3]:
src_path = "https://docs.splunk.com/index.php?title=Documentation:ES:User:Overview:7.0.1&action=pdfbook&version=7.3.1&product=ES"
dest_path = "data/splunk_doc/es_guide.txt"
get_data_from_pdf(src_path, dest_path)

Multiple definitions in dictionary at byte 0x1e1216 for key /I100


{'path': 'data/splunk_doc/es_guide.txt'}

In [108]:
# Splunk JP manual scraping
s = "https://docs.splunk.com/Documentation/Splunk/9.1.4/Translated/Japanesemanuals"
response = requests.get(s)
soup = BeautifulSoup(response.text,features="html.parser")
l = []
for a in soup.find_all('a', href=True):
    if a['href'].startswith("/image"):
        l.append(a['href'])
l = l[1:]
for link in l:
    name = link.split("/")[-1].split("-")[-2]
    path = f'data/splunk_doc_jp/{name}.txt'
    s = f'https://docs.splunk.com{link}'
    print(s)
    get_data_from_pdf(s,path)
    print(path)

https://docs.splunk.com/images/b/b9/Splunk-9.1.0-Alert_ja-JP.pdf
data/splunk_doc_jp/Alert_ja.txt
https://docs.splunk.com/images/c/cf/Splunk-9.1.0-Capacity_ja-JP.pdf
data/splunk_doc_jp/Capacity_ja.txt
https://docs.splunk.com/images/4/4e/Splunk-9.1.0-Viz_ja-JP.pdf
data/splunk_doc_jp/Viz_ja.txt
https://docs.splunk.com/images/c/c8/Splunk-9.1.0-Deploy_ja-JP.pdf
data/splunk_doc_jp/Deploy_ja.txt
https://docs.splunk.com/images/b/b6/Splunk-9.1.0-DMC_ja-JP.pdf
data/splunk_doc_jp/DMC_ja.txt
https://docs.splunk.com/images/d/dc/Splunk-9.1.0-DistSearch_ja-JP.pdf
data/splunk_doc_jp/DistSearch_ja.txt
https://docs.splunk.com/images/9/9a/Splunk-9.1.0-Forwarding_ja-JP.pdf
data/splunk_doc_jp/Forwarding_ja.txt
https://docs.splunk.com/images/3/3c/Splunk-9.1.0-Data_ja-JP.pdf
data/splunk_doc_jp/Data_ja.txt
https://docs.splunk.com/images/4/48/Splunk-9.1.0-Installation_ja-JP.pdf
data/splunk_doc_jp/Installation_ja.txt
https://docs.splunk.com/images/1/1d/Splunk-9.1.0-InheritedDeployment_ja-JP.pdf
data/splunk_doc_

In [90]:
# Go Splunk Page Scraping
s = "https://gosplunk.com/splunk-license-consumption-via-_introspection/"
response = requests.get(s)
soup = BeautifulSoup(response.text,features="html.parser")
body = soup.find("div",{"class":"entry-content clearfix"}).get_text().strip().replace('\n', '')
title = soup.find("h1",{"class":"entry-title"}).get_text().strip().replace('\n', '')
print(title)
print(body)

Splunk License Consumption via _introspection
Just sharing a query I found useful lately when licensing dashboards are being silly. Mileage may vary.index="_introspection" component="licensing.stack"| bucket _time span=1d | stats latest("data.consumption") as dataConsumption latest("data.pools{}.quota") as poolQuota by _time| eval pctUsed=(dataConsumption/poolQuota * 100)| timechart span=1d max(pctUsed)Share This:


In [72]:
# Mitre Enterprise technique scraping

src = "https://attack.mitre.org/techniques/enterprise/"
response = requests.get(src)
soup = BeautifulSoup(response.text,features="html.parser")
l = []
for a in soup.find_all('a', href=True):
    if a['href'].startswith("/techniques/T"):
        l.append(a['href'])

for link in l:
    parts = link.split("/")
    if len(parts) == 4:
        ID = parts[-2]
        sub_ID = parts[-1]
    else:
        ID = parts[-1]
        sub_ID = ''
    s = os.path.join('https://attack.mitre.org/techniques', ID, sub_ID)
    

    response = requests.get(s)
    soup = BeautifulSoup(response.text,features="html.parser")
    title = ""
    for h in soup.find("h1"):
        title += h.get_text().strip().replace('\n', '')
        title += ' '
    body = soup.find("div",{"class":"description-body"}).get_text()
    if sub_ID != '':
        technique = ID + "." + sub_ID
    else:
        technique = ID
    print(title)
    print(technique)
    sentence = f'The title of mitre attack technique {technique} is {title}. This technique is described as follow: {body}'
    file_name = ID + sub_ID
    dest_path = f'data/mitre_enterprise/{file_name}.txt'
    text_file = open(dest_path, "w")
    
    n = text_file.write(sentence)
    text_file.close()
    

Abuse Elevation Control Mechanism 
T1548
Abuse Elevation Control Mechanism 
T1548
 Abuse Elevation Control Mechanism: Setuid and Setgid 
T1548.001
 Abuse Elevation Control Mechanism: Setuid and Setgid 
T1548.001
 Abuse Elevation Control Mechanism: Bypass User Account Control 
T1548.002
 Abuse Elevation Control Mechanism: Bypass User Account Control 
T1548.002
 Abuse Elevation Control Mechanism: Sudo and Sudo Caching 
T1548.003
 Abuse Elevation Control Mechanism: Sudo and Sudo Caching 
T1548.003
 Abuse Elevation Control Mechanism: Elevated Execution with Prompt 
T1548.004
 Abuse Elevation Control Mechanism: Elevated Execution with Prompt 
T1548.004
 Abuse Elevation Control Mechanism: Temporary Elevated Cloud Access 
T1548.005
 Abuse Elevation Control Mechanism: Temporary Elevated Cloud Access 
T1548.005
 Abuse Elevation Control Mechanism: TCC Manipulation 
T1548.006
 Abuse Elevation Control Mechanism: TCC Manipulation 
T1548.006
Access Token Manipulation 
T1134
Access Token Manipulation

## End

In [6]:
# this cell is not executed from MLTK and should only be used for staging data into the notebook environment
def stage(name):
    with open("data/"+name+".csv", 'r') as f:
        df = pd.read_csv(f)
    with open("data/"+name+".json", 'r') as f:
        param = json.load(f)
    return df, param

In [10]:
# initialize your model
# available inputs: data and parameters
# returns the model object which will be used as a reference to call fit, apply and summary subsequently
def init(df,param):
    model = {}
    model['hyperparameter'] = 42.0
    return model

In [12]:
# train your model
# returns a fit info json object and may modify the model object
def fit(model,df,param):
    # model.fit()
    info = {"message": "model trained"}
    return info

In [14]:
# apply your model
# returns the calculated results
def apply(model,df,param):
    y_hat = df.index
    result = pd.DataFrame(y_hat, columns=['index'])
    return result

## Stage 5 - save the model

In [16]:
# save model to name in expected convention "<algo_name>_<model_name>"
def save(model,name):
    with open(MODEL_DIRECTORY + name + ".json", 'w') as file:
        json.dump(model, file)
    return model

## Stage 6 - load the model

In [17]:
# load model from name in expected convention "<algo_name>_<model_name>"
def load(name):
    model = {}
    with open(MODEL_DIRECTORY + name + ".json", 'r') as file:
        model = json.load(file)
    return model

## Stage 7 - provide a summary of the model

In [18]:
# return a model summary
def summary(model=None):
    returns = {"version": {"numpy": np.__version__, "pandas": pd.__version__} }
    return returns

After implementing your fit, apply, save and load you can train your model:<br>
| makeresults count=10<br>
| streamstats c as i<br>
| eval s = i%3<br>
| eval feature_{s}=0<br>
| foreach feature_* [eval &lt;&lt;FIELD&gt;&gt;=random()/pow(2,31)]<br>
| fit MLTKContainer algo=barebone s from feature_* into app:barebone_model<br>

Or apply your model:<br>
| makeresults count=10<br>
| streamstats c as i<br>
| eval s = i%3<br>
| eval feature_{s}=0<br>
| foreach feature_* [eval &lt;&lt;FIELD&gt;&gt;=random()/pow(2,31)]<br>
| apply barebone_model as the_meaning_of_life

## Send data back to Splunk HEC
When you configured the Splunk HEC Settings in the DSDL app you can easily send back data to an index with [Splunk's HTTP Event Collector (HEC)](https://docs.splunk.com/Documentation/Splunk/latest/Data/UsetheHTTPEventCollector). Read more about data formats and options in the [documentation](https://docs.splunk.com/Documentation/Splunk/latest/Data/FormateventsforHTTPEventCollector#Event_metadata).

### Use cases
- you want to offload longer running, possibly distributed computations that need to deliver results asynchroneously back into Splunk. 
- you might not want to present results back into the search pipeline after your `| fit` or `| apply` command. 
- you can easily utilize this approach for any logging purposes or other profiling tasks in your ML code so you can actively monitor and analyze your processes.

### Example

In [18]:
from dsdlsupport import SplunkHEC as SplunkHEC
hec = SplunkHEC.SplunkHEC()

In [19]:
# example to send 10 hello world events
response = hec.send_hello_world(10)

In [20]:
print("HEC endpoint %s \nreturned with status code %s \nand response message: %s" % (response.url, response.status_code, response.text))

HEC endpoint http://host.docker.internal:8088/services/collector/event 
returned with status code 200 
and response message: {"text":"Success","code":0}


In [21]:
# example to send a JSON object, e.g. to log some data
from datetime import datetime
response = hec.send({'event': {'message': 'operation done', 'log_level': 'INFO' }, 'time': datetime.now().timestamp()})

In [22]:
print("HEC endpoint %s \nreturned with status code %s \nand response message: %s" % (response.url, response.status_code, response.text))

HEC endpoint http://host.docker.internal:8088/services/collector/event 
returned with status code 200 
and response message: {"text":"Success","code":0}


## End of Stages
All subsequent cells are not tagged and can be used for further freeform code

In [52]:
import llama_index
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document, StorageContext, ServiceContext
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import textwrap

2024-05-16 09:54:49.804952: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-16 09:54:49.844797: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [53]:
model="llama3"
url = "http://ollama:11434"
llm = Ollama(model=model, base_url=url)

In [54]:
transformer_embedder = HuggingFaceEmbedding(model_name='all-MiniLM-L6-v2')



In [55]:
service_context = ServiceContext.from_defaults(
        llm=llm, embed_model=transformer_embedder, chunk_size=1024
    )

  service_context = ServiceContext.from_defaults(


In [57]:
vector_store = MilvusVectorStore(uri="http://milvus-standalone:19530", token="", dim=384, overwrite=False)
storage_context = StorageContext.from_defaults(vector_store=vector_store)


In [58]:
index = VectorStoreIndex.from_vector_store(
       vector_store=vector_store, service_context=service_context
    )

In [59]:
query_engine = index.as_query_engine()

In [60]:
r = query_engine.query("What command can I use to pull DSDL containers?")

In [61]:
print(r)

Based on the provided context information, you can manually pull the Docker image using the CLI and Docker pull commands as shown in the following examples:

```
docker pull phdrieger/mltk-container-golden-image-cpu:5.1.0
docker pull phdrieger/mltk-container-golden-image-gpu:5.1.0
docker pull phdrieger/mltk-container-rapids:5.1.0
docker pull phdrieger/mltk-container-spark:5.1.0
docker pull phdrieger/mltk-container-river:5.1.0
```

These commands can be used to pull the DSDL containers into your local Docker environment.
