### Downloading Data

In [4]:
import cudf
import cugraph
from cugraph.utilities.utils import is_device_version_less_than
import pandas as pd

from clx.heuristics import ports
import clx.parsers.zeek as zeek
import clx.ip

import pandas as pd
from os import path
import s3fs
from streamz import Stream

In [5]:
# Public S3 prefix holding the sample data, and the local/remote file name.
S3_BASE_PATH = "rapidsai-data/cyber/clx"
CONN_LOG = "conn.log"

# Fetch the Zeek conn log anonymously, but only when no local copy exists yet.
if not path.exists(CONN_LOG):
    remote_conn_log = f"{S3_BASE_PATH}/{CONN_LOG}"
    fs = s3fs.S3FileSystem(anon=True)
    fs.get(remote_conn_log, CONN_LOG)

Note: `conn.log` begins with a block of header lines that are not needed for this example, so we strip them. It also ends with a `close` footer line, which we strip as well.

In [6]:
# Keep everything from line 9 onward (tail -n +9), drop the final line
# (head -n -1; the negative count is GNU head syntax), and write the
# remaining lines to messages.log.
!tail -n +9 conn.log | head -n -1 > messages.log

### Follow the instructions at https://kafka.apache.org/quickstart to start a Kafka broker

**NOTE:** At the topic creation step, make sure to name the new topic `streamz_n_graph`

In [None]:
# Ingesting data into kafka
# Feeds messages.log on stdin to the console producer, which publishes it to
# the `streamz_n_graph` topic on the broker at localhost:9092. The producer's
# own stdout is discarded.

!kafka/bin/kafka-console-producer.sh --broker-list localhost:9092 --topic streamz_n_graph < messages.log >/dev/null

In [None]:
# To see the data from the kafka topic

!kafka/bin/kafka-console-consumer.sh --topic streamz_n_graph --from-beginning --bootstrap-server localhost:9092

### Configuring Kafka Stream using custreamz

In [7]:
# Kafka connection settings: broker address and the topic names used below.
broker = "localhost:9092"
input_topic = "streamz_n_graph"
output_topic = "output"

# Client options for a Kafka producer talking to the broker above.
producer_conf = dict([
    ("bootstrap.servers", broker),
    ("session.timeout.ms", 10000),
])

In [8]:
# Batching knobs handed to the batched Kafka source: the cap on messages per
# batch and the polling interval.
max_batch_size = 100_000
poll_interval = "1s"

In [9]:
import random

# Draw a random suffix so each run of this notebook (very likely) uses a
# consumer group id that has not been used before, which lets the demo be
# re-run against the same Kafka topic.
j = random.randint(0, 10000)
group_id = f"fil-group-{j:d}"

# Kafka consumer configuration: connection and group identity first, then the
# session and offset behaviour.
consumer_conf = {
    "bootstrap.servers": broker,
    "group.id": group_id,
}
consumer_conf.update({
    "session.timeout.ms": "60000",
    "enable.partition.eof": "true",
    "auto.offset.reset": "latest",
})

In [10]:
# Keyword options for the batched Kafka source, collected in one place.
kafka_source_options = dict(
    poll_interval=poll_interval,
    npartitions=1,
    asynchronous=True,
    max_batch_size=max_batch_size,
)

# Stream node that emits batches of messages read from `input_topic`.
source = Stream.from_kafka_batched(input_topic, consumer_conf, **kafka_source_options)

### Now that Kafka is set up correctly, we define the functions that process each batch with clx and cuGraph

In [11]:
import time

def parse_message(line):
    """Pull the connection endpoints out of one tab-separated log record.

    Parameters
    ----------
    line : bytes
        A single raw record whose fields are separated by tab characters.

    Returns
    -------
    tuple of bytes
        ``(src, src_p, dest, dest_p)``, taken from fields 2, 3, 4 and 5
        (zero-based) of the record.

    Raises
    ------
    IndexError
        If the record has fewer than six tab-separated fields.
    """
    fields = line.split(b'\t')
    # Fields 2..5 are the source address/port and destination address/port.
    return tuple(fields[position] for position in (2, 3, 4, 5))
    

In [12]:
# Running edge list of every connection seen so far; stays None until the
# first batch has been processed.
edges_gdf = None


def process_batch(messages):
    """Turn one batch of raw log records into graph edges and accumulate them.

    Each record is parsed with ``parse_message``; the source and destination
    addresses are decoded, converted to integers with ``clx.ip.ip_to_int`` and
    appended to the module-level ``edges_gdf`` edge list.

    Parameters
    ----------
    messages : iterable of bytes
        Raw tab-separated log records from one Kafka batch.

    Returns
    -------
    tuple
        ``(elapsed_seconds, evt_edges_df)`` where ``evt_edges_df`` is the
        ``cudf.DataFrame`` of edges (``src``, ``dst``) built from this batch.
    """
    global edges_gdf
    start_time = time.time()
    src_dest_tuples = list(map(parse_message, messages))

    # Build both columns from the parsed tuples (index 0 = source address,
    # index 2 = destination address); ports are not used for the graph.
    evt_edges_df = cudf.DataFrame({
        'src': [x[0].decode('utf-8') for x in src_dest_tuples],
        'dst': [x[2].decode('utf-8') for x in src_dest_tuples]
    })

    # Convert the textual IP addresses to integers.
    evt_edges_df['src'] = clx.ip.ip_to_int(evt_edges_df['src'])
    evt_edges_df['dst'] = clx.ip.ip_to_int(evt_edges_df['dst'])

    # Compare against None explicitly: once a batch has been stored,
    # `edges_gdf` is a DataFrame and its truth value is not a valid
    # "is it set?" test.
    if edges_gdf is None:
        edges_gdf = evt_edges_df
    else:
        edges_gdf = cudf.concat([edges_gdf, evt_edges_df])

    end_time = time.time()
    time_diff = end_time - start_time
    return (time_diff, evt_edges_df)

In [13]:
def pagerank(message):
    """Run PageRank over all edges accumulated so far and time the run.

    The graph is rebuilt from the module-level ``edges_gdf`` on every call.
    The resulting scores are not returned; only timings are.

    Parameters
    ----------
    message : tuple
        The value produced by the previous pipeline stage; only element 0
        (that stage's elapsed seconds) is read.

    Returns
    -------
    tuple
        ``(previous_stage_seconds, pagerank_seconds)``.
    """
    began = time.time()

    # Build the graph from the accumulated edge list.
    graph = cugraph.Graph()
    graph.from_cudf_edgelist(edges_gdf, source="src", destination="dst", renumber=True)

    ranks = cugraph.pagerank(graph, alpha=0.85, max_iter=500, tol=1.0e-05)
    ranks['idx'] = ranks['vertex']

    elapsed = time.time() - began
    return (message[0], elapsed)

In [14]:
# Pipeline: Kafka batch -> process_batch -> pagerank -> collected into a list.
edge_batches = source.map(process_batch)
timed_runs = edge_batches.map(pagerank)
output = timed_runs.sink_to_list()

In [15]:
# Start the Kafka source; from here on, batches flow through the pipeline
# attached to `source`.
source.start()

### Generating longer synthetic file from `messages.log`

In [17]:
# Read the original log once; the context manager closes the file handle
# instead of leaving it open for the lifetime of the kernel.
with open('messages.log') as source_file:
    file_content = source_file.read()

# Number of back-to-back copies to write.
factor = 46
# 43410 is presumably the number of lines in messages.log -- TODO confirm.
messages_sent = 43410 * factor  # 46 * 43410 ~ 2 million

# Write `factor` concatenated copies of the log to build a longer input file.
with open('messages_duplicate.log', 'w') as f:
    for i in range(factor):
        f.write(file_content)

### Benchmarking

In [18]:
import subprocess

# Timing accumulators, both starting at zero (not referenced by the cells shown here).
cumulative_time = total_time = 0
# Number of times the synthetic log is replayed into Kafka.
trials = 10
# Shell command that pipes the synthetic log into the console producer for the
# `streamz_n_graph` topic and discards the producer's stdout.
bashCommand = (
    "kafka/bin/kafka-console-producer.sh "
    "--broker-list localhost:9092 "
    "--topic streamz_n_graph "
    "< messages_duplicate.log >/dev/null"
)


In [19]:
# Replay the synthetic log into Kafka `trials` times, one run after another.
# The command runs in the notebook's working directory, which is where the
# other cells resolve `kafka/` and write `messages_duplicate.log`, rather than
# in a hard-coded absolute path. Its stdout is already redirected to /dev/null
# by the command itself, so no pipe is needed. `check=True` raises
# CalledProcessError if the producer exits with a non-zero status, instead of
# silently sending nothing.
for i in range(trials):
    subprocess.run(bashCommand, shell=True, check=True)

In [None]:
# Number of messages the benchmark publishes across all trials.
expected_messages = messages_sent * trials
print(f'A total of {expected_messages} messages will be sent')

# `edges_gdf` gains one row per parsed message, so its length is the number of
# messages processed so far. Counting batches would overstate progress, because
# a batch may hold fewer than `max_batch_size` messages.
processed_messages = 0 if edges_gdf is None else len(edges_gdf)

# Each entry of `output` is (batch-processing seconds, PageRank seconds).
# The newest batch may already be counted in `edges_gdf` before its timing is
# appended here, so an in-progress average can be slightly low.
elapsed_seconds = sum(x[0] + x[1] for x in output)

if processed_messages == 0:
    # Nothing consumed yet; avoid dividing by zero.
    print('Still running, no messages have been processed yet')
elif processed_messages >= expected_messages:
    print('Done')
    print('Average seconds per message:', elapsed_seconds / processed_messages)
else:
    print('Still running, current average seconds per message:', elapsed_seconds / processed_messages)