This notebook generates random HAProxy log lines and publishes them to a Kafka topic.

To start a local Kafka cluster, follow the Kafka quickstart: https://kafka.apache.org/quickstart

In [1]:
import json
import math
import sys

import numpy as np

from random import randrange

import confluent_kafka
from confluent_kafka import KafkaError, KafkaException
from confluent_kafka import TopicPartition

The function generate_log() creates a random HAProxy logline based on the logline skeleton, field names, and sample field values.  

In [2]:
# Candidate values for every HAProxy log field, keyed by field name.
# NOTE: 'backend_name' is deliberately absent; generate_log() falls back to
# the previously chosen value (the frontend name) for it.
sample_data = {
    'log_ip': ['10.0.0.3', '14.2.3.4', '15.2.6.9', '10.2.34.6', '10.23.34.1'],
    'syslog_timestamp': [
        'May 28 2019 00:00:09', 'May 28 2019 00:00:10', 'May 28 2019 00:00:11',
        'May 28 2019 00:00:39', 'May 28 2019 00:00:51', 'May 28 2019 00:10:09',
    ],
    'program': ['haproxy'],
    'pid': [113345, 756487, 352453, 352465, 164541],
    'client_ip': ['156.23.224.56', '126.52.74.15', '247.81.56.21', '26.245.255.1', '255.116.145.2'],
    'client_port': [13345, 56487, 52453, 52465, 64541],
    'accept_date': [
        '28/May/2019:00:10:09.492', '28/May/2019:00:09:10.006', '28/May/2019:00:02:10.748',
        '28/May/2019:00:20:10.891', '28/May/2019:00:02:10.461', '28/May/2019:00:02:11.959',
    ],
    'frontend_name': ['px-http', 'https:443', 'tx-http'],
    'server_name': ['srv1', 'srv2', 'srv3', 'srv4', 'srv5'],
    'time_request': [0, 1, 2, 3],
    'time_queue': [0, 1, 2, 3],
    'time_backend_connect': [1, 2, 3],
    'time_backend_response': [2, 3, 4, 5, 6, 7, 8, 9],
    'time_duration': [13, 14, 16, 20, 23, 25],
    'http_status_code': [200, 400, 201, 401, 403],
    'bytes_read': [4, 573, 442, 234, 124, 1567],
    'captured_request': ['-'],
    'captured_response': ['-'],
    'termination_state': ['----', 'PH--', 'CR--', '--NI', '--SG'],
    'actconn': [1, 2, 3, 4],
    'feconn': [2, 3, 5, 7, 8],
    'beconn': [0, 1, 2, 3, 4],
    'srvconn': [0, 1, 3],
    'retries': [0, 1, 2],
    'srv_queue': [0, 1, 2, 3],
    'backend_queue': [0, 2, 3, 4, 5, 7, 8, 9],
}

# Field order; position i fills placeholder {i} of the logline skeleton.
cols = [
    'log_ip',
    'syslog_timestamp',
    'program',
    'pid',
    'client_ip',
    'client_port',
    'accept_date',
    'frontend_name',
    'backend_name',
    'server_name',
    'time_request',
    'time_queue',
    'time_backend_connect',
    'time_backend_response',
    'time_duration',
    'http_status_code',
    'bytes_read',
    'captured_request',
    'captured_response',
    'termination_state',
    'actconn',
    'feconn',
    'beconn',
    'srvconn',
    'retries',
    'srv_queue',
    'backend_queue',
]

def generate_log():
    """Build one random HAProxy logline and return it as a JSON string.

    For each field in ``cols`` (in order) a value is drawn at random from
    ``sample_data``. A field with no entry in ``sample_data`` reuses the
    value chosen for the field immediately before it.

    Returns:
        str: JSON object with a single key, ``"logline"``, holding the
        formatted log line.
    """
    template = (
        "[haproxy@{0}] <134>{1} {2}[{3}]: {4}:{5} [{6}] {7} {8}/{9} "
        "{10}/{11}/{12}/{13}/{14} {15} {16} {17} {18} {19} "
        "{20}/{21}/{22}/{23}/{24} {25}/{26}"
    )
    picked = []
    for field in cols:
        if field not in sample_data:
            # No samples for this field: repeat the previous field's value.
            picked.append(picked[-1])
            continue
        candidates = sample_data[field]
        picked.append(candidates[randrange(len(candidates))])
    return json.dumps({"logline": template.format(*picked)})

Below is an example of a logline generated by generate_log().

In [3]:
# Show one randomly generated, JSON-encoded logline.
generate_log()

'{"logline": "[haproxy@15.2.6.9] <134>May 28 2019 00:00:51 haproxy[352465]: 26.245.255.1:52465 [28/May/2019:00:02:10.461] px-http px-http/srv3 0/1/2/7/20 200 573 - - PH-- 4/8/1/1/0 0/2"}'

Let's now create a Kafka producer.

In [4]:
# Destination topic for the generated loglines.
topic = "random-haproxy"

# Producer settings: local broker, snappy-compressed messages.
producer_conf = {
    'bootstrap.servers': 'localhost:9092',
    'compression.type': 'snappy',
}
producer = confluent_kafka.Producer(producer_conf)

def delivery_callback(err, msg):
    """Report the outcome of one produce request on stderr.

    Intended to be passed as ``callback=`` to ``producer.produce()``.

    Args:
        err: Truthy error object when delivery failed, falsy on success.
        msg: The message concerned; on success its ``topic()``,
            ``partition()`` and ``offset()`` are reported.
    """
    if err:
        sys.stderr.write('%% Message failed delivery: %s\n' % err)
    else:
        # Use %d for the offset: %o would print it in octal (64 -> "100").
        sys.stderr.write('%% Message delivered to %s [%d] @ %d\n' %
                         (msg.topic(), msg.partition(), msg.offset()))

Let's now generate 5,000 random loglines and send them to Kafka.

In [5]:
# Produce 5000 random loglines. produce() only enqueues a message locally, so
# the client must be polled for delivery callbacks to run, and flushed at the
# end so nothing is left in the local queue.
count = 0
try:
    while count < 5000:
        try:
            producer.produce(topic, generate_log(), callback=delivery_callback)
        except BufferError:
            # Local queue is full: serve callbacks to free space, then retry
            # the same iteration without counting it.
            producer.poll(1)
            continue
        count = count + 1
        # Serve delivery callbacks for messages sent so far (non-blocking).
        producer.poll(0)
except KeyboardInterrupt:
    sys.stderr.write('%% Aborted by user\n')
finally:
    # The original flushed only when count % 100000 == 0, which never
    # happens for 5000 messages; always wait for outstanding deliveries.
    producer.flush()