# Send Trades To Kafka

This notebook reads trading events from the `./tradesMarch.csv` file and sends them to Apache Kafka. The data is then processed by Kafka Connect and eventually ends up in a QuestDB table.

We first create the QuestDB table. It would be created automatically if it didn't exist, but creating it explicitly lets us see the schema.

In [1]:
# Silence DeprecationWarning messages so they do not clutter the output of this demo.
import warnings
warnings.simplefilter("ignore", category=DeprecationWarning)

In [2]:
import psycopg as pg
import os

from psycopg.conninfo import make_conninfo

# Fetch the connection settings from environment variables, with demo defaults
host = os.getenv('QDB_CLIENT_HOST', 'questdb')
port = os.getenv('QDB_CLIENT_PORT', '8812')
user = os.getenv('QDB_CLIENT_USER', 'admin')
password = os.getenv('QDB_CLIENT_PASSWORD', 'quest')

# Build the connection string with make_conninfo rather than raw string
# interpolation: values containing spaces, quotes or backslashes (a password,
# typically) are escaped properly instead of corrupting the conninfo string.
conn_str = make_conninfo(user=user, password=password, host=host, port=port, dbname='qdb')

# DDL for the target table: a WAL table partitioned by day, with `timestamp`
# as the designated timestamp and deduplication on (timestamp, symbol, side).
# IF NOT EXISTS makes this cell safe to re-run.
create_trades_sql = """
        CREATE TABLE IF NOT EXISTS  'trades' (
  symbol SYMBOL capacity 256 CACHE,
  side SYMBOL capacity 256 CACHE,
  price DOUBLE,
  amount DOUBLE,
  timestamp TIMESTAMP
) timestamp (timestamp) PARTITION BY DAY WAL DEDUP UPSERT KEYS(timestamp, symbol, side);
"""

# autocommit=True so the DDL takes effect without an explicit commit
with pg.connect(conn_str, autocommit=True) as connection:
    with connection.cursor() as cur:
        cur.execute(create_trades_sql)
                    


## Sending the data to Kafka

Now we read the `./tradesMarch.csv` file and convert every row to Avro binary format before sending it to a Kafka topic named `trades`.

By default, the script replaces each event's original timestamp with the current time and
waits 50 ms between events before sending them to Kafka, to simulate a real-time stream and provide
a nicer visualization. You can change this behavior by editing the `TIMESTAMP_FROM_FILE` and `DELAY_MS` constants in the script.

This script will keep sending data until you click stop, exit the notebook, or `TOTAL_EVENTS` events have been sent; the file is replayed from the beginning each time its end is reached.

While the script is running, you can check the data in the table directly at QuestDB's web console at http://localhost:9000 or a live Grafana Dashboard powered by QuestDB at http://localhost:3000/d/trades-crypto-currency/trades-crypto-currency?orgId=1&refresh=250ms (user admin and password quest).


In [3]:
from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer
import csv
from datetime import datetime
import time
from multiprocessing import Pool


# Ingestion settings: edit these constants to change how the demo behaves.
TOTAL_EVENTS = 2000000  # Total number of events to produce, split across all senders
NUM_SENDERS = 5  # Number of sender processes to execute in parallel
KAFKA_BROKER = 'broker:29092,broker-2:29092'  # Passed to the producer as 'bootstrap.servers'
KAFKA_TOPIC = 'trades'  # Topic every message is produced to
CSV_FILE = './tradesMarch.csv'  # Source file containing the trade rows
SCHEMA_REGISTRY = 'http://schema_registry:8081'  # Passed to the producer as 'schema.registry.url'
TIMESTAMP_FROM_FILE = False  # True: use each row's 'timestamp' column; False: stamp events with the current time
VERBOSE = False  # True: print a delivery report line for every message
DELAY_MS = 50  # Delay between events in milliseconds (applied by each sender); 0 disables the delay


def get_delivery_report_func(verbose):
    """Build the delivery callback handed to the Kafka producer.

    Args:
        verbose: When truthy, the returned callback prints one line per
            delivery outcome; when falsy, the callback does nothing.

    Returns:
        A callable ``(err, msg)`` suitable for the producer's ``on_delivery``.
    """
    def delivery_report(err, msg):
        # Quiet mode: ignore both successes and failures.
        if not verbose:
            return
        if err is None:
            print(f'Message delivered to {msg.topic()} [{msg.partition()}]')
        else:
            print(f'Message delivery failed: {err}')
    return delivery_report

def send(sender_id, total_events):
    """Publish ``total_events`` Avro-encoded trades from ``CSV_FILE`` to Kafka.

    The CSV rows are loaded into memory once and replayed in order, wrapping
    around to the first row whenever the end is reached, until the requested
    number of messages has been produced to ``KAFKA_TOPIC``. Each sender
    creates its own producer and flushes it before returning.

    Args:
        sender_id: Label used only in the final progress message.
        total_events: Number of messages to produce; values <= 0 send nothing.

    Raises:
        ValueError: If events were requested but ``CSV_FILE`` has no data rows
            (there would be nothing to replay).
    """
    # Local imports keep this function self-contained without touching the
    # notebook's import cell.
    from datetime import timedelta, timezone
    from itertools import cycle, islice

    # newline='' is the mode the csv module documents for reading files.
    # The file is closed as soon as the rows are in memory.
    with open(CSV_FILE, mode='r', newline='') as file:
        csv_rows = list(csv.DictReader(file))

    # Fail loudly instead of spinning forever (or silently sending nothing)
    # when there are no rows to replay.
    if total_events > 0 and not csv_rows:
        raise ValueError(f'{CSV_FILE} contains no data rows; nothing to send.')

    # NOTE(review): per the Avro specification a logical type is declared on
    # the type schema ({"type": {"type": "long", "logicalType": ...}}); placed
    # on the field as below it is presumably ignored as plain metadata. Left
    # unchanged because altering the schema changes what gets registered in
    # the Schema Registry -- confirm against the connector setup before fixing.
    value_schema = avro.loads("""
    {
        "type": "record",
        "name": "Trade",
        "fields": [
            {"name": "symbol", "type": "string"},
            {"name": "side", "type": "string"},
            {"name": "price", "type": "double"},
            {"name": "amount", "type": "double"},
            {"name": "timestamp", "type": "long", "logicalType": "timestamp-micros"}
        ]
    }
    """)

    avro_producer = AvroProducer({
        'bootstrap.servers': KAFKA_BROKER,
        'schema.registry.url': SCHEMA_REGISTRY,
        'linger.ms': '5',  # Adjust based on your needs
        'batch.size': '8388608',  # Adjust based on your needs
        #'compression.type': 'snappy',  # Options: 'gzip', 'snappy', 'lz4', 'zstd',
        'queue.buffering.max.messages': '1000000',  # Increase as needed
        'queue.buffering.max.kbytes': '1048576',    # 1 GB
        'acks': '0',  # '0' for no acks (fastest), '1' for leader ack, 'all' for all replicas

    }, default_value_schema=value_schema)

    delivery_report_func = get_delivery_report_func(VERBOSE)

    # Hoisted loop invariants.
    epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
    one_microsecond = timedelta(microseconds=1)
    delay_seconds = DELAY_MS / 1000.0  # Convert milliseconds to seconds

    events_sent = 0  # Counter to track how many events have been sent

    # cycle() replays the rows endlessly; islice() stops after total_events.
    for row in islice(cycle(csv_rows), max(total_events, 0)):
        # Delay first, so a "current time" timestamp reflects the moment the
        # event is actually handed to the producer.
        if delay_seconds > 0:
            time.sleep(delay_seconds)

        if TIMESTAMP_FROM_FILE:
            # The trailing 'Z' means UTC. strptime returns a naive datetime,
            # which .timestamp() would interpret as *local* time, so attach
            # UTC explicitly and use integer arithmetic to keep exact micros.
            timestamp_dt = datetime.strptime(
                row['timestamp'], "%Y-%m-%dT%H:%M:%S.%fZ"
            ).replace(tzinfo=timezone.utc)
            timestamp_micros = (timestamp_dt - epoch) // one_microsecond
        else:
            timestamp_micros = int(time.time() * 1e6)

        value = {
            "symbol": row['symbol'],
            "side": row['side'],
            "price": float(row['price']),
            "amount": float(row['amount']),
            "timestamp": timestamp_micros
        }

        # Send the message to Kafka
        avro_producer.produce(topic=KAFKA_TOPIC, value=value, on_delivery=delivery_report_func)
        events_sent += 1  # Increment event counter

        # Serve the delivery callback queue every 2000 messages. Counting
        # across the whole run (not per pass over the file) ensures this also
        # happens when the CSV has fewer than 2000 rows.
        if events_sent % 2000 == 0:
            avro_producer.poll(0)

    avro_producer.flush()
    print(f"Sender {sender_id} - Finished sending {events_sent} events.")

def parallel_send(total_events, num_senders: int):
    """Split ``total_events`` across ``num_senders`` worker processes.

    Every sender receives an equal share; when the total does not divide
    evenly, the lowest-numbered senders each take one extra event. The call
    blocks until all senders have finished.

    Args:
        total_events: Overall number of events to produce.
        num_senders: Number of processes in the pool (one ``send`` call each).
    """
    base_share, leftover = divmod(total_events, num_senders)

    # The first `leftover` senders absorb the remainder, one event apiece.
    quotas = [base_share + 1] * leftover + [base_share] * (num_senders - leftover)
    jobs = list(enumerate(quotas))  # (sender_id, events_for_that_sender)

    with Pool(processes=num_senders) as pool:
        pool.starmap(send, jobs)

if __name__ == '__main__':
    # Only start the ingestion when this code is the entry point.
    print('Ingestion started.\n')
    parallel_send(total_events=TOTAL_EVENTS, num_senders=NUM_SENDERS)
    




Ingestion started.



Process ForkPoolWorker-3:
Process ForkPoolWorker-4:
Process ForkPoolWorker-5:


KeyboardInterrupt: 

## Verify we have ingested some data

The data you send to Kafka will be processed by Kafka Connect and passed to QuestDB, where it will be stored in a table named `trades`. Let's check that we can actually see some data.

In [1]:
import requests
import os

HTTP_ENDPOINT = os.getenv('QUESTDB_HTTP_ENDPOINT', 'questdb:9000')
REST_TOKEN = os.getenv('QUESTDB_REST_TOKEN')

# The presence of a REST token only selects the URL scheme; the configured
# endpoint is honoured in both cases (previously the plain-HTTP branch
# ignored QUESTDB_HTTP_ENDPOINT and always used questdb:9000).
# NOTE(review): the token value itself is never sent with the request --
# confirm whether it should be passed as an Authorization header.
scheme = 'https' if REST_TOKEN is not None else 'http'
host = f'{scheme}://admin:quest@{HTTP_ENDPOINT}'

# A negative LIMIT returns the last rows, i.e. the most recently ingested ones.
sql_query = 'SELECT * FROM trades LIMIT -5;'

try:
    # NOTE(review): verify=False disables TLS certificate verification;
    # acceptable only for a local demo setup.
    response = requests.get(
        host + '/exec',
        params={'query': sql_query}, verify=False).json()
except (requests.exceptions.RequestException, ValueError) as e:
    # ValueError covers a non-JSON body on requests versions whose JSON
    # decoding error is not a RequestException subclass.
    print(f'Error: {e}')
else:
    rows = response.get('dataset') if isinstance(response, dict) else None
    if rows is None:
        # Without a 'dataset' key the query did not succeed (for example the
        # table does not exist yet); show what the server reported instead of
        # failing with a KeyError.
        detail = response.get('error', response) if isinstance(response, dict) else response
        print(f'Error: {detail}')
    else:
        for row in rows:
            print(row)

['DOT-USD', 'buy', 8.278547619047, 39.607455338095, '2024-10-28T11:54:19.284340Z']
['DOT-USD', 'buy', 8.278547619047, 39.607455338095, '2024-10-28T11:54:19.285580Z']
['DOT-USD', 'buy', 8.278547619047, 39.607455338095, '2024-10-28T11:56:45.865026Z']
['DOT-USD', 'buy', 8.278547619047, 39.607455338095, '2024-10-28T11:56:46.183816Z']
['DOT-USD', 'buy', 8.278547619047, 39.607455338095, '2024-10-28T11:56:46.360160Z']


