# Producer (sends Kafka data)

### Get 1% of Data

In [None]:
!aws s3 cp --no-sign-request s3://redshift-downloads/redset/serverless/sample_0.01.parquet .
# Takes about 10 seconds

### Convert the Parquet file to CSV, sorted by arrival timestamp, because the rows are sent one by one and must be in the right order

In [2]:
import pandas as pd
# Load the sample and order it by arrival_timestamp so the CSV written below
# already has the rows in the order they are sent later.
df = pd.read_parquet('sample_0.01.parquet').sort_values(by='arrival_timestamp')

# index=False: write only the data columns, not the DataFrame index.
df.to_csv('sample_0.01.csv', index=False)

### Run this to start streaming (runs indefinitely and starts over from the beginning of the file each time it reaches the end)

In [None]:
from confluent_kafka import Producer
import pandas as pd
import time

# Parameters for the streaming cell.
# NOTE: `speed` and `batch_size` are assigned again inside the
# `if __name__ == "__main__":` block further down; when that block runs, the
# values set there are the ones passed to stream_data_to_kafka.
speed = 0.1  # batches per second (the sender pauses 1 / speed seconds after each batch)
batch_size = 10  # rows per batch (used as the CSV chunk size)
topic = 'chache-me-if-you-can'  # Kafka topic the rows are produced to
csv_file = 'sample_0.01.csv'  # CSV file whose rows are streamed

def delivery_report(err, msg):
    """Print the outcome of a message delivery attempt.

    Passed as the ``callback`` argument to ``producer.produce``.

    Args:
        err: The delivery error, or ``None`` if there was none.
        msg: The message object; its ``topic()`` and ``offset()`` are printed
            when ``err`` is ``None``.
    """
    if err is None:
        print(f'Nachricht wurde gesendet an: {msg.topic()} - Offset: {msg.offset()}')
        return

    print(f'Nachricht konnte nicht gesendet werden: {err}')

def stream_data_to_kafka(csv_file, kafka_config, topic, speed, batch_size):
    """Send the rows of a CSV file to a Kafka topic, over and over again.

    The file is read in chunks of ``batch_size`` rows. Every row is serialized
    with ``Series.to_json()`` and produced as one message. After each chunk the
    producer is flushed and the function sleeps ``1 / speed`` seconds, so
    ``speed`` is the (maximum) number of *batches* per second, not messages.
    When the end of the file is reached the file is read again from the start.
    The loop only ends on ``KeyboardInterrupt`` or an error.

    Args:
        csv_file: Path of the CSV file to stream.
        kafka_config: Configuration dict passed to ``Producer``.
        topic: Name of the Kafka topic to produce to.
        speed: Batches per second; must be greater than 0.
        batch_size: Rows per batch; must be at least 1.

    Raises:
        ValueError: If ``speed`` or ``batch_size`` is out of range, or if a
            full pass over ``csv_file`` yields no data rows.
    """
    # Validate before creating the producer so bad parameters fail immediately
    # instead of after the first batch has already been sent.
    if speed <= 0:
        raise ValueError(f"speed must be > 0 batches per second, got {speed!r}")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1 row per batch, got {batch_size!r}")

    pause_seconds = 1 / speed

    # Create the Kafka producer
    producer = Producer(kafka_config)

    try:
        while True:  # endless loop for continuous sending
            rows_sent = 0

            # Re-open the CSV on every pass; the context manager closes the
            # reader even if the pass is interrupted halfway through.
            with pd.read_csv(csv_file, chunksize=batch_size) as reader:
                for chunk in reader:
                    for _, row in chunk.iterrows():
                        # One JSON string per row
                        message = row.to_json()
                        producer.produce(topic=topic, value=message, callback=delivery_report)
                        rows_sent += 1

                    # Wait until the batch has been delivered
                    producer.flush()

                    # Throttle: one pause per batch
                    time.sleep(pause_seconds)

            if rows_sent == 0:
                # Without this guard a file without data rows would make the
                # outer loop spin forever without sending anything.
                raise ValueError(f"{csv_file!r} contains no data rows; nothing to stream")

            print("Datei vollständig gesendet. Starte von vorne...")

    except KeyboardInterrupt:
        print("Producer wird beendet...")
    finally:
        # Deliver whatever is still queued before returning
        producer.flush()

if __name__ == "__main__":
    # NOTE(review): these assignments replace the `speed` and `batch_size`
    # values set in the parameter section above; confirm which values are
    # actually intended.
    # speed: batches per second (one pause of 1 / speed seconds per batch)
    # batch_size: rows per batch
    speed, batch_size = 100, 10

    # Kafka connection settings
    kafka_config = {'bootstrap.servers': 'localhost:9092'}

    # Start streaming
    stream_data_to_kafka(csv_file, kafka_config, topic, speed, batch_size)
