In [None]:
!pip install dotenv

Collecting dotenv
  Using cached dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, dotenv
Successfully installed dotenv-0.9.9 python-dotenv-1.0.1


In [13]:
import time
import json
from kafka3 import KafkaProducer, KafkaConsumer
from datetime import datetime as dt
from pymongo import MongoClient
from pyspark.sql import Row
from dotenv import load_dotenv
import pandas as pd
import os

class kafkaProducer:
    """
    Replay events stored in a CSV file into a Kafka topic, one batch at a time.

    The CSV is read once at construction time. It must contain a ``batch_id``
    column (used to group and order the batches), a ``timestamp`` column
    (parsed as datetime and sent in ISO-8601 form) and a ``car_plate`` column
    (used as the Kafka message key). An ``event_id`` column, when present, is
    only used to identify records in warning messages.
    """

    def __init__(self, csv_path: str, kafka_server: str, producer_id: str, topic: str, batch_interval: float = 5):
        """
        Initialize Kafka Producer with all necessary parameters.

        Args:
            csv_path (str): Path to the CSV file
            kafka_server (str): Kafka bootstrap server address (e.g. "host:port")
            producer_id (str): Unique identifier for this producer; attached to every event
            topic (str): Kafka topic to produce to
            batch_interval (float, optional): Time between batches in seconds. Defaults to 5

        Raises:
            Exception: Whatever ``pd.read_csv`` or ``KafkaProducer`` raised, re-raised
                unchanged after an ``[ERROR]`` line has been printed.
        """
        # Values are taken exactly as passed in; no environment lookup is performed here.
        self.kafka_server = kafka_server
        self.producer_id = producer_id
        self.topic = topic
        self.batch_interval = batch_interval

        # Load data
        try:
            self.df = pd.read_csv(
                csv_path,
                sep=',',
                parse_dates=['timestamp'],
                dtype={'batch_id': int}
            )
            print(f"[INFO] Successfully loaded data from {csv_path}")
        except Exception as e:
            print(f"[ERROR] Failed to load CSV file {csv_path}: {e}")
            raise

        # Initialize producer
        try:
            self.producer = KafkaProducer(
                bootstrap_servers=[self.kafka_server],
                api_version=(0, 10),
                # Values are JSON documents, keys are plain strings; both UTF-8 encoded.
                value_serializer=lambda v: json.dumps(v).encode('utf-8'),
                key_serializer=lambda k: k.encode('utf-8')
            )
            print(f"[INFO] Kafka producer created for {self.kafka_server}")

        except Exception as e:
            print(f"[ERROR] Failed to connect to Kafka at {self.kafka_server}: {e}")
            raise

    def produce_batches(self) -> None:
        """
        Iterate through each batch_id in ascending order, wrap each record
        as a dict + producer tag, and send it to Kafka. After every batch the
        producer is flushed and the method sleeps for ``batch_interval`` seconds.

        The producer is always closed when this method exits, including when an
        exception (or a notebook interrupt) cuts the run short, so the instance
        cannot be reused for a second run.
        """
        try:
            for batch_id in sorted(self.df['batch_id'].unique()):
                batch_df = self.df[self.df['batch_id'] == batch_id]
                print(f"[INFO] Publishing batch #{batch_id} ({len(batch_df)} records)...")

                for _, row in batch_df.iterrows():
                    event = row.to_dict()
                    # Ensure timestamp is serializable
                    event['timestamp'] = event['timestamp'].isoformat()
                    # Tag with producer identity
                    event['producer_id'] = self.producer_id
                    # Record when this event is handed to the producer
                    event['sent_at'] = dt.now().isoformat()
                    # Looked up defensively so that reporting a failure can never raise itself.
                    event_id = event.get('event_id')

                    try:
                        future = self.producer.send(
                            self.topic,
                            key=event['car_plate'],
                            value=event,
                            timestamp_ms=int(time.time()*1000)
                        )
                        # send() only queues the record and returns a future; failures that
                        # happen later during delivery are reported through this errback
                        # instead of being silently dropped.
                        future.add_errback(
                            lambda exc, event_id=event_id: print(
                                f"[WARN] Delivery failed for event {event_id}: {exc}"
                            )
                        )
                    except Exception as e:
                        # Best effort: a record that cannot be queued (serialization error,
                        # missing key column, ...) is reported and skipped.
                        print(f"[WARN] Failed to send event {event_id}: {e}")

                # Force all buffered messages out
                self.producer.flush()
                print(f"[INFO] Batch #{batch_id} sent. Sleeping {self.batch_interval}s...")
                print(f"[DATA] Batch #{batch_id}:\n{batch_df}\n")
                time.sleep(self.batch_interval)

            # Loop has finished
            self.producer.flush()
        finally:
            # Release the client's resources even if the run was aborted.
            self.producer.close()
        print("[INFO] Producer finished and closed")

In [None]:
# Settings for the camera "A" event stream.
csv_path = "data/camera_event_a.csv"
kafka_server = "172.17.0.1:9092"
producer_id = "producer_a"
topic = "camera_event_a"
batch_interval = 5  # seconds to wait between batches

# Build the producer from the settings above, then replay every batch.
producer = kafkaProducer(
    csv_path=csv_path,
    kafka_server=kafka_server,
    producer_id=producer_id,
    topic=topic,
    batch_interval=batch_interval,
)
producer.produce_batches()

[INFO] Successfully loaded data from data/camera_event_a.csv
print 172.17.0.1:9092
172.17.0.1:9092
[INFO] Publishing batch #1 (20 records)...
[INFO] Batch #1 sent. Sleeping 5s...
[DATA] Batch #1:
                                event_id  batch_id car_plate  camera_id  \
0   d40c586c-5be6-4743-a1e3-2269d9edaa72         1     KRN 7          1   
1   85c08e3c-a0b5-45d8-a70c-df8f9a6d5829         1     ICE 8          1   
2   f5834b79-771b-4931-8da2-a5ad7f4ccd02         1   QE 1820          1   
3   d0e547bb-c4a7-4750-b7b4-8076e9b47f4f         1   CJW 924          1   
4   f3162606-1b2e-407f-951d-61d14c0a7b09         1   CJP 278          1   
5   c69852f7-cd9a-4892-b225-8f8a36ec017b         1    ZPG 90          1   
6   5113d990-4da6-41d2-b292-2516e7d8dc07         1    KKE 15          1   
7   67ce0672-95b5-43bd-9940-2e16db5f747a         1     WJX 2          1   
8   7c5c112b-7521-4bbc-90f8-3c3a85cc3fe1         1     FO 32          1   
9   213f0627-2be4-409c-972b-669627ca0461         1    