# 1. Producing the data
In this task, we will implement one Apache Kafka producer to simulate real-time data streaming. Spark is not allowed in this part since it’s simulating a streaming data source.

1.1 Your program should send one batch of click_stream data every 5 seconds. One batch consists of a random number (between 500 and 1000) of rows from the clickstream_rt dataset. The CSV shouldn’t be loaded into memory all at once, to conserve memory (i.e., read rows as needed).  
1.2 For each row, add an integer column named ‘ts’, a Unix timestamp in seconds since the epoch (UTC timezone). Spread your batch out evenly over 5 seconds.  
For example, if you send a batch of 600 records at 2023-09-01 00:00:00 (ISO format: YYYY-MM-DD HH:MM:SS) -> (ts = 1693526400) :  
Record 1-120: ts = 1693526400  
Record 121-240: ts = 1693526401  
Record 241-360: ts = 1693526402  
….  
1.3 Send your batch to a Kafka topic with an appropriate name.  

All the data except for the ‘ts’ column should be sent in the original String type, without changing to any other types.  


In [None]:
from time import sleep, time
from json import dumps
from kafka3 import KafkaProducer
import random
import datetime as dt
import csv
import os
import math

# configuration
# IP address of the Kafka broker host; connect_kafka_producer() appends port 9092.
hostip = "118.139.10.179"
# Kafka topic that the main loop publishes every record to.
topic = "big-data-a2-topic"

In [None]:
def publish_message(producer_instance, topic_name, data):
    """Hand one record to the producer, reporting (not raising) any failure.

    Args:
        producer_instance: Object exposing ``send(topic, value)``.
        topic_name: Name of the topic the record is sent to.
        data: The record passed as the message value.

    Any exception raised by ``send`` is caught and printed so that a single
    bad record does not abort the caller's loop.
    """
    try:
        producer_instance.send(topic_name, data)
    except Exception as error:
        # Best-effort publish: describe the problem and carry on.
        print('Exception in publishing message.')
        print(error)

In [None]:
def connect_kafka_producer():
    """Create a KafkaProducer for the broker at ``hostip`` on port 9092.

    Message values are serialized with ``json.dumps`` and encoded as ASCII
    (``dumps`` escapes non-ASCII characters by default, so this cannot fail
    on Unicode text).

    Returns:
        The producer, or ``None`` when constructing it raised an
        ``Exception`` (the error is printed).
    """
    try:
        return KafkaProducer(
            bootstrap_servers=[f'{hostip}:9092'],
            value_serializer=lambda x: dumps(x).encode('ascii'),
            api_version=(0, 10),
        )
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
        # Returning here rather than from a ``finally`` clause keeps
        # KeyboardInterrupt/SystemExit from being silently swallowed.
        return None

In [None]:
def get_position(file_path, max_rows=1000, average_row_length=None):
    """Pick a random byte offset that leaves room for about ``max_rows`` rows.

    Args:
        file_path: Path of the file to pick an offset in.
        max_rows: Number of rows that should still fit after the offset.
        average_row_length: Estimated bytes per row. When omitted it is
            measured from the first lines of the file, so the function no
            longer depends on a module-level ``average_row_length``.

    Returns:
        An int offset in ``[0, file_size - max_rows * average_row_length]``,
        or 0 when the file is too small to leave that much room.
    """
    file_size = os.path.getsize(file_path)

    if average_row_length is None:
        sample_limit = 100  # a short prefix is enough for an estimate
        sampled_bytes = 0
        sampled_lines = 0
        with open(file_path, 'rb') as f:
            for raw_line in f:
                sampled_bytes += len(raw_line)
                sampled_lines += 1
                if sampled_lines >= sample_limit:
                    break
        average_row_length = (
            math.ceil(sampled_bytes / sampled_lines) if sampled_lines else 0
        )

    # Clamp at 0: random.randint raises ValueError on a negative upper bound.
    upper_bound = max(0, int(file_size - max_rows * average_row_length))
    return random.randint(0, upper_bound)

In [None]:
def read_csv_fuc(file_path, min_rows=500, max_rows=1000, skip_header=True):
    """Read a random-sized run of consecutive CSV rows from a random offset.

    The file is streamed row by row; it is never loaded whole. Reading
    starts at the first full line after a random byte offset and wraps
    around to the beginning of the file when the end is reached.

    Args:
        file_path: Path of the CSV file.
        min_rows: Smallest number of rows to return.
        max_rows: Largest number of rows to return.
        skip_header: Skip the first line of the file when wrapping around.
            NOTE(review): the default assumes the CSV starts with a column
            header row -- confirm against the dataset, and pass ``False``
            if the first line is data.

    Returns:
        A list of between ``min_rows`` and ``max_rows`` rows, each a list of
        strings. Blank lines are skipped.

    Raises:
        ValueError: If the file contains no data rows at all.
    """
    import io  # local import: only this function needs it

    file_size = os.path.getsize(file_path)
    position = random.randint(0, file_size)
    num_rows = random.randint(min_rows, max_rows)
    rows = []
    # None until the first wrap-around; afterwards counts the rows collected
    # since the last wrap so an empty pass over the file can be detected.
    rows_since_wrap = None

    # Seek on the binary handle: arbitrary byte offsets are only well defined
    # there, and a newline byte never falls inside a multi-byte UTF-8
    # character, so decoding starts on a clean line boundary.
    with open(file_path, 'rb') as raw:
        raw.seek(position)
        raw.readline()  # drop the (probably partial) line we landed in
        text = io.TextIOWrapper(raw, newline='')
        reader = csv.reader(text)

        while len(rows) < num_rows:
            line = next(reader, None)
            if line is None:
                # Real end of file (a blank line yields [] instead of None).
                if rows_since_wrap == 0:
                    raise ValueError(f'No data rows found in {file_path!r}')
                text.seek(0)
                if skip_header:
                    text.readline()
                reader = csv.reader(text)
                rows_since_wrap = 0
                continue
            if not line:
                continue  # blank line: not a record and not end of file
            rows.append(line)
            if rows_since_wrap is not None:
                rows_since_wrap += 1
    return rows

In [None]:
def add_ts(rows, duration=5, start_ts=None):
    """Append an integer Unix timestamp to every row, spread over a window.

    The rows are divided into ``duration`` consecutive groups of (almost)
    equal size; group ``k`` receives ``start_ts + k``. For 600 rows and the
    default 5-second window, rows 1-120 get ``start_ts``, rows 121-240 get
    ``start_ts + 1``, and so on.

    Args:
        rows: List of rows (lists). Each row is modified in place.
        duration: Length of the window in seconds; must be at least 1.
        start_ts: Timestamp of the first group. Defaults to the current time
            in whole seconds since the epoch.

    Returns:
        The same ``rows`` list, each row now ending with its timestamp.

    Raises:
        ValueError: If ``duration`` is smaller than 1.
    """
    if duration < 1:
        raise ValueError('duration must be at least 1 second')
    if start_ts is None:
        start_ts = int(time())
    if not rows:
        return rows

    # Rounding up keeps the largest offset at duration - 1, so no timestamp
    # falls outside the window.
    rows_per_second = math.ceil(len(rows) / duration)
    for index, row in enumerate(rows):
        row.append(start_ts + index // rows_per_second)
    return rows

In [None]:
if __name__ == '__main__':
    # Entry point: publish one randomly chosen, timestamped batch of rows to
    # the Kafka topic every ``batch_interval`` seconds until interrupted.
    print('Publishing records..')
    producer = connect_kafka_producer()
    if producer is None:
        # Without a producer every send would fail, so stop right away
        # instead of looping forever.
        raise SystemExit('Could not connect to Kafka; nothing was published.')

    file_path = "./dataset/click_stream_rt.csv"
    batch_interval = 5  # seconds between the start of consecutive batches

    try:
        while True:
            batch_started = time()
            rows = read_csv_fuc(file_path)
            rows_ts = add_ts(rows)

            for row in rows_ts:
                print(row)
                publish_message(producer, topic, row)

            # Push out anything still buffered before waiting for the next batch.
            producer.flush()

            # Sleep only for what is left of the interval so the time spent
            # reading and sending does not stretch the batch period.
            sleep(max(0, batch_interval - (time() - batch_started)))
    except KeyboardInterrupt:
        print('Stopping producer.')
    finally:
        producer.close()