## Q1

In [2]:
# Run `rpk --version` inside the redpanda-1 compose service to show the installed rpk version.
!docker compose exec redpanda-1 rpk --version

rpk version v24.2.18 (rev f9a22d4430)


## Q2

In [9]:
# Create the `green-trips` topic by running rpk inside the redpanda-1 compose service.
!docker compose exec redpanda-1 rpk topic create green-trips

TOPIC        STATUS
green-trips  OK


## Q3

In [56]:
import json

from kafka import KafkaProducer

def json_serializer(data):
    """Encode ``data`` as JSON text and return it as UTF-8 bytes."""
    payload = json.dumps(data)
    return payload.encode('utf-8')

# Broker address handed to the producer as its bootstrap server.
server = 'localhost:9092'

# Producer whose message values are passed through json_serializer before sending.
producer = KafkaProducer(
    bootstrap_servers=[server],
    value_serializer=json_serializer
)

# Last expression of the cell, so its result (the connection status) is shown as the cell output.
producer.bootstrap_connected()

True

## Q4

In [4]:
from time import time
import json
import pandas as pd
from kafka import KafkaProducer

def json_serializer(data):
    """Serialize one record to UTF-8 encoded JSON bytes.

    Values that ``pd.isna`` reports as missing are written as JSON ``null``,
    and ``pd.Timestamp`` values are formatted as ``'%Y-%m-%d %H:%M:%S'``
    strings. All other values are passed to ``json.dumps`` unchanged.

    The input mapping is not modified: a cleaned copy is built, so the
    caller's record keeps its original values after serialization.

    Args:
        data: Mapping of field name to scalar value.

    Returns:
        bytes: The JSON document encoded as UTF-8.
    """
    def _to_jsonable(value):
        # Missing check comes first so NaT/NaN never reach strftime or json.dumps.
        if pd.isna(value):
            return None
        if isinstance(value, pd.Timestamp):
            return value.strftime('%Y-%m-%d %H:%M:%S')
        return value

    cleaned = {key: _to_jsonable(value) for key, value in data.items()}
    return json.dumps(cleaned).encode('utf-8')
    
# Only these columns are loaded from the CSV (passed as `usecols` below).
required_columns = [
    "lpep_pickup_datetime",
    "lpep_dropoff_datetime",
    "PULocationID",
    "DOLocationID",
    "passenger_count",
    "trip_distance",
    "tip_amount"
]

# Read the gzip-compressed CSV file into a DataFrame, keeping only
# `required_columns` and parsing the two datetime columns as dates.
csv_file_path = "data/green_tripdata_2019-10.csv.gz"

df = pd.read_csv(
    csv_file_path,
    usecols=required_columns,
    compression="gzip",
    parse_dates=['lpep_pickup_datetime', 'lpep_dropoff_datetime']
)

# Topic the rows are published to.
topic_name = "green-trips"

# Kafka producer; each message value is encoded by json_serializer.
producer = KafkaProducer(
    bootstrap_servers="localhost:9092",
    value_serializer=json_serializer
)

# Time the whole publish step: row conversion, sending, flushing and closing.
t0 = time()

# producer.send() is asynchronous: it returns a future and a delivery failure
# is reported through that future rather than raised here. Collect those
# failures so the success message is only printed when every row was delivered.
send_errors = []

try:
    dict_rows = df.to_dict(orient="records")
    # Send each row (message) to the topic
    for row in dict_rows:
        future = producer.send(topic_name, value=row)
        # The errback receives the exception if this message cannot be delivered.
        future.add_errback(send_errors.append)

    # Block until every buffered message has been sent or has failed.
    producer.flush()

    if send_errors:
        print(
            f"Error sending messages: {len(send_errors)} of {len(dict_rows)} "
            f"failed (first error: {send_errors[0]!r})"
        )
    else:
        print("All messages sent successfully!")

except Exception as e:
    # Errors raised synchronously (e.g. while building or serializing a row).
    print(f"Error sending messages: {e}")

finally:
    # Always release the producer's network resources, even after an error.
    producer.close()

t1 = time()
took = t1 - t0

print(f"Total time taken: {took:.2f} seconds")

All messages sent successfully!
Total time taken: 80.88 seconds


## Q5

For this question:
1. The messages must already be stored in the Kafka topic `green-trips` (produced in Q4).

2. Run the Apache Flink job (`session_job.py`):
    - `make session_job`

3. Query your PostgreSQL database:

    - I used CloudBeaver running in Docker.

    ```sql
    SELECT
      pulocationid AS pickup_location_id
      ,dolocationid AS dropoff_location_id
      ,streak_count
      ,session_start
      ,session_end
      ,(session_end - session_start) session_duration
    FROM
      location_streaks ls
    ORDER BY streak_count DESC
    LIMIT 1
    ```

Expected output:

| pickup_location_id | dropoff_location_id | streak_count | session_start | session_end | session_duration |
|---|---|---|---|---|---|
| 95 | 95 | 44 | 2019-10-16 18:18:42.000 | 2019-10-16 19:26:16.000 | 01:07:34 |