In [1]:
import json
from kafka import KafkaProducer
import pandas as pd
from time import time
import numpy as np

In [2]:
def json_serializer(data) -> bytes:
    """Encode ``data`` as JSON text and return it as UTF-8 bytes.

    Used as the producer's ``value_serializer``, so ``data`` must be something
    ``json.dumps`` accepts (dicts, lists, strings, numbers, booleans, None).
    """
    payload = json.dumps(data)
    return payload.encode("utf-8")

In [3]:
# Kafka topic that the trip records are published to.
topic_name = "green-trips"
# Bootstrap broker address handed to the producer.
server = "localhost:9092"

In [4]:
# Single producer used by the cells below; message values are passed through
# json_serializer, so they can be handed to send() as plain Python objects.
producer = KafkaProducer(value_serializer=json_serializer, bootstrap_servers=[server])

In [5]:
# Sanity check before sending anything: displays whether the producer reports
# a connection to the bootstrap broker configured above.
producer.bootstrap_connected()

True

In [6]:
# Gzip-compressed CSV of green-taxi trips (the file name indicates 2019-10).
data_url = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-10.csv.gz"

# Pickup/dropoff columns are parsed as datetimes while reading.
parse_dates = ["lpep_pickup_datetime", "lpep_dropoff_datetime"]

# "Int64"/"Float64" (capitalised) are requested so that columns with missing
# values keep their integer/float type instead of being upcast.
dtypes = {
    **dict.fromkeys(["PULocationID", "DOLocationID", "passenger_count"], "Int64"),
    **dict.fromkeys(["trip_distance", "tip_amount"], "Float64"),
}

# Only these columns are read: the two datetime columns followed by the typed ones.
cols = [*parse_dates, *dtypes]

df = pd.read_csv(
    data_url,
    usecols=cols,
    dtype=dtypes,
    parse_dates=parse_dates,
)
df.head()

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count,trip_distance,tip_amount
0,2019-10-01 00:26:02,2019-10-01 00:39:58,112,196,1,5.88,0.0
1,2019-10-01 00:18:11,2019-10-01 00:22:38,43,263,1,0.8,0.0
2,2019-10-01 00:09:31,2019-10-01 00:24:47,255,228,2,7.5,0.0
3,2019-10-01 00:37:40,2019-10-01 00:41:49,181,181,1,0.9,0.0
4,2019-10-01 00:08:13,2019-10-01 00:17:56,97,188,1,2.52,2.26


In [7]:
# Convert Timestamp columns to strings for JSON serialization.
#
# Each column goes through pd.to_datetime first: on the first run the columns
# are already datetimes and pass through unchanged; on a re-run they hold the
# formatted strings, which are parsed back. Without this, running the cell a
# second time fails because the `.dt` accessor is not available on strings.
date_fmt = "%Y-%m-%d %H:%M:%S"
for col in parse_dates:
    df[col] = pd.to_datetime(df[col]).dt.strftime(date_fmt)
df.head()

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count,trip_distance,tip_amount
0,2019-10-01 00:26:02,2019-10-01 00:39:58,112,196,1,5.88,0.0
1,2019-10-01 00:18:11,2019-10-01 00:22:38,43,263,1,0.8,0.0
2,2019-10-01 00:09:31,2019-10-01 00:24:47,255,228,2,7.5,0.0
3,2019-10-01 00:37:40,2019-10-01 00:41:49,181,181,1,0.9,0.0
4,2019-10-01 00:08:13,2019-10-01 00:17:56,97,188,1,2.52,2.26


In [8]:
# Convert NAs to None for JSON serialization
# json.dumps writes None as `null`; a float NaN would be written as the bare
# token `NaN`, which is not valid JSON.
# NOTE(review): the fillna(np.nan) step presumably normalises the missing-value
# markers of the nullable columns so the single replace() catches all of them —
# confirm on the pandas version in use that the resulting records hold None.
# `df` is reassigned here, so frames displayed by earlier cells are stale.
df = df.fillna(value=np.nan).replace([np.nan], [None])
df.tail()

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count,trip_distance,tip_amount
476381,2019-10-31 23:30:00,2019-11-01 00:00:00,65,102,,7.04,0.0
476382,2019-10-31 23:03:00,2019-10-31 23:24:00,129,136,,0.0,0.0
476383,2019-10-31 23:02:00,2019-10-31 23:23:00,61,222,,3.9,0.0
476384,2019-10-31 23:42:00,2019-10-31 23:56:00,76,39,,3.08,0.0
476385,2019-10-31 23:23:00,2019-10-31 23:56:00,56,215,,6.84,0.0


In [14]:
# Number of rows to publish. Slicing with None keeps every row, so set this to
# None to send the whole DataFrame.
# TODO: the cap of 100 was left as a placeholder — raise it (or use None) for a full run.
max_rows = 100
# A sample message is printed every `log_every` rows as a progress indicator.
log_every = 100000


def report_send_failure(error, message):
    """Print the error raised for a message together with the message itself."""
    print(f"Failed to write to kafka with error {error} and data {message}")


rows = df.to_dict(orient="records")
rows_to_send = rows[:max_rows]

t0 = time()
for i, message in enumerate(rows_to_send):
    if i % log_every == 0:
        print(f"Sending row #{i} with message:\n{message}\n\n")
    try:
        # send() is asynchronous: it returns a future, and the record is
        # delivered later in the background. Only errors raised while
        # handing the record over surface here.
        future = producer.send(topic_name, value=message)
    except Exception as e:
        report_send_failure(e, message)
    else:
        # Delivery failures are reported through the future, so attach an
        # errback; `failed=message` binds the current row to the callback.
        future.add_errback(
            lambda error, failed=message: report_send_failure(error, failed)
        )

# Block until every buffered record has been sent (or has failed).
producer.flush()

t1 = time()
took = t1 - t0
print(f"Time to send {len(rows_to_send)} rows and flush: {took:.02f}")

Sending row #0 with message:
{'lpep_pickup_datetime': '2019-10-01 00:26:02', 'lpep_dropoff_datetime': '2019-10-01 00:39:58', 'PULocationID': 112, 'DOLocationID': 196, 'passenger_count': 1, 'trip_distance': 5.88, 'tip_amount': 0.0}


Time to send the entire dataset and flush: 0.01


In [None]:
# (rows, columns) of the DataFrame the records were built from.
df.shape