# Data streaming pipeline with Kafka for livetolldata

In [5]:
import boto3
import json
from datetime import datetime
from kafka import KafkaConsumer
import mysql.connector
import pandas as pd

In [2]:
# Directory that holds the pipeline's data files (the SQL schema lives under data/)
DATA_PATH = "./data"

In [3]:
def get_secret(secret_name, region_name="us-east-1"):
    """Fetch a secret from AWS Secrets Manager and return it decoded from JSON.

    Args:
        secret_name: Identifier passed as ``SecretId`` to Secrets Manager.
        region_name: AWS region of the Secrets Manager endpoint.

    Returns:
        The object obtained by JSON-decoding the response's ``SecretString``.
    """
    secrets_client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )
    response = secrets_client.get_secret_value(SecretId=secret_name)
    # The secret payload is stored as a JSON document in 'SecretString'.
    return json.loads(response['SecretString'])

In [4]:
# Load the RDS MySQL credentials from the "wysde" secret instead of hardcoding them
creds = get_secret("wysde")
USERNAME = creds["RDS_MYSQL_USERNAME"]
PASSWORD = creds["RDS_MYSQL_PASSWORD"]
HOST = creds["RDS_MYSQL_HOST"]

# Kafka topic to consume from and MySQL database to load into
TOPIC = 'toll'
DATABASE = 'tolldata'

In [None]:
# Creating database and schema (use terminal if your notebook do not support input)
!mysql --host=$HOST --user=$USERNAME --port=3306 --password < data/schema.sql

In [None]:
# Start the Kafka services: ZooKeeper first, then the Kafka broker (use a terminal).
# NOTE(review): both scripts appear to run in the foreground, so each one likely
# needs its own terminal (or the scripts' daemon mode) — confirm for your install.
!zookeeper-server-start.sh ~/kafka_2.12-3.2.0/config/zookeeper.properties

!kafka-server-start.sh ~/kafka_2.12-3.2.0/config/server.properties

In [9]:
# Create a topic named toll
!kafka-topics.sh --create --topic toll --bootstrap-server localhost:9092

Created topic toll.


In [None]:
# Run the Python producer script (use a new terminal).
# The consumer side is implemented in the "Consumer" section of this notebook.
!python src/producer.py

## Consumer

Streaming data consumer

In [12]:
# Open the MySQL connection and cursor used by the consumer loop.
print("Connecting to the database")
try:
    connection = mysql.connector.connect(host=HOST, database=DATABASE, user=USERNAME, password=PASSWORD)
except Exception:
    print("Could not connect to database. Please check credentials")
    # Re-raise so the real cause is visible and the cell stops here, instead of
    # failing one line later with a NameError (or reusing a stale connection).
    raise
else:
    print("Connected to database")
    # Only create the cursor once the connection is known to be good.
    cursor = connection.cursor()

Connecting to the database
Connected to database


In [13]:
print("Connecting to Kafka")
# Subscribe to TOPIC. No bootstrap_servers argument is passed, so the client
# library's default broker address is used — presumably localhost:9092, matching
# the topic-creation command; confirm.
consumer = KafkaConsumer(TOPIC)
print("Connected to Kafka")

Connecting to Kafka
Connected to Kafka


In [15]:
print(f"Reading messages from the topic {TOPIC}")

# The statement names no columns, so the value order below must match the
# column order of the livetolldata table.
insert_sql = "insert into livetolldata values(%s,%s,%s,%s)"

try:
    for msg in consumer:
        try:
            # Extract the payload and split it into its four comma-separated fields
            message = msg.value.decode("utf-8")
            (timestamp, vehicle_id, vehicle_type, plaza_id) = message.split(",")

            # Transform the date format to suit the database schema
            dateobj = datetime.strptime(timestamp, '%a %b %d %H:%M:%S %Y')
        except ValueError as err:
            # Bad encoding, wrong field count or an unparsable date all raise
            # ValueError; skip that message rather than stopping the stream.
            print(f"Skipping malformed message {msg.value!r}: {err}")
            continue
        timestamp = dateobj.strftime("%Y-%m-%d %H:%M:%S")

        # Load the row into MySQL; report success only after the commit
        cursor.execute(insert_sql, (timestamp, vehicle_id, vehicle_type, plaza_id))
        connection.commit()
        print(f"A {vehicle_type} was inserted into the database")
except KeyboardInterrupt:
    # Interrupting the kernel is the normal way to stop this endless stream;
    # end the cell cleanly so the following cells can still run.
    print("Stopped reading messages")

Reading messages from the topic toll
A car was inserted into the database
A car was inserted into the database
A car was inserted into the database
A car was inserted into the database
A car was inserted into the database
A car was inserted into the database
A car was inserted into the database


In [None]:
# Close the MySQL connection that the consumer cells opened
connection.close()

In [7]:
# Verify that the streamed rows reached the livetolldata table.
# A separate short-lived connection is used and always closed afterwards.
conn = mysql.connector.connect(host=HOST, user=USERNAME, password=PASSWORD, database=DATABASE, charset='utf8mb4')
try:
    livetolldata_sample = pd.read_sql_query("""select * from livetolldata limit 10""", conn)
finally:
    conn.close()

# Last expression of the cell, so the notebook renders the sample as a table
livetolldata_sample

Unnamed: 0,timestamp,vehicle_id,vehicle_type,toll_plaza_id
0,2022-12-18 00:20:11,2514976,car,4001
1,2022-12-18 00:20:13,6995343,car,4006
2,2022-12-18 00:20:14,2462246,car,4006
3,2022-12-18 00:20:15,4392503,car,4010
4,2022-12-18 00:20:16,5229321,car,4003
5,2022-12-18 00:20:18,1181848,car,4004
6,2022-12-18 00:21:00,1649677,car,4003
7,2022-12-18 00:21:01,9259551,car,4007
8,2022-12-18 00:21:01,6624235,car,4005
9,2022-12-18 00:21:01,4604311,car,4009


In [None]:
# Signal the end of the notebook run
print("Successfully completed Data Pipeline")