# Assignment 2B: Using real-time streaming data to predict pedestrian traffic

**Student Name: Sin Yee Neo**

**Student ID: 31340458**

Date: 10/02/2021

Environment: Python 3.6.0

## 1 Producing the data

Implement an Apache Kafka producer to simulate real-time data transfer from one repository to another. Every 5 seconds, the program sends one batch to the Kafka stream; each batch contains one day's worth of records from all sensors.

### 1.1 Import library

In [None]:
import pandas as pd
from time import sleep
from json import dumps
from kafka import KafkaProducer
import random
import datetime as dt
import csv
from datetime import datetime


### 1.2 Kafka stream producer

In [None]:
# function to get list of dict
def get_list_of_dict(keys, list_of_tuples):
    """
    Build one dict per entry of ``list_of_tuples`` by pairing it with ``keys``.

    Each entry is zipped positionally against ``keys``; pairing stops at the
    shorter of the two, so surplus keys or surplus values are dropped.
    Returns the dicts as a list, in the same order as ``list_of_tuples``.
    """
    records = []
    for values in list_of_tuples:
        record = dict(zip(keys, values))
        records.append(record)
    return records

# Function to read csv file into dictionary
def read_csv(fileName):
    """Read the pedestrian-count CSV file into a list of dicts.

    Intended for ``Streaming_Pedestrian_December_counts_per_hour.csv``. The
    first line of the file is treated as the header row; only the columns
    named in ``included_cols`` are kept and any other column is ignored.

    Args:
        fileName: Path of the CSV file to read.

    Returns:
        A list with one dict per data row, keyed by the names in
        ``included_cols`` (in that order). Values are left exactly as
        ``csv.DictReader`` yields them; no type conversion is done here.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If a data row has no entry for one of the expected columns.
    """
    included_cols = ['ID', 'Date_Time', 'Year', 'Month', 'Mdate', 'Day', 'Time', 'Sensor_ID', 'Sensor_Name', 'Hourly_Counts']

    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; otherwise line breaks embedded in quoted
    # fields are rewritten by the text layer before the parser sees them.
    with open(fileName, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        return [{col: row[col] for col in included_cols} for row in reader]

# Function to publish the message from the csv file
def publish_message(producer_instance, topic_name, data):
    """Hand ``data`` to ``producer_instance.send`` for ``topic_name`` and report the outcome.

    This is best-effort: any exception raised while sending (or while
    printing the success line) is caught and printed rather than propagated.
    The success line is printed as soon as ``send`` returns.
    """
    try:
        producer_instance.send(topic_name, data)
        success_line = 'Message published successfully. Data: {}'.format(str(data))
        print(success_line)
    except Exception as err:
        # Report the failure and carry on so one bad record does not stop the stream.
        print('Exception in publishing message.')
        print(err)
        
def connect_kafka_producer():
    """Create a ``KafkaProducer`` connected to the broker at ``localhost:9092``.

    Message values are serialised with ``json.dumps`` and encoded as ASCII
    bytes before being sent.

    Returns:
        The connected ``KafkaProducer``, or ``None`` when construction raised
        an ``Exception`` (the error is printed before returning).
    """
    try:
        return KafkaProducer(bootstrap_servers=['localhost:9092'],
                             value_serializer=lambda x: dumps(x).encode('ascii'),
                             api_version=(0, 10))
    except Exception as ex:
        # Returning from the handler rather than from a ``finally`` clause
        # keeps KeyboardInterrupt/SystemExit propagating; a ``return`` inside
        # ``finally`` would silently discard them.
        print('Exception while connecting Kafka.')
        print(str(ex))
        return None

    
if __name__ == '__main__':
    # Publish the December pedestrian counts to Kafka, one calendar day of
    # records per batch, pausing 5 seconds between batches.
    topic = 'pedestrain_count'
    cRows = read_csv('Streaming_Pedestrian_December_counts_per_hour.csv')

    print('Publishing record/s..')
    producer = connect_kafka_producer()
    if producer is None:
        # The connection error has already been printed; without a producer
        # every single record would only print a publishing failure.
        raise SystemExit('Kafka producer unavailable; nothing was published.')

    date_parts = ('date', 'time1', 'am/pm')
    int_cols = ('ID', 'Year', 'Mdate', 'Time', 'Sensor_ID', 'Hourly_Counts')

    # Single pass over the rows:
    #  * split 'Date_Time' on whitespace and store the pieces on the row as
    #    'date', 'time1' and 'am/pm' ('Date_Time' itself is left untouched);
    #  * convert the numeric columns from CSV strings to int;
    #  * group the rows by their 'date' value, preserving file order.
    rows_by_date = {}
    for row in cRows:
        row.update(zip(date_parts, row['Date_Time'].split()))
        for col in int_cols:
            row[col] = int(row[col])
        rows_by_date.setdefault(row['date'], []).append(row)

    # Unique dates in chronological order, and the matching per-day batches.
    uniq_date = sorted(rows_by_date, key=lambda date: datetime.strptime(date, "%m/%d/%Y"))
    new_date = [rows_by_date[date] for date in uniq_date]

    # Send the data day by day. Iterating over the batches ends cleanly after
    # the last day instead of indexing past the end of the list.
    last_day = len(new_date) - 1
    for day_index, to_send in enumerate(new_date):
        for msg in to_send:
            publish_message(producer, topic, msg)

        # Wait between batches only; there is nothing to wait for after the last one.
        if day_index < last_day:
            sleep(5)

    # send() only queues records, so block until everything queued has been
    # handed to the broker before the script finishes.
    producer.flush()