# Flight Producer
The following code reads the given data and transforms it into a format that `Kafka` can consume, which is why it is converted from CSV into lists of dictionaries. This information is sent to a topic called `flight_Topic`, and it is divided so that each batch has a random number of records, using two different timestamps.

In order to use the ML pipeline in the streaming application, we need to wrangle the data by deleting all the NA values.

In [None]:
# import statements
from time import sleep
from json import dumps
from kafka import KafkaProducer
import random
import datetime as dt
import pandas as pd
import glob
import os


def read_csv(path):
    """Load every CSV file matching the glob pattern `path` into one DataFrame.

    The matched files are read with pandas and concatenated. The
    cancellation-reason and delay-breakdown columns listed below are then
    removed, and every row that still contains a missing value is dropped.
    """
    unwanted_columns = [
        'CANCELLATION_REASON',
        'AIR_SYSTEM_DELAY',
        'SECURITY_DELAY',
        'AIRLINE_DELAY',
        'LATE_AIRCRAFT_DELAY',
        'WEATHER_DELAY',
    ]
    matching_files = glob.glob(os.path.join('', path))
    frames = [pd.read_csv(csv_file) for csv_file in matching_files]
    combined = pd.concat(frames)
    return combined.drop(columns=unwanted_columns).dropna()

def publish_message(producer_instance, topic_name, data):
    """Send `data` to `topic_name` through `producer_instance`, best-effort.

    Any exception raised by the send call is reported on stdout instead of
    being propagated, so this function always returns None.
    """
    try:
        producer_instance.send(topic_name, data)
    except Exception as err:
        print('Exception in publishing message.', str(err), sep='\n')
        
def connect_kafka_producer():
    """Create a KafkaProducer for the broker at localhost:9092.

    Values handed to the producer are serialized with `json.dumps` and
    encoded as ASCII bytes.

    Returns:
        The connected producer, or None when construction raised an
        Exception (the error is printed to stdout).
    """
    try:
        return KafkaProducer(bootstrap_servers=['localhost:9092'],
                             value_serializer=lambda x: dumps(x).encode('ascii'),
                             api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
        # Return from the handler rather than from a `finally` clause: a
        # `return` inside `finally` would also discard KeyboardInterrupt and
        # SystemExit raised while connecting.
        return None
    
def getFlightRecords(df, keys):
    """
    Split a dataframe into one list of record dictionaries per key.

    For every value in `keys` (in order), the rows whose DAY_OF_WEEK equals
    that value are converted to a list of {column: value} dictionaries.
    The result is a list of those lists, aligned with `keys`; a key with no
    matching rows yields an empty list.
    """
    records_per_key = []
    for key in keys:
        same_day_rows = df.loc[df.DAY_OF_WEEK == key]
        records_per_key.append(same_day_rows.to_dict(orient='records'))
    return records_per_key
    
    
if __name__ == '__main__':
    # Stream the flight records to Kafka forever, one batch every 5 seconds.
    #
    # On every pass, each key contributes two consecutive, non-overlapping
    # slices of its records: sub-batch A (70-100 rows) and sub-batch B
    # (5-10 rows). All A slices form batch X and all B slices form batch Y.
    # The message published on pass i is X_i followed by Y_(i-1), so the Y
    # records carry the timestamp of the pass in which they were sliced.

    topic = 'flight_Topic'
    print('Publishing records..')
    flightProducer = connect_kafka_producer()
    if flightProducer is None:
        # connect_kafka_producer() already printed the cause. Without a
        # producer every publish below would fail, so stop here instead of
        # looping forever.
        raise SystemExit('Could not connect to Kafka; no records were published.')

    keyFlights = list(range(1, 8))  # List of all possible keys (DAY_OF_WEEK 1..7)

    flights = read_csv('./flight-delays/flight1.csv')  # Read the CSV file

    # flightRecords[key - 1] is the list of record dicts for that key.
    flightRecords = getFlightRecords(flights, keyFlights)

    # Wrap-around bound for the cursor: the length of the longest per-key
    # list. Comparing against len(flightRecords) (the number of keys, 7)
    # would reset the cursor after every pass and re-send the same rows.
    longest_key_length = max(len(records) for records in flightRecords)

    # NOTE: start_index is shared by all keys, so each key's window starts
    # where the previous key's window ended.
    start_index = 0
    temp_Y = []  # Batch Y of the previous pass (empty on the first pass)

    while True:

        X = []  # Concatenation of every key's sub-batch A for this pass
        Y = []  # Concatenation of every key's sub-batch B for this pass

        for key in keyFlights:

            ts = {'ts': int(dt.datetime.now().timestamp())}  # current timestamp (epoch seconds)

            A = random.randrange(70, 101)  # size of sub-batch A: 70..100
            B = random.randrange(5, 11)  # size of sub-batch B: 5..10

            # Sub-batch A is [start_index, split_index) and sub-batch B is
            # [split_index, end_index), so the two never overlap. Slices
            # that run past the end of the list are simply shorter or empty.
            split_index = start_index + A
            end_index = split_index + B
            records = flightRecords[key - 1]

            # Prepend the timestamp to a copy of every record; the cached
            # record dictionaries themselves are left untouched.
            X += [{**ts, **record} for record in records[start_index:split_index]]
            Y += [{**ts, **record} for record in records[split_index:end_index]]

            start_index = end_index  # Advance so the next window does not repeat rows

        # Send batch X_i together with Y_(i-1); on the first pass temp_Y is
        # empty, so only X is sent.
        data = X + temp_Y
        temp_Y = Y

        publish_message(flightProducer, topic, data)

        # Start again from the beginning once the cursor has passed the end
        # of the longest per-key list.
        if start_index >= longest_key_length:
            start_index = 0

        sleep(5)

Publishing records..
