In [1]:
import json
import uuid

from kafka import KafkaProducer, KafkaAdminClient
from kafka.admin.new_topic import NewTopic
from kafka.errors import TopicAlreadyExistsError

import os
import pandas as pd
import datetime
import threading
from pathlib import Path
import time
import s3fs
import pyarrow.parquet as pq
from collections import namedtuple

In [2]:
# Connection settings plus the student identity used to namespace Kafka resources.
config = {
    'bootstrap_servers': ['kafka.kafka.svc.cluster.local:9092'],
    'first_name': 'Reenie',
    'last_name': 'Christudass',
}

# The client id and the topic prefix are the same string: last name immediately
# followed by first name, with no separator.
config['client_id'] = config['last_name'] + config['first_name']
config['topic_prefix'] = config['client_id']

print(config)

{'bootstrap_servers': ['kafka.kafka.svc.cluster.local:9092'], 'first_name': 'Reenie', 'last_name': 'Christudass', 'client_id': 'ChristudassReenie', 'topic_prefix': 'ChristudassReenie'}


In [3]:
def create_kafka_topic(topic_name, config=config, num_partitions=1, replication_factor=1):
    """Create the Kafka topic ``<topic_prefix>-<topic_name>`` unless it already exists.

    Args:
        topic_name: Suffix joined to ``config['topic_prefix']`` with a hyphen
            to form the full topic name.
        config: Mapping that provides ``bootstrap_servers``, ``client_id`` and
            ``topic_prefix``. Defaults to the notebook-level ``config``.
        num_partitions: Number of partitions requested for the new topic.
        replication_factor: Replication factor requested for the new topic.

    Prints whether the topic was created or was already present. Any error
    other than ``TopicAlreadyExistsError`` propagates to the caller.
    """
    bootstrap_servers = config['bootstrap_servers']
    client_id = config['client_id']
    topic_prefix = config['topic_prefix']
    name = '{}-{}'.format(topic_prefix, topic_name)

    admin_client = KafkaAdminClient(
        bootstrap_servers=bootstrap_servers,
        client_id=client_id
    )

    try:
        topic = NewTopic(
            name=name,
            num_partitions=num_partitions,
            replication_factor=replication_factor
        )
        admin_client.create_topics(new_topics=[topic])
        print('Created topic "{}"'.format(name))
    except TopicAlreadyExistsError:
        print('Topic "{}" already exists'.format(name))
    finally:
        # Always release the broker connection; without this every call
        # (including failed ones) leaves an open admin client behind.
        admin_client.close()

In [4]:
# Create the "<topic_prefix>-simple" topic (prints a notice if it already exists).
create_kafka_topic('simple')

Topic "ChristudassReenie-simple" already exists


In [5]:
from kafka.admin import KafkaAdminClient, NewTopic

# Take the broker list from the shared `config` instead of repeating the address.
bootstrap_servers = config['bootstrap_servers']
admin_client = KafkaAdminClient(bootstrap_servers=bootstrap_servers, client_id='test')

try:
    # Get list of all topics
    all_topics = admin_client.list_topics()
finally:
    # The client is only needed for this one lookup; close it so re-running
    # the cell does not accumulate open broker connections.
    admin_client.close()

# Keep only the topics whose name contains the keyword.
keyword = 'Reenie'
result = [x for x in all_topics if keyword in x]
print(result)

['ChristudassReenie-accelerations', 'ChristudassReenie-locations', 'ChristudassReenie-simple']


In [6]:
from pathlib import Path
# Assignment folder, given as a relative path.
current_path = "dsc650/dsc650/assignments/assignment09"
print(current_path)

current_dir = Path(current_path)
checkpoint_dir = current_dir / 'checkpoints'

# Separate checkpoint directories for the locations and accelerations data.
locations_checkpoint_dir = checkpoint_dir / 'locations'
accelerations_checkpoint_dir = checkpoint_dir / 'accelerations'

dsc650/dsc650/assignments/assignment09


In [14]:
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

# Get the Spark session for this notebook (an already-running one is reused).
spark = (
    SparkSession.builder
    .appName("Assignment09")
    .getOrCreate()
)

# Streaming DataFrame over the "<topic_prefix>-locations" Kafka topic. The
# broker list and the topic name are derived from `config` so this cell cannot
# drift from the topic-creation helper, which reads the same settings.
df_locations = (
    spark.readStream
    .format("kafka")
    # Kafka expects the brokers as one comma-separated string.
    .option("kafka.bootstrap.servers", ",".join(config['bootstrap_servers']))
    .option("subscribe", '{}-locations'.format(config['topic_prefix']))
    .load()
)



In [None]:
# Forward the locations stream to the "<topic_prefix>-simple" Kafka topic.
loc_query = (
    df_locations
    # Pass on only the message key and payload. The Kafka source also yields
    # metadata columns (topic, partition, offset, ...); on Spark 3.x the sink
    # treats a `partition` column as the target partition, which fails when the
    # destination topic has fewer partitions than the source.
    .select("key", "value")
    .writeStream
    .format("kafka")
    # Path objects are turned into a plain string explicitly.
    .option("checkpointLocation", str(locations_checkpoint_dir))
    # Use every configured broker, not just the first one.
    .option("kafka.bootstrap.servers", ",".join(config['bootstrap_servers']))
    .option("topic", '{}-simple'.format(config['topic_prefix']))
    .start()
)

try:
    # Blocks until the query stops, fails, or the kernel is interrupted.
    loc_query.awaitTermination()
finally:
    # Without this, interrupting the cell leaves the query running in the
    # background and still holding the checkpoint directory, so a re-run
    # starts a competing query.
    loc_query.stop()