In [1]:
import os
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON

In [2]:
spark_version = os.environ['APACHE_SPARK_VERSION']
os.environ['PYSPARK_SUBMIT_ARGS']="--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.0 pyspark-shell"

In [3]:
home_jovyan = "/home/jovyan"

In [4]:
work_data = f"{home_jovyan}/work/data"
work_dir=!pwd
work_dir = work_dir[0]

In [5]:
spark_version

'3.4.0'

In [6]:
print(os.environ['HOME'])

/home/jovyan


# Spark Session

In [7]:
spark = SparkSession.builder\
.appName("KafkaConsumer")\
.master("spark://spark-master:17077")\
.config("spark.executor.instances","3")\
.config("spark.executor.cores","1")\
.config("spark.executor.memory","4G")\
.config("spark.sql.session.timeZone","Asia/Seoul")\
.getOrCreate()

spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # display enabled
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100) # display output columns size


# Kafka setting

In [8]:
# configuration
kafka_config={
    "bootstrap.servers":"kafka:19092",
    "group.id":"seoulcity_consumer_group",
    "topic.name":"seoulcity"
}

In [9]:
kafka_reader = spark.readStream\
.format("kafka")\
.option("kafka.bootstrap.servers",kafka_config["bootstrap.servers"])\
.option("group.id",kafka_config["group.id"])\
.option("subscribe",kafka_config["topic.name"])\
.option("startingOffsets","earliest")\
.load()

# Schema

In [15]:
seoul_citydata_schema = StructType([
    StructField("SeoulRtd.citydata", StructType([
        # StructField("list_total_count", StringType(), nullable=False),
        # StructField("RESULT", StructType([
        #     StructField("RESULT.CODE", StringType(), nullable=False),
        #     StructField("RESULT.MESSAGE", StringType(), nullable=False)
        # ]), nullable=False),
        StructField("CITYDATA", StructType([
            StructField("AREA_NM", StringType(), nullable=False),
            #도로소통정보
            StructField("ROAD_TRAFFIC_STTS", StructType([
                StructField("AVG_ROAD_DATA", StructType([
                    StructField("ROAD_MSG", StringType(), nullable=False),
                    StructField("ROAD_TRAFFIC_IDX", StringType(), nullable=False),
                    StructField("ROAD_TRFFIC_TIME", StringType(), nullable=False),
                    StructField("ROAD_TRAFFIC_SPD", StringType(), nullable=False),
                ]), nullable=False),
                StructField("ROAD_TRAFFIC_STTS", ArrayType(StructType([
                    StructField("LINK_ID", StringType()),
                    StructField("ROAD_NM", StringType()),
                    # StructField("START_ND_CD", StringType()),
                    StructField("START_ND_NM", StringType()),
                    # StructField("START_ND_XY", StringType()),
                    # StructField("END_ND_CD", StringType()),
                    StructField("END_ND_NM", StringType()),
                    # StructField("END_ND_XY", StringType()),
                    StructField("DIST", StringType()),
                    StructField("SPD", StringType()),
                    StructField("IDX", StringType()),
                    StructField("XYLIST", StringType())
                ])), nullable=False)
            ]), nullable=False),
            #주차장
            StructField("PRK_STTS", StructType([
                StructField("PRK_STTS", ArrayType(StructType([
                    StructField("PRK_NM", StringType()),
                    # StructField("PRK_CD", StringType()),
                    StructField("CPCTY", StringType()),
                    StructField("CUR_PRK_CNT", StringType()),
                    StructField("CUR_PRK_TIME", StringType()),
                    StructField("CUR_PRK_YN", StringType()),
                    StructField("PAY_YN", StringType()),
                    StructField("RATES", StringType()),
                    StructField("TIME_RATES", StringType()),
                    StructField("ADD_RATES", StringType()),
                    StructField("ADD_TIME_RATES", StringType()),
                    StructField("ADDRESS", StringType()),
                    StructField("ROAD_ADDR", StringType()),
                    StructField("LNG", StringType()),
                    StructField("LAT", StringType()),
                ])), nullable=False)
            ]), nullable=False),
            #따릉이 
            StructField("SBIKE_STTS", StructType([
                StructField("SBIKE_STTS", ArrayType(StructType([
                    StructField("SBIKE_SPOT_NM", StringType()),
                    # StructField("SBIKE_SPOT_ID", StringType()),
                    StructField("SBIKE_SHARED", StringType()),
                    StructField("SBIKE_PARKING_CNT", StringType()),
                    StructField("SBIKE_RACK_CNT", StringType()),
                    StructField("SBIKE_X", StringType()),
                    StructField("SBIKE_Y", StringType()),
                    ])), nullable=False)
                ]), nullable=False),
            ]), nullable=False)
    ]), nullable=False)
])


In [16]:
kafka_selector= (
    kafka_reader
    .select(
        col("key").cast("string"),
        from_json(col("value").cast("string"),seoul_citydata_schema).alias("seoulcity_rawdata")
    ).selectExpr("seoulcity_rawdata.SeoulRtd.citydata.CITYDATA.AREA_NM as key","seoulcity_rawdata.SeoulRtd.citydata.CITYDATA.ROAD_TRAFFIC_STTS",
                 "seoulcity_rawdata.SeoulRtd.citydata.CITYDATA.PRK_STTS","seoulcity_rawdata.SeoulRtd.citydata.CITYDATA.SBIKE_STTS")
)

AnalysisException: [FIELD_NOT_FOUND] No such struct field `CITYDATA` in `SeoulRtd`.`citydata`.; line 1 pos 0

In [None]:

p_kafka_selector = (
    kafka_selector
    .withColumn("value", to_json(struct("key","ROAD_TRAFFIC_STTS","PRK_STTS","SBIKE_STTS")))
)

In [13]:
def displayStatus(name, query, iterations, sleep_secs):
    from time import sleep
    i = 1
    for x in range(iterations):
        clear_output(wait=True)      # Output Cell 의 내용을 지웁니다
        display('[' + name + '] Iteration: '+str(i)+', Status: '+query.status['message'])
        display(query.lastProgress)  # 마지막 수행된 쿼리의 상태를 출력합니다
        sleep(sleep_secs)            # 지정된 시간(초)을 대기합니다
        i += 1

In [14]:
qname = "kafkaQ"
kafkaWriter_origin = (
    p_kafka_selector.select("key", "value")
    .writeStream
    .queryName(qname)
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:19092")
    .option("topic", "seoulcity_visual")
    .outputMode("append")
)

checkpointLocation = f"{work_dir}/tmp/{qname}"
!rm -rf $checkpointLocation
kafkaTrigger = (
    kafkaWriter_origin
    .trigger(processingTime="5 second")
    .option("checkpointLocation", checkpointLocation)
)

kafkaQuery = kafkaTrigger.start()

displayStatus(qname, kafkaQuery, 1000, 5)
    
kafkaQuery.stop()

'[kafkaQ] Iteration: 97, Status: Waiting for next trigger'

{'id': 'ca8536cc-c7d8-4595-a561-3ba80414502c',
 'runId': 'ab61f22c-9b8b-4612-93b6-53ebf25e58c1',
 'name': 'kafkaQ',
 'timestamp': '2023-05-31T08:02:10.000Z',
 'batchId': 7,
 'numInputRows': 1,
 'inputRowsPerSecond': 0.2,
 'processedRowsPerSecond': 1.1061946902654867,
 'durationMs': {'addBatch': 611,
  'commitOffsets': 136,
  'getBatch': 0,
  'latestOffset': 6,
  'queryPlanning': 15,
  'triggerExecution': 904,
  'walCommit': 134},
 'stateOperators': [],
 'sources': [{'description': 'KafkaV2[Subscribe[seoulcity]]',
   'startOffset': {'seoulcity': {'2': 458, '1': 466, '0': 470}},
   'endOffset': {'seoulcity': {'2': 458, '1': 466, '0': 471}},
   'latestOffset': {'seoulcity': {'2': 458, '1': 466, '0': 471}},
   'numInputRows': 1,
   'inputRowsPerSecond': 0.2,
   'processedRowsPerSecond': 1.1061946902654867,
   'metrics': {'avgOffsetsBehindLatest': '0.0',
    'maxOffsetsBehindLatest': '0',
    'minOffsetsBehindLatest': '0'}}],
 'sink': {'description': 'org.apache.spark.sql.kafka010.KafkaSour

KeyboardInterrupt: 