In [1]:
import os
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Spark release used to pick the matching Kafka connector artifact.
# Falls back to the release this notebook was written against when the
# environment does not export APACHE_SPARK_VERSION.
spark_version = os.environ.get('APACHE_SPARK_VERSION', '3.4.0')

# Derive the connector coordinate from spark_version instead of repeating a
# literal, so the connector cannot drift from the installed Spark release.
# The Scala suffix (_2.12) is still fixed here.
# NOTE: this must run before the SparkSession is created, because the submit
# arguments are only read when the JVM is launched.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    f"--packages org.apache.spark:spark-sql-kafka-0-10_2.12:{spark_version} pyspark-shell"
)

In [3]:
# Show the kernel's working directory. Uses an explicit call rather than the
# bare `pwd` automagic, which IPython only resolves in single-line cells and
# only while no variable named `pwd` exists.
os.getcwd()

'/home/jovyan/work'

In [4]:
# Display the Spark version string set in the configuration cell above.
spark_version

'3.4.0'

In [5]:
# Print the HOME directory of the user running the kernel.
home_dir = os.environ['HOME']
print(home_dir)

/home/jovyan


# Spark Session

In [6]:
# Create (or reuse) the session against the standalone master, with the
# executor sizing and session time zone this notebook expects.
spark = (
    SparkSession.builder
    .appName("KafkaConsumer")
    .master("spark://spark-master:17077")
    .config("spark.executor.instances", "3")
    .config("spark.executor.cores", "1")
    .config("spark.executor.memory", "4G")
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .getOrCreate()
)

# Notebook display settings: turn on eager evaluation of DataFrames ...
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
# ... and set the truncation width used for the displayed output to 100.
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100)


# Kafka settings

In [7]:
# Connection settings shared by the Kafka reader defined below.
kafka_config = {
    # Broker address handed to the reader as the bootstrap server list.
    "bootstrap.servers": "kafka:19092",
    # Consumer group name handed to the reader.
    "group.id": "seoulcity_consumer_group",
    # Topic the reader subscribes to.
    "topic.name": "seoulcity",
}

In [8]:
# Streaming source over the configured topic, reading from the earliest
# available offsets.
kafka_reader = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_config["bootstrap.servers"])
    # Kafka consumer properties are only forwarded when prefixed with "kafka.";
    # the previous un-prefixed "group.id" option was never passed to the
    # consumer. With a fixed group id, avoid running several queries with the
    # same id at once, since they can interfere with each other.
    .option("kafka.group.id", kafka_config["group.id"])
    .option("subscribe", kafka_config["topic.name"])
    .option("startingOffsets", "earliest")
    .load()
)

# Schema

In [9]:
def _string_fields(names, nullable=True):
    """Return one StringType StructField per name, preserving the given order."""
    return [StructField(name, StringType(), nullable=nullable) for name in names]


# Status block of the response.
_result_schema = StructType(
    _string_fields(["RESULT.CODE", "RESULT.MESSAGE"], nullable=False)
)

# Road traffic information: area-wide average.
# NOTE(review): "ROAD_TRFFIC_TIME" is kept exactly as originally written;
# confirm the spelling against the actual payload before changing it.
_avg_road_schema = StructType(
    _string_fields(
        ["ROAD_MSG", "ROAD_TRAFFIC_IDX", "ROAD_TRFFIC_TIME", "ROAD_TRAFFIC_SPD"],
        nullable=False,
    )
)

# Road traffic information: one entry per road link.
_road_link_schema = StructType(
    _string_fields([
        "LINK_ID",
        "ROAD_NM",
        "START_ND_CD",
        "START_ND_NM",
        "START_ND_XY",
        "END_ND_CD",
        "END_ND_NM",
        "END_ND_XY",
        "DIST",
        "SPD",
        "IDX",
        "XYLIST",
    ])
)

# Parking lots: one entry per parking lot.
_parking_lot_schema = StructType(
    _string_fields([
        "PRK_NM",
        "PRK_CD",
        "CPCTY",
        "CUR_PRK_CNT",
        "CUR_PRK_TIME",
        "CUR_PRK_YN",
        "PAY_YN",
        "RATES",
        "TIME_RATES",
        "ADD_RATES",
        "ADD_TIME_RATES",
        "ADDRESS",
        "ROAD_ADDR",
        "LNG",
        "LAT",
    ])
)

# Ttareungi (public bike share): one entry per bike station.
_sbike_spot_schema = StructType(
    _string_fields([
        "SBIKE_SPOT_NM",
        "SBIKE_SPOT_ID",
        "SBIKE_SHARED",
        "SBIKE_PARKING_CNT",
        "SBIKE_RACK_CNT",
        "SBIKE_X",
        "SBIKE_Y",
    ])
)

# Each *_STTS section wraps an array that carries the same name as the section.
_road_traffic_schema = StructType([
    StructField("AVG_ROAD_DATA", _avg_road_schema, nullable=False),
    StructField("ROAD_TRAFFIC_STTS", ArrayType(_road_link_schema), nullable=False),
])
_parking_schema = StructType([
    StructField("PRK_STTS", ArrayType(_parking_lot_schema), nullable=False),
])
_sbike_schema = StructType([
    StructField("SBIKE_STTS", ArrayType(_sbike_spot_schema), nullable=False),
])

_citydata_schema = StructType(
    _string_fields(["AREA_NM"], nullable=False)
    + [
        StructField("ROAD_TRAFFIC_STTS", _road_traffic_schema, nullable=False),
        StructField("PRK_STTS", _parking_schema, nullable=False),
        StructField("SBIKE_STTS", _sbike_schema, nullable=False),
    ]
)

# Top-level schema used to parse the JSON value of each Kafka record.
seoul_citydata_schema = StructType([
    StructField("list_total_count", StringType(), nullable=False),
    StructField("RESULT", _result_schema, nullable=False),
    StructField("CITYDATA", _citydata_schema, nullable=False),
])


In [10]:
# Parse the record value (cast to string) as JSON using the city-data schema.
_parsed_value = from_json(col("value").cast("string"), seoul_citydata_schema)

# Keep only the area name (exposed as `key`) and the three status sections.
# The original Kafka record key is selected in the first step but is not
# referenced by the final projection.
kafka_selector = (
    kafka_reader
    .select(
        col("key").cast("string"),
        _parsed_value.alias("seoulcity_rawdata"),
    )
    .selectExpr(
        "seoulcity_rawdata.CITYDATA.AREA_NM as key",
        "seoulcity_rawdata.CITYDATA.ROAD_TRAFFIC_STTS",
        "seoulcity_rawdata.CITYDATA.PRK_STTS",
        "seoulcity_rawdata.CITYDATA.SBIKE_STTS",
    )
)

In [None]:
def 