In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON
import os

In [2]:
# Filesystem locations used by this notebook.
home="/home/jovyan"
# Data directory located under the home directory.
data=f"{home}/work/data"
# IPython shell capture: the output lines of `pwd` come back as a list-like object,
# so work_dir is not a plain string at this point.
work_dir=!pwd


In [3]:
# Version of the Spark installation. Falls back to the previously hard-coded 3.4.0
# when APACHE_SPARK_VERSION is not exported.
spark_version = os.environ.get('APACHE_SPARK_VERSION', '3.4.0')

# Extra JVM packages passed to the PySpark launcher through the environment.
# The Kafka source artifact version now follows spark_version instead of a
# separately hard-coded literal, so the two cannot drift apart.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    "--packages "
    f"org.apache.spark:spark-sql-kafka-0-10_2.12:{spark_version},"
    "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1 "
    "pyspark-shell"
)

In [4]:
# The `!pwd` capture is a list of output lines; keep only the first line as a plain string.
# The type guard keeps this cell idempotent: re-running it on an already-unwrapped
# string would otherwise reduce the path to its first character.
work_dir = work_dir if isinstance(work_dir, str) else work_dir[0]

In [5]:
# MongoDB connection string without a database/collection suffix.
# NOTE(review): the fallback value embeds a username and password in the notebook.
# Export MONGO_URI to override it, and consider removing the fallback once every
# environment provides the variable.
mongo_conn = os.environ.get("MONGO_URI", "mongodb://show:gkdldhdl@mongodb:27017")

In [6]:
# Build (or reuse) the SparkSession.
# Both MongoDB connector URIs point at the seoulcity.test namespace and are derived
# from mongo_conn, so the connection string is defined in a single place instead of
# being repeated verbatim.
spark = (
    SparkSession
    .builder
    # Session time zone used by Spark SQL for timestamp handling.
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .config("spark.mongodb.input.uri", f"{mongo_conn}/seoulcity.test")
    .config("spark.mongodb.output.uri", f"{mongo_conn}/seoulcity.test")
    .getOrCreate()
)

In [7]:
# Configure the session so DataFrames are rendered as tables in the notebook.
for _option, _setting in (
    ("spark.sql.repl.eagerEval.enabled", True),   # display enabled
    ("spark.sql.repl.eagerEval.truncate", 100),   # display output columns size
):
    spark.conf.set(_option, _setting)


In [8]:
# Show the SparkSession object as this cell's output.
spark

In [9]:
# Kafka settings: broker address, consumer group id, and topic name.
kafka_config = dict([
    ("bootstrap.servers", "kafka:19092"),
    ("group.id", "seoulcity_to_mongo"),
    ("topic.name", "seoulcity"),
])

In [10]:
# Streaming source over the configured Kafka topic, reading from the earliest
# available offsets.
kafka_reader = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_config["bootstrap.servers"])
    # Kafka consumer properties are only forwarded when prefixed with "kafka.";
    # a bare "group.id" option is not applied by the source.
    .option("kafka.group.id", kafka_config["group.id"])
    .option("subscribe", kafka_config["topic.name"])
    .option("startingOffsets", "earliest")
    .load()
)

In [11]:
# Print the column layout of the raw Kafka stream.
kafka_reader.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



# Schema

In [12]:
# Schema used to parse the JSON message value.
# Every leaf field is a string. The nested sections are:
#   ROAD_TRAFFIC_STTS -> AVG_ROAD_DATA (struct) and ROAD_TRAFFIC_STTS (array of structs)
#   PRK_STTS          -> PRK_STTS (array of structs)
#   SBIKE_STTS        -> SBIKE_STTS (array of structs)
seoulcity_schema = StructType([
    StructField("AREA_NM", StringType()),
    StructField(
        "ROAD_TRAFFIC_STTS",
        StructType([
            StructField(
                "AVG_ROAD_DATA",
                StructType([
                    StructField(field_name, StringType())
                    for field_name in (
                        "ROAD_MSG",
                        "ROAD_TRAFFIC_IDX",
                        "ROAD_TRFFIC_TIME",
                        "ROAD_TRAFFIC_SPD",
                    )
                ]),
                nullable=False,
            ),
            StructField(
                "ROAD_TRAFFIC_STTS",
                ArrayType(StructType([
                    StructField(field_name, StringType())
                    for field_name in (
                        "LINK_ID",
                        "ROAD_NM",
                        "START_ND_CD",
                        "START_ND_NM",
                        "START_ND_XY",
                        "END_ND_CD",
                        "END_ND_NM",
                        "END_ND_XY",
                        "DIST",
                        "SPD",
                        "IDX",
                        "XYLIST",
                    )
                ])),
                nullable=False,
            ),
        ]),
        nullable=False,
    ),
    StructField(
        "PRK_STTS",
        StructType([
            StructField(
                "PRK_STTS",
                ArrayType(StructType([
                    StructField(field_name, StringType())
                    for field_name in (
                        "PRK_NM",
                        "PRK_CD",
                        "CPCTY",
                        "CUR_PRK_CNT",
                        "CUR_PRK_TIME",
                        "CUR_PRK_YN",
                        "PAY_YN",
                        "RATES",
                        "TIME_RATES",
                        "ADD_RATES",
                        "ADD_TIME_RATES",
                        "ADDRESS",
                        "ROAD_ADDR",
                        "LNG",
                        "LAT",
                    )
                ])),
                nullable=False,
            ),
        ]),
        nullable=False,
    ),
    StructField(
        "SBIKE_STTS",
        StructType([
            StructField(
                "SBIKE_STTS",
                ArrayType(StructType([
                    StructField(field_name, StringType())
                    for field_name in (
                        "SBIKE_SPOT_NM",
                        "SBIKE_SPOT_ID",
                        "SBIKE_SHARED",
                        "SBIKE_PARKING_CNT",
                        "SBIKE_RACK_CNT",
                        "SBIKE_X",
                        "SBIKE_Y",
                    )
                ])),
                nullable=False,
            ),
        ]),
        nullable=False,
    ),
])

In [13]:
# Parse each Kafka message value as JSON using seoulcity_schema, then lift the
# nested sections to top-level columns. The output `key` column is AREA_NM
# concatenated with AVG_ROAD_DATA.ROAD_TRFFIC_TIME.
# The raw Kafka key is no longer projected: it was cast to string in the first
# select but never referenced by the selectExpr that follows, so it never reached
# the output.
kafka_selector = (
    kafka_reader
    .select(
        from_json(col("value").cast("string"), seoulcity_schema).alias("seoulcitydata")
    )
    .selectExpr(
        "concat(seoulcitydata.AREA_NM,seoulcitydata.ROAD_TRAFFIC_STTS.AVG_ROAD_DATA.ROAD_TRFFIC_TIME) as key",
        "seoulcitydata.AREA_NM",
        "seoulcitydata.ROAD_TRAFFIC_STTS.AVG_ROAD_DATA",
        "seoulcitydata.ROAD_TRAFFIC_STTS.ROAD_TRAFFIC_STTS",
        "seoulcitydata.PRK_STTS.PRK_STTS",
        "seoulcitydata.SBIKE_STTS.SBIKE_STTS",
    )
)


In [14]:
def write_row(batch_df, batch_id):
    """Show one micro-batch in the notebook and append it to MongoDB.

    Intended to be passed to ``writeStream.foreachBatch``.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch; displayed above the data.
    """
    # The batch is consumed twice (rendered by display, then written), so cache it
    # to avoid recomputing it from the source for the second use.
    batch_df.persist()
    try:
        # Replace the previous batch's output instead of stacking outputs in the cell.
        clear_output(wait=True)
        display(batch_id, batch_df)
        # No target is passed here; the write relies on the session-level
        # spark.mongodb.output.uri configuration.
        batch_df.write.format("mongo").mode("append").save()
    finally:
        # Always release the cached batch, even if display or the write fails.
        batch_df.unpersist()

In [15]:
# .option("uri", "mongodb://show:gkdldhdl@mongodb:27017/")
#     .option("database","seoulcity")
#     .option("collection","test")

#     .option("spark.mongodb.output.uri", "mongodb://show:gkdldhdl@mongodb/seoulcity.test")

In [None]:
# Start the streaming query and block this cell until the query terminates.
# A fixed checkpoint location lets a restarted query resume from its recorded
# progress; without one, each restart re-reads the topic from "earliest" and
# appends the same documents to MongoDB again.
(
    kafka_selector.writeStream
    .option("checkpointLocation", os.path.join(work_dir, "checkpoints", "seoulcity_to_mongo"))
    .foreachBatch(write_row)
    .start()
    .awaitTermination()
)

35

key,AREA_NM,AVG_ROAD_DATA,ROAD_TRAFFIC_STTS,PRK_STTS,SBIKE_STTS
잠실한강공원2023-06-05 08:45,잠실한강공원,"{해당 장소로 이동·진입하는 도로가 크게 막히지 않아요., 원활, 2023-06-05 08:45, 40}","[{1230003900, 올림픽대로, 1230046600, 잠실여름파출소, 127.0810876316749329_37.5169833092338294, 1230047500, 잠...",,
