In [1]:
import os
import shutil

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON

# Create (or reuse) the Spark session with the SQL session time zone set to Asia/Seoul.
spark = (
    SparkSession
    .builder
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .getOrCreate()
)

# Settings for rendering DataFrames as tables in the notebook.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # display enabled
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100) # display output columns size

# Shared data locations.
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
# Working directory of the kernel process, resolved in pure Python rather than
# by shelling out with `!pwd`, so work_dir is bound to a plain str exactly once.
work_dir = os.getcwd()

# Tuning for a local environment.
spark.conf.set("spark.sql.shuffle.partitions", 5) # the number of partitions to use when shuffling data for joins or aggregations.
spark.conf.set("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")

# Display the session info so the port of the running Spark application can be checked.
spark

In [2]:
# Periodically query a stream table (name: label, sql: Spark SQL, iterations: repeat count, sleep_secs: interval)
def displayStream(name, sql, iterations, sleep_secs):
    """Repeatedly run a Spark SQL statement and display its result.

    Each iteration clears the output cell, displays a header line containing
    the label, the 1-based iteration number and the SQL text, displays the
    result of ``spark.sql(sql)``, and then sleeps.

    Args:
        name: Label shown in the header line; formatted with ``str()`` rules,
            so it does not have to be a ``str``.
        sql: Spark SQL text passed to ``spark.sql`` on every iteration.
        iterations: Number of refreshes to perform; ``0`` displays nothing.
        sleep_secs: Seconds to sleep after each refresh.
    """
    from time import sleep

    # range(1, n + 1) yields the 1-based iteration number directly, replacing
    # the hand-maintained counter and the unused loop variable.
    for iteration in range(1, iterations + 1):
        clear_output(wait=True)   # clear the output cell
        display(f"[{name}] Iteration: {iteration}, Query: {sql}")
        display(spark.sql(sql))   # run the Spark SQL statement
        sleep(sleep_secs)         # wait sleep_secs seconds

# Periodically show a streaming query's status (name: label, query: Streaming Query, iterations: repeat count, sleep_secs: interval)
def displayStatus(name, query, iterations, sleep_secs):
    """Repeatedly display a streaming query's status message and last progress.

    Each iteration clears the output cell, displays a header line containing
    the label, the 1-based iteration number and ``query.status['message']``,
    displays ``query.lastProgress``, and then sleeps.

    Args:
        name: Label shown in the header line; formatted with ``str()`` rules,
            so it does not have to be a ``str``.
        query: Object exposing ``status`` (indexable by ``'message'``) and
            ``lastProgress``.
        iterations: Number of refreshes to perform; ``0`` displays nothing.
        sleep_secs: Seconds to sleep after each refresh.
    """
    from time import sleep

    # range(1, n + 1) yields the 1-based iteration number directly, replacing
    # the hand-maintained counter and the unused loop variable.
    for iteration in range(1, iterations + 1):
        clear_output(wait=True)      # clear the output cell
        status_message = query.status['message']
        display(f"[{name}] Iteration: {iteration}, Status: {status_message}")
        display(query.lastProgress)  # show the query's last progress
        sleep(sleep_secs)            # wait the given number of seconds

In [3]:
# Streaming source: the "movies" topic on the kafka:9093 broker, with
# startingOffsets set to "earliest".
kafkaReader = (
    spark.readStream
    .format("kafka")
    .option("startingOffsets", "earliest")
    .option("subscribe", "movies")
    .option("kafka.bootstrap.servers", "kafka:9093")
    .load()
)
# Print the columns exposed by the source.
kafkaReader.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [5]:
# Schema used to parse the JSON text carried in each message value.
kafkaSchema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("movie", StringType()),
        ("title", StringType()),
        ("title_eng", StringType()),
        ("year", IntegerType()),
        ("grade", StringType()),
        ("timestamp", StringType()),
    )
])

# Parse the value column as JSON into a "movies" struct, then emit the struct's
# `movie` field as the key and the struct re-serialised to JSON as the value.
kafkaSelector = (
    kafkaReader
    .select(
        col("key").cast("string"),
        from_json(col("value").cast("string"), kafkaSchema).alias("movies"),
    )
    # The output `key` comes from movies.movie, not from the key selected above.
    .selectExpr("movies.movie as key", "to_json(struct(movies.*)) as value")
)

kafkaSelector.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [6]:
# Name given to the streaming query; the same name is used as the table name
# when the results are queried with Spark SQL.
queryName = "consoleSink"

# Append-mode stream writer that sends the key/value columns to the memory sink.
kafkaWriter = (
    kafkaSelector
    .select("key", "value")
    .writeStream
    .format("memory")
    .outputMode("append")
    .queryName(queryName)
)

In [7]:
# Checkpoint directory for this query, under the notebook's working directory.
checkpointLocation = f"{work_dir}/tmp/{queryName}"
# Remove any existing checkpoint directory. shutil.rmtree receives the path as
# a single argument, so a path containing spaces or shell metacharacters cannot
# be split or expanded the way it could with an unquoted `!rm -rf $var`.
# ignore_errors=True keeps the "missing path is not an error" behaviour of rm -f.
shutil.rmtree(checkpointLocation, ignore_errors=True)

In [8]:
# Processing-time trigger with a 5 second interval, checkpointing to checkpointLocation.
kafkaTrigger = (
    kafkaWriter
    .trigger(processingTime="5 second")
    .option("checkpointLocation", checkpointLocation)
)

# In Python, console debugging output goes to the notebook's standard output,
# so the results are queried from a separate memory table instead.
kafkaQuery = kafkaTrigger.start()
try:
    displayStream(queryName, f"select * from {queryName} order by key desc", 3, 3)
finally:
    # Stop the query even when polling raises or the cell is interrupted;
    # otherwise the started query would be left running in the session.
    kafkaQuery.stop()

'[consoleSink] Iteration: 3, Query: select * from consoleSink order by key desc'

key,value
10099,"{""movie"":""10099"",""title"":""남과 여"",""title_eng"":""Un Homme Et Une Femme , A Man And A Woman , 1966"",""y..."
10098,"{""movie"":""10098"",""title"":""자유의 댄스"",""title_eng"":""Footloose , 1984"",""year"":0,""grade"":""PG"",""timestamp..."
10097,"{""movie"":""10097"",""title"":""굿모닝 베트남"",""title_eng"":""Good Morning, Vietnam , 1987"",""year"":0,""grade"":""1..."
10096,"{""movie"":""10096"",""title"":""고스트버스터즈 2"",""title_eng"":""Ghostbusters II , 1989"",""year"":1990,""grade"":""PG..."
10095,"{""movie"":""10095"",""title"":""고스트버스터즈"",""title_eng"":""Ghostbusters , 1984"",""year"":1984,""grade"":""12세 관람가..."
10094,"{""movie"":""10094"",""title"":""마이 웨이"",""title_eng"":""The Winners , My Way , 1975"",""year"":1979,""grade"":""N..."
10093,"{""movie"":""10093"",""title"":""러브 스토리"",""title_eng"":""Love Story , 1970"",""year"":1971,""grade"":""15세 관람가"",""..."
10092,"{""movie"":""10092"",""title"":""네버엔딩 스토리"",""title_eng"":""The NeverEnding Story , 1984"",""year"":1988,""grade..."
10091,"{""movie"":""10091"",""title"":""네 멋대로 해라"",""title_eng"":""A Bout De Souffle , Breathless , 1959"",""year"":0,..."
10090,"{""movie"":""10090"",""title"":""챔프"",""title_eng"":""The Champ , 1979"",""year"":1979,""grade"":""NR"",""timestamp""..."
