In [1]:
import shutil

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON

spark = (
    SparkSession
    .builder
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .getOrCreate()
)

# 노트북에서 테이블 형태로 데이터 프레임 출력을 위한 설정을 합니다
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # display enabled
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100) # display output columns size

# 공통 데이터 위치
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
work_dir=!pwd
work_dir = work_dir[0]

# 로컬 환경 최적화
spark.conf.set("spark.sql.shuffle.partitions", 5) # the number of partitions to use when shuffling data for joins or aggregations.
spark.conf.set("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")
spark

In [2]:
# Periodically query a streaming table and show the result in the output cell
def displayStream(name, sql, iterations, sleep_secs):
    """Run a Spark SQL statement repeatedly, refreshing the cell output each time.

    Args:
        name: Label shown in the header line of every refresh.
        sql: Spark SQL text executed through the notebook-level ``spark`` session.
        iterations: Number of refreshes to perform.
        sleep_secs: Seconds to wait after each refresh.
    """
    import time

    # Iteration numbers shown to the user start at 1
    for step, _ in enumerate(range(iterations), start=1):
        clear_output(wait=True)  # drop the previous refresh once new output arrives
        header = '[' + name + '] Iteration: ' + str(step) + ', Query: ' + sql
        display(header)
        result = spark.sql(sql)  # execute the Spark SQL statement
        display(result)
        time.sleep(sleep_secs)   # pause before the next refresh

# Periodically show the status of a streaming query in the output cell
def displayStatus(name, query, iterations, sleep_secs):
    """Show a streaming query's status message and last progress, refreshed repeatedly.

    Args:
        name: Label shown in the header line of every refresh.
        query: Streaming query object; its ``status['message']`` and
            ``lastProgress`` are read on every refresh.
        iterations: Number of refreshes to perform.
        sleep_secs: Seconds to wait after each refresh.
    """
    import time

    # Iteration numbers shown to the user start at 1
    for step, _ in enumerate(range(iterations), start=1):
        clear_output(wait=True)  # drop the previous refresh once new output arrives
        header = '[' + name + '] Iteration: ' + str(step) + ', Status: ' + query.status['message']
        display(header)
        display(query.lastProgress)  # progress report of the most recent batch
        time.sleep(sleep_secs)       # pause before the next refresh

In [3]:
# Batch-read the JSON files once; the schema Spark infers here is reused by the streaming reader.
# The JSON source infers the schema on its own, so no "inferSchema" option (a CSV option) is needed.
json_path = f"{work_data}/korean_movies"
korean_movies = spark.read.json(json_path)
display(korean_movies.limit(2))  # preview two rows
korean_movies.printSchema()

country,genre,grade,main_actor,movie,rate,time,timestamp,title,year
미국,드라마,12세 관람가,로빈 윌리엄스,10048,10,1132759560,2005-11-24 00:26:00,죽은 시인의 사회,2016
프랑스,드라마,청소년 관람불가,스티브 맥퀸,10046,10,1199927340,2008-01-10 10:09:00,빠삐용,2016


root
 |-- country: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- main_actor: string (nullable = true)
 |-- movie: long (nullable = true)
 |-- rate: long (nullable = true)
 |-- time: long (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [4]:
# Reuse the schema obtained from the batch read above for the streaming source
korean_movies_schema = korean_movies.schema

# Streaming JSON file source over the same directory, limited to one file per trigger
movie_reader = (
    spark.readStream
    .format("json")
    .option("maxFilesPerTrigger", 1)
    .schema(korean_movies_schema)
    .load(json_path)
)

#### 집계한 결과를 카프카로 전송하는 예제

In [8]:
# Streaming aggregation: number of rows read by movie_reader per value of "year"
movie_counter = movie_reader.groupBy("year").count()

In [30]:
# Publish the per-year counts to an in-memory table that can be queried from the notebook

table_name = "movie_counter"
movie_writer = (
    movie_counter.selectExpr("cast(year as string) as key", "to_json(struct(*)) as value")
    .writeStream
    .queryName(table_name)      # the SQL below queries the memory sink under this name
    .format("memory")
    .outputMode("complete")
)

# Start every run from a clean checkpoint directory.
# shutil.rmtree is used instead of an unquoted `rm -rf $path` so that a working
# directory containing spaces or shell metacharacters cannot delete the wrong path.
checkpointLocation = f"{work_dir}/tmp/{table_name}"
shutil.rmtree(checkpointLocation, ignore_errors=True)

movie_trigger = (
    movie_writer
    .trigger(processingTime="10 second")
    .option("checkpointLocation", checkpointLocation)
)

movie_query = movie_trigger.start()

# With PySpark the console sink's output does not land in this cell, so the result
# is read back from the memory table instead.
try:
    displayStream(table_name, f"select * from {table_name} order by key asc limit 10", 30, 10)
finally:
    # Always stop the query, even when the polling loop is interrupted from the
    # notebook (KeyboardInterrupt); otherwise it keeps running in the background.
    movie_query.stop()
    print("streaming ended")

'[movie_counter] Iteration: 1, Query: select * from movie_counter order by key asc limit 10'

key,value


KeyboardInterrupt: 

In [31]:
# Make sure the streaming query is stopped (e.g. after interrupting the polling cell above)
movie_query.stop()

#### 원본 데이터를 카프카로 다시 저장

In [10]:
# Forward the raw (non-aggregated) stream to Kafka.
# Author's note: for an aggregation to run in append mode, a watermark on a time
# column is required (e.g. by projecting `current_timestamp() as timestamp`).

table_name = "korean_movies"
movie_writer = (
    movie_reader.selectExpr("cast(year as string) as key", "to_json(struct(*)) as value")
    .writeStream
    .queryName(table_name)
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9093")
    .option("topic", table_name)   # the topic name is the same as the query name
    .outputMode("update")
)

# Start every run from a clean checkpoint directory.
# shutil.rmtree is used instead of an unquoted `rm -rf $path` so that a working
# directory containing spaces or shell metacharacters cannot delete the wrong path.
checkpointLocation = f"{work_dir}/tmp/{table_name}"
shutil.rmtree(checkpointLocation, ignore_errors=True)

movie_trigger = (
    movie_writer
    .trigger(processingTime="10 second")
    .option("checkpointLocation", checkpointLocation)
)

movie_query = movie_trigger.start()

try:
    displayStatus(table_name, movie_query, 360, 10)
finally:
    # Always stop the query, even when the status loop is interrupted from the
    # notebook (KeyboardInterrupt); otherwise it keeps writing to Kafka.
    movie_query.stop()
    print("streaming ended")

'[korean_movies] Iteration: 44, Status: Waiting for next trigger'

{'id': '03939fef-b752-4521-b3e5-89d75c37ef77',
 'runId': '7e59a015-0dd4-4c34-a81f-4f9c69125014',
 'name': 'korean_movies',
 'timestamp': '2022-07-17T06:39:50.000Z',
 'batchId': 30,
 'numInputRows': 0,
 'inputRowsPerSecond': 0.0,
 'processedRowsPerSecond': 0.0,
 'durationMs': {'latestOffset': 101, 'triggerExecution': 101},
 'stateOperators': [],
 'sources': [{'description': 'FileStreamSource[file:/home/jovyan/work/data/korean_movies]',
   'startOffset': {'logOffset': 29},
   'endOffset': {'logOffset': 29},
   'latestOffset': None,
   'numInputRows': 0,
   'inputRowsPerSecond': 0.0,
   'processedRowsPerSecond': 0.0}],
 'sink': {'description': 'org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@3568069c',
  'numOutputRows': 0}}

KeyboardInterrupt: 

In [11]:
# Make sure the streaming query is stopped (e.g. after interrupting the status cell above)
movie_query.stop()

#### 집계가 아니라 원본 데이터를 그대로 전송
```
root
 |-- country: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- main_actor: string (nullable = true)
 |-- movie: long (nullable = true)
 |-- rate: long (nullable = true)
 |-- time: long (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)
 ```

In [5]:
# Pass-through projection: every column of the stream, no aggregation.
# NOTE(review): this rebinds movie_counter, which an earlier cell bound to the per-year count.
movie_counter = movie_reader.select("*")
movie_counter.printSchema()

root
 |-- country: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- main_actor: string (nullable = true)
 |-- movie: long (nullable = true)
 |-- rate: long (nullable = true)
 |-- time: long (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [10]:
# Publish the raw rows to an in-memory table that can be queried from the notebook.
# A new table name is used so it does not collide with a table created earlier.

table_name = "movie_selector"
movie_writer = (
    movie_counter.selectExpr("cast(year as string) as key", "to_json(struct(*)) as value")
    .writeStream
    .queryName(table_name)      # the SQL below queries the memory sink under this name
    .format("memory")
    .outputMode("append")
)

# Start every run from a clean checkpoint directory.
# shutil.rmtree is used instead of an unquoted `rm -rf $path` so that a working
# directory containing spaces or shell metacharacters cannot delete the wrong path.
checkpointLocation = f"{work_dir}/tmp/{table_name}"
shutil.rmtree(checkpointLocation, ignore_errors=True)

movie_trigger = (
    movie_writer
    .trigger(processingTime="10 second")
    .option("checkpointLocation", checkpointLocation)
)

movie_query = movie_trigger.start()

# With PySpark the console sink's output does not land in this cell, so the result
# is read back from the memory table instead.
try:
    displayStream(table_name, f"select * from {table_name} limit 10", 30, 10)
finally:
    # Always stop the query, even when the polling loop is interrupted from the
    # notebook (KeyboardInterrupt); otherwise it keeps running in the background.
    movie_query.stop()
    print("streaming ended")

'[movie_selector] Iteration: 18, Query: select * from movie_selector limit 10'

key,value
2010,"{""country"":""미국"",""genre"":""드라마"",""grade"":""청소년 관람불가"",""main_actor"":""말론 브란도"",""movie"":10071,""rate"":10,""t..."
1981,"{""country"":""영국"",""genre"":""SF"",""grade"":""15세 관람가"",""main_actor"":""로저 무어"",""movie"":10166,""rate"":10,""time..."
1991,"{""country"":""미국"",""genre"":""판타지"",""grade"":""12세 관람가"",""main_actor"":""크리스티 스완슨"",""movie"":10270,""rate"":10,""..."
1979,"{""country"":""영국"",""genre"":""SF"",""grade"":""PG"",""main_actor"":""말론 브란도"",""movie"":10008,""rate"":10,""time"":10..."
2017,"{""country"":""미국"",""genre"":""멜로/로맨스"",""grade"":""전체 관람가"",""main_actor"":""줄리 앤드류스"",""movie"":10102,""rate"":10,..."
2019,"{""country"":""프랑스"",""genre"":""SF"",""grade"":""15세 관람가"",""main_actor"":""아놀드 슈왈제네거"",""movie"":10200,""rate"":10,..."
1988,"{""country"":""미국"",""genre"":""멜로/로맨스"",""grade"":""청소년 관람불가"",""main_actor"":""브룩 쉴즈"",""movie"":10751,""rate"":9,""..."
2019,"{""country"":""프랑스"",""genre"":""SF"",""grade"":""15세 관람가"",""main_actor"":""아놀드 슈왈제네거"",""movie"":10200,""rate"":10,..."
1986,"{""country"":""미국"",""genre"":""SF"",""grade"":""15세 관람가"",""main_actor"":""시고니 위버"",""movie"":10038,""rate"":10,""tim..."
2010,"{""country"":""미국"",""genre"":""드라마"",""grade"":""청소년 관람불가"",""main_actor"":""알 파치노"",""movie"":10072,""rate"":10,""ti..."


KeyboardInterrupt: 

In [11]:
# Make sure the streaming query is stopped (e.g. after interrupting the polling cell above)
movie_query.stop()

In [12]:
# Write the stream back to Kafka.
# Author's note: for an aggregation to run in append mode, a watermark on a time
# column is required.

table_name = "korean_movies"
movie_writer = (
    movie_counter.selectExpr("cast(year as string) as key", "to_json(struct(*)) as value")
    .writeStream
    .queryName(table_name)
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9093")
    .option("topic", table_name)   # the topic name is the same as the query name
    .outputMode("append")
)

# Start every run from a clean checkpoint directory.
# shutil.rmtree is used instead of an unquoted `rm -rf $path` so that a working
# directory containing spaces or shell metacharacters cannot delete the wrong path.
checkpointLocation = f"{work_dir}/tmp/{table_name}"
shutil.rmtree(checkpointLocation, ignore_errors=True)

movie_trigger = (
    movie_writer
    .trigger(processingTime="10 second")
    .option("checkpointLocation", checkpointLocation)
)

movie_query = movie_trigger.start()

try:
    displayStatus(table_name, movie_query, 30, 10)
finally:
    # Always stop the query, even when the status loop is interrupted from the
    # notebook (KeyboardInterrupt); otherwise it keeps writing to Kafka.
    movie_query.stop()
    print("streaming ended")

'[korean_movies] Iteration: 30, Status: Waiting for next trigger'

{'id': '1c8febae-b541-4412-84d6-59a25c2c3313',
 'runId': 'a4272295-103b-4ab2-bd6d-ad94028b748a',
 'name': 'korean_movies',
 'timestamp': '2022-07-16T14:08:10.000Z',
 'batchId': 29,
 'numInputRows': 4690,
 'inputRowsPerSecond': 469.04690469046903,
 'processedRowsPerSecond': 8933.333333333332,
 'durationMs': {'addBatch': 215,
  'getBatch': 6,
  'latestOffset': 105,
  'queryPlanning': 9,
  'triggerExecution': 525,
  'walCommit': 67},
 'stateOperators': [],
 'sources': [{'description': 'FileStreamSource[file:/home/jovyan/work/data/korean_movies]',
   'startOffset': {'logOffset': 28},
   'endOffset': {'logOffset': 29},
   'latestOffset': None,
   'numInputRows': 4690,
   'inputRowsPerSecond': 469.04690469046903,
   'processedRowsPerSecond': 8933.333333333332}],
 'sink': {'description': 'org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@1ddef851',
  'numOutputRows': 4690}}

streaming ended
