In [1]:
import os
import shutil

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON

# Create (or reuse) the Spark session for this notebook, pinning the SQL
# session time zone to Asia/Seoul.
spark = (
    SparkSession
    .builder
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .getOrCreate()
)

# Render DataFrames as tables when they are the last expression of a cell.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)  # turn on eager (rich) display
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100)  # max characters shown per cell value

# Common data locations.
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
# Working directory of the kernel process. Resolved in pure Python rather than
# through a `!pwd` shell escape, so the value is a plain `str` from the start
# and the cell does not depend on IPython shell magics.
work_dir = os.getcwd()

# Tuning for a small local environment.
spark.conf.set("spark.sql.shuffle.partitions", 5)  # number of partitions used when shuffling for joins/aggregations
spark.conf.set("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")

# Display the session summary so the port of the running Spark application
# can be checked.
spark

21/10/26 13:55:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/26 13:55:06 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Stream-static join demo: join a streaming JSON log source against a static
# user table and print the result to the console for a few seconds.
queryName = "staticJoin"
checkpointLocation = f"{work_dir}/tmp/{queryName}"
# Start from a clean checkpoint on every run. `shutil.rmtree` is used instead
# of `!rm -rf $checkpointLocation` so that a path containing spaces or shell
# metacharacters cannot be split/expanded into unintended delete targets.
# `ignore_errors=True` keeps the "missing directory is fine" behavior of `rm -rf`.
shutil.rmtree(checkpointLocation, ignore_errors=True)

# Static side of the join.
userPath = f"{work_dir}/data/user-static"
# NOTE (original author's observation): adding a filter here, e.g.
# .where(expr("age > 10")), was seen to make the static-table join stop working.
userStatic = spark.read.json(userPath)
userStatic.printSchema()
userStatic.show()

# Batch read of the log directory, used only to preview schema and contents.
# Kept under its own name so `logsStream` always refers to the streaming frame.
logsPath = f"{work_dir}/data/tumbling-stream"
logsPreview = spark.read.json(logsPath)
logsPreview.printSchema()
logsPreview.show()

# The streaming reader is given this schema explicitly.
logsSchema = (
    StructType()
    .add(StructField("emp_id", LongType()))
    .add(StructField("emp_name", StringType()))
    .add(StructField("time", StringType()))
    .add(StructField("timestamp", LongType()))
)
logsStream = spark.readStream.format("json").schema(logsSchema).load(logsPath)

# Left outer join: every streamed log row is kept, enriched with user columns
# when emp_id matches a user_id.
joinExpression = (logsStream.emp_id == userStatic.user_id)
staticSelector = logsStream.join(userStatic, joinExpression, "leftOuter")
staticWriter = staticSelector.writeStream.format("console").outputMode("append")
staticTrigger = staticWriter.trigger(processingTime="5 second").option("checkpointLocation", checkpointLocation)

staticQuery = staticTrigger.start()
try:
    # Let the query run for up to 10 seconds.
    staticQuery.awaitTermination(10)
finally:
    # Always stop the query, even if awaitTermination raised, so that no
    # streaming query is left running in the kernel.
    staticQuery.stop()

                                                                                

root
 |-- age: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- reg_dt: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- user_name: string (nullable = true)

+---+------+----------------------+-------+----------+
|age|gender|                reg_dt|user_id| user_name|
+---+------+----------------------+-------+----------+
| 10|  남성|2014. 1. 10. 오후 1...|      1|    김엘지|
| 11|  여성|2015. 3. 2. 오후 12...|      2|    박전자|
| 12|  남성|2018. 7. 30. 오전 0...|      3|  이데이터|
| 10|  여성|2013. 3. 18. 오후 1...|      4|홍엔지니어|
| 20|  남성|2014. 1. 10. 오후 1...|      1|    김엘지|
| 21|  여성|2015. 3. 2. 오후 12...|      2|    박전자|
| 22|  남성|2018. 7. 30. 오전 0...|      3|  이데이터|
| 20|  여성|2013. 3. 18. 오후 1...|      4|홍엔지니어|
+---+------+----------------------+-------+----------+

root
 |-- emp_id: long (nullable = true)
 |-- emp_name: string (nullable = true)
 |-- time: string (nullable = true)
 |-- timestamp: long (nullable = true)

+------+----------------+----------------------+---