In [1]:
# 코어 스파크 라이브러리를 임포트 합니다
import os

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON

# Create (or reuse) the Spark session for this notebook. The SQL session
# time zone is set to Asia/Seoul.
spark = (
    SparkSession
    .builder
    .appName("Data Engineer Training Course")
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .getOrCreate()
)

# Render DataFrames as tables in the notebook (REPL eager evaluation).
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # display enabled
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100) # display output columns size

# Shared data locations
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
# Resolve the working directory in pure Python rather than capturing `!pwd`:
# it needs no shell, yields a plain str directly, and also works when the
# notebook is exported to a regular .py script.
work_dir = os.getcwd()

# Tuning for a small local environment
spark.conf.set("spark.sql.shuffle.partitions", 5) # the number of partitions to use when shuffling data for joins or aggregations.
spark.conf.set("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")
spark

In [14]:
# Load the batch JSON logs. The JSON reader infers the schema by itself when
# none is supplied, so no "inferSchema" option is passed (that option belongs
# to the CSV reader and has no effect here).
streamLogs = (
    spark
    .read
    .json("/fluentd/target/lambda/batch")
    # Derived join key: id modulo 10, matched against the name table's uid.
    .withColumn("mod_id", expr("id % 10"))
)
streamLogs.printSchema()
streamLogs.show(truncate=False)

root
 |-- hello: string (nullable = true)
 |-- id: long (nullable = true)
 |-- time: string (nullable = true)
 |-- mod_id: long (nullable = true)

+---------+---+-------------------+------+
|hello    |id |time               |mod_id|
+---------+---+-------------------+------+
|ssm-seoul|100|2022-09-30 20:59:19|0     |
|ssm-seoul|101|2022-09-30 20:59:20|1     |
|ssm-seoul|102|2022-09-30 20:59:21|2     |
|ssm-seoul|103|2022-09-30 20:59:22|3     |
|ssm-seoul|104|2022-09-30 20:59:23|4     |
|ssm-seoul|105|2022-09-30 20:59:24|5     |
|ssm-seoul|106|2022-09-30 20:59:25|6     |
|ssm-seoul|107|2022-09-30 20:59:26|7     |
|ssm-seoul|108|2022-09-30 20:59:27|8     |
|ssm-seoul|109|2022-09-30 20:59:28|9     |
|ssm-seoul|110|2022-09-30 20:59:29|0     |
|ssm-seoul|111|2022-09-30 20:59:30|1     |
|ssm-seoul|112|2022-09-30 20:59:31|2     |
|ssm-seoul|113|2022-09-30 20:59:32|3     |
|ssm-seoul|114|2022-09-30 20:59:33|4     |
|ssm-seoul|115|2022-09-30 20:59:34|5     |
|ssm-seoul|116|2022-09-30 20:59:35|6

In [10]:
# Static lookup table (uid -> name) stored as JSON under the working directory.
namePath = f"{work_dir}/data/names"
# The JSON reader infers the schema on its own; the CSV-only "inferSchema"
# option is not needed and is therefore omitted.
nameStatic = (
    spark
    .read
    .json(namePath)
)
nameStatic.printSchema()
nameStatic.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- uid: long (nullable = true)

+-----+---+
|name |uid|
+-----+---+
|zero |0  |
|one  |1  |
|two  |2  |
|three|3  |
|four |4  |
|five |5  |
+-----+---+



In [15]:
# Left outer join: keep every log row and attach the matching name row (if
# any) where the log's mod_id equals the lookup table's uid.
expression = streamLogs["mod_id"] == nameStatic["uid"]
staticJoin = streamLogs.join(other=nameStatic, on=expression, how="leftOuter")
staticJoin.printSchema()
staticJoin.show(n=10, truncate=False)

root
 |-- hello: string (nullable = true)
 |-- id: long (nullable = true)
 |-- time: string (nullable = true)
 |-- mod_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- uid: long (nullable = true)

+---------+---+-------------------+------+-----+----+
|hello    |id |time               |mod_id|name |uid |
+---------+---+-------------------+------+-----+----+
|ssm-seoul|100|2022-09-30 20:59:19|0     |zero |0   |
|ssm-seoul|101|2022-09-30 20:59:20|1     |one  |1   |
|ssm-seoul|102|2022-09-30 20:59:21|2     |two  |2   |
|ssm-seoul|103|2022-09-30 20:59:22|3     |three|3   |
|ssm-seoul|104|2022-09-30 20:59:23|4     |four |4   |
|ssm-seoul|105|2022-09-30 20:59:24|5     |five |5   |
|ssm-seoul|106|2022-09-30 20:59:25|6     |null |null|
|ssm-seoul|107|2022-09-30 20:59:26|7     |null |null|
|ssm-seoul|108|2022-09-30 20:59:27|8     |null |null|
|ssm-seoul|109|2022-09-30 20:59:28|9     |null |null|
+---------+---+-------------------+------+-----+----+
only showing top 10 rows



In [25]:
# Shape the joined rows for output: discard the helper join key and expose
# id / name as user_id / user_name.
# Resulting columns: hello, user_id, time, user_name, uid

staticResult = (
    staticJoin
    .drop("mod_id")
    .withColumnRenamed("name", "user_name")
    .withColumnRenamed("id", "user_id")
)
staticResult.show(truncate=False)

+---------+-------+-------------------+---------+----+
|hello    |user_id|time               |user_name|uid |
+---------+-------+-------------------+---------+----+
|ssm-seoul|100    |2022-09-30 20:59:19|zero     |0   |
|ssm-seoul|101    |2022-09-30 20:59:20|one      |1   |
|ssm-seoul|102    |2022-09-30 20:59:21|two      |2   |
|ssm-seoul|103    |2022-09-30 20:59:22|three    |3   |
|ssm-seoul|104    |2022-09-30 20:59:23|four     |4   |
|ssm-seoul|105    |2022-09-30 20:59:24|five     |5   |
|ssm-seoul|106    |2022-09-30 20:59:25|null     |null|
|ssm-seoul|107    |2022-09-30 20:59:26|null     |null|
|ssm-seoul|108    |2022-09-30 20:59:27|null     |null|
|ssm-seoul|109    |2022-09-30 20:59:28|null     |null|
|ssm-seoul|110    |2022-09-30 20:59:29|zero     |0   |
|ssm-seoul|111    |2022-09-30 20:59:30|one      |1   |
|ssm-seoul|112    |2022-09-30 20:59:31|two      |2   |
|ssm-seoul|113    |2022-09-30 20:59:32|three    |3   |
|ssm-seoul|114    |2022-09-30 20:59:33|four     |4   |
|ssm-seoul

In [26]:
# Persist the result as JSON under the working directory, replacing any
# output left by a previous run.
(
    staticResult
    .write
    .mode("overwrite")
    .json(f"{work_dir}/tmp/output")
)