In [1]:
# 여기에 실습 코드를 작성하고 실행하세요 (Shift+Enter)
# 코어 스파크 라이브러리를 임포트 합니다
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = (
    SparkSession
    .builder
    .appName("Data Engineer Training Course")
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .getOrCreate()
)

# 노트북에서 테이블 형태로 데이터 프레임 출력을 위한 설정을 합니다
from IPython.display import display, display_pretty, clear_output, JSON
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # display enabled
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100) # display output columns size
spark

In [4]:
df = spark.range(0, 10).withColumnRenamed("id", "num")
df.show(1, False)
spark.sql("select current_date() as today, current_timestamp() as now").createOrReplaceTempView("current_time")

+---+
|num|
+---+
|0  |
+---+
only showing top 1 row



In [5]:
spark.sql("""
show tables
""").show(10, False)

+---------+------------+-----------+
|namespace|tableName   |isTemporary|
+---------+------------+-----------+
|         |current_time|true       |
+---------+------------+-----------+



#### The IDENTIFER Clause

In [6]:
df = spark.sql(
    "select * from IDENTIFIER(:tbl)",
    args = { "tbl": "current_time" }
)
df.printSchema()
df.show(1, False)

root
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)

+----------+--------------------------+
|today     |now                       |
+----------+--------------------------+
|2025-09-11|2025-09-11 22:00:41.742998|
+----------+--------------------------+



#### HyperLogLog Aggregation Function

In [7]:
spark.sql("""
SELECT hll_sketch_estimate(hll_sketch_agg(col))
FROM VALUES ("abc"), ("def"), ("abc"), ("ghi"), ("abc") tab(col)
""").show(1, False)

+--------------------------------------------+
|hll_sketch_estimate(hll_sketch_agg(col, 12))|
+--------------------------------------------+
|3                                           |
+--------------------------------------------+



#### New Functions for Manipulating Arrays

In [8]:
def query(sql):
    spark.sql(sql).show(1, False)


queries = [
    "SELECT array_append(array(1, 2, 3), 4)"
    # , "SELECT array_append(array(1, 2, 3), 'ABCD')" # 오류
    , "SELECT array_prepend(array(1, 2, 3), 4)"
    # , "SELECT array_insert(array(1, 2, 3), 0, 9)" # 오류
    , "SELECT array_insert(array(1, 2, 3), 1, 9)"
    , "SELECT array_compact(array(1, NULL, 3))"
]

for q in queries:
    query(q)

+-------------------------------+
|array_append(array(1, 2, 3), 4)|
+-------------------------------+
|[1, 2, 3, 4]                   |
+-------------------------------+

+--------------------------------+
|array_prepend(array(1, 2, 3), 4)|
+--------------------------------+
|[4, 1, 2, 3]                    |
+--------------------------------+

+----------------------------------+
|array_insert(array(1, 2, 3), 1, 9)|
+----------------------------------+
|[9, 1, 2, 3]                      |
+----------------------------------+

+--------------------------------+
|array_compact(array(1, NULL, 3))|
+--------------------------------+
|[1, 3]                          |
+--------------------------------+



### PySpark Features
#### 1. PySpark Existing UDF

In [2]:
df = spark.sql("select 2 as value")

In [3]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def square(x):
    return x * x

square_udf = udf(square, IntegerType())
df.withColumn("square", square(df["value"])).show(1, False)

+-----+------+
|value|square|
+-----+------+
|2    |4     |
+-----+------+



#### 2. PySpark New UDF

In [5]:
import pandas as pd

@pandas_udf(IntegerType())
def square_pandas_udf(x: pd.Series) -> pd.Series:
    return x * x

df.withColumn("square", square_pandas_udf(df["value"])).show(1, False)

+-----+------+
|value|square|
+-----+------+
|2    |4     |
+-----+------+



In [22]:
from pyspark.sql import SparkSession, functions as F


data = [
    (1, ["spark", "delta", "iceberg"]),
    (2, ["kafka", "hudi"]),
    (3, []),                  # 빈 배열
    (4, None)                 # null
]
df = spark.createDataFrame(data, "id INT, tags ARRAY<STRING>")

# explode: 배열 → 행으로 확장
out_explode = (df
    .select("id", F.explode("tags").alias("tag"))   # null이나 빈 배열은 행이 안 나옵니다.
)

out_explode.show(truncate=False)
# +---+-------+
# |id |tag    |
# +---+-------+
# |1  |spark  |
# |1  |delta  |
# |1  |iceberg|
# |2  |kafka  |
# |2  |hudi   |
# +---+-------+

# 예: tag 길이 집계
(out_explode
 .withColumn("len", F.length("tag"))
 .groupBy("id")
 .agg(F.collect_list(F.struct("tag","len")).alias("tags_with_len"))
 .show(truncate=False))


+---+-------+
|id |tag    |
+---+-------+
|1  |spark  |
|1  |delta  |
|1  |iceberg|
|2  |kafka  |
|2  |hudi   |
+---+-------+

+---+--------------------------------------+
|id |tags_with_len                         |
+---+--------------------------------------+
|1  |[{spark, 5}, {delta, 5}, {iceberg, 7}]|
|2  |[{kafka, 5}, {hudi, 4}]               |
+---+--------------------------------------+



In [34]:
from pyspark.sql.functions import udtf

@udtf(returnType="tag STRING")
class EmitTags:
    def eval(self, tags: list[str] | None):
        if not tags:    # None / [] 처리
            return
        for t in tags:
            yield (t,)  # 스키마와 동일한 튜플


In [36]:
@udtf(returnType="tag STRING, len INT")
class EmitTagsWithLen:
    def eval(self, tags: list[str] | None):
        if not tags:
            return
        for t in tags:
            yield (t, len(t))


In [39]:
df.printSchema()
df.show(truncate=False)

root
 |-- id: integer (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)

+---+-----------------------+
|id |tags                   |
+---+-----------------------+
|1  |[spark, delta, iceberg]|
|2  |[kafka, hudi]          |
|3  |[]                     |
|4  |NULL                   |
+---+-----------------------+



In [61]:
from pyspark.sql.functions import udtf

@udtf(returnType="tag STRING")
class EmitTags:
    def eval(self, tags: list[str] | None):
        if not tags:
            return
        for t in tags:
            yield (t,)

@udtf(returnType="tag STRING, len INT")
class EmitTagsWithLen:
    def eval(self, tags: list[str] | None):
        if not tags:
            return
        for t in tags:
            yield (t, len(t))

print("current DB:", spark.catalog.currentDatabase())

# 등록
spark.udtf.register("default.emit_tags", EmitTags)
spark.udtf.register("default.emit_tags_with_len", EmitTagsWithLen)

df.createOrReplaceTempView("src")

current DB: default


In [87]:

spark.udtf.register("spark_catalog.default.emit_tags", EmitTags)
spark.udtf.register("spark_catalog.default.emit_tags_with_len", EmitTagsWithLen)

spark.sql("""
SELECT current_catalog(), current_schema()
""").show(100, False)

+-----------------+------------------+
|current_catalog()|current_database()|
+-----------------+------------------+
|spark_catalog    |default           |
+-----------------+------------------+



In [88]:
spark.sql("show functions").where(expr("function like '%emit%'")).show(10, False)

+----------------------------------------+
|function                                |
+----------------------------------------+
|emit_tags                               |
|emit_tags_with_len                      |
|spark_catalog.default.emit_tags         |
|spark_catalog.default.emit_tags         |
|spark_catalog.default.emit_tags_with_len|
|spark_catalog.default.emit_tags_with_len|
+----------------------------------------+



In [110]:
spark.sql("""SELECT * FROM src s LATERAL VIEW emit_tags(s.tags) t AS tag """)

AnalysisException: [ROUTINE_NOT_FOUND] The function `default`.`emit_tags` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema and catalog, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP FUNCTION IF EXISTS.; line 1 pos 20

In [None]:
# 스파크 3.5.x 환경에서 정상동작하지 않음

spark.sql("USE spark_catalog")

spark.sql("USE default")

print("spark.version =", spark.version)
spark.sql("SELECT current_catalog() AS cat, current_schema() AS sch").show(truncate=False)

spark.sql("SHOW FUNCTIONS IN default LIKE 'emit_tags*'").show(truncate=False)

spark.sql("""SELECT s.id, t.tag
FROM src s
LATERAL VIEW `spark_catalog`.default.emit_tags(s.tags) t AS tag""").show(truncate=False)

In [93]:
class ReadFromConfigFile:
    @staticmethod
    def analyze(filename: AnalyzeArgument):
        with open(os.path.join(
            SparkFiles.getRootDirectory(),
            filename.value), "r") as f:
            return AnalyzeResult(from_file(f.read()))

NameError: name 'AnalyzeArgument' is not defined