In [1]:
!pip install grpcio grpcio-status

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [39]:
from pyspark.sql.connect.session import SparkSession

spark = (
    SparkSession.builder
    .remote("sc://connect-server:15002")   # docker 네트워크 서비스명: spark
    .appName("connect-hello")
    .getOrCreate()
)

df = spark.range(0, 10).withColumnRenamed("id", "num")
df.show(1, False)
spark.sql("select current_date() as today, current_timestamp() as now").createOrReplaceTempView("current_time")

+---+
|num|
+---+
|0  |
+---+
only showing top 1 row



#### The IDENTIFER Clause

In [12]:
spark.sql("""
show tables
""").show(10, False)

+---------+------------+-----------+
|namespace|tableName   |isTemporary|
+---------+------------+-----------+
|         |current_time|true       |
+---------+------------+-----------+



In [31]:
df = spark.sql(
    "select * from IDENTIFIER(:tbl)",
    args = { "tbl": "current_time" }
)
df.printSchema()
df.show(1, False)

root
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)

+----------+--------------------------+
|today     |now                       |
+----------+--------------------------+
|2025-09-10|2025-09-10 14:46:46.838592|
+----------+--------------------------+



#### HyperLogLog Aggregation Function

In [29]:
spark.sql("""
SELECT hll_sketch_estimate(hll_sketch_agg(col))
FROM VALUES ("abc"), ("def"), ("abc"), ("ghi"), ("abc") tab(col)
""").show(1, False)

+--------------------------------------------+
|hll_sketch_estimate(hll_sketch_agg(col, 12))|
+--------------------------------------------+
|3                                           |
+--------------------------------------------+



#### New Functions for Manipulating Arrays

In [28]:
def query(sql):
    spark.sql(sql).show(1, False)


queries = [
    "SELECT array_append(array(1, 2, 3), 4)"
    # , "SELECT array_append(array(1, 2, 3), 'ABCD')" # 오류
    , "SELECT array_prepend(array(1, 2, 3), 4)"
    # , "SELECT array_insert(array(1, 2, 3), 0, 9)" # 오류
    , "SELECT array_insert(array(1, 2, 3), 1, 9)"
    , "SELECT array_compact(array(1, NULL, 3))"
]

for q in queries:
    query(q)

+-------------------------------+
|array_append(array(1, 2, 3), 4)|
+-------------------------------+
|[1, 2, 3, 4]                   |
+-------------------------------+

+--------------------------------+
|array_prepend(array(1, 2, 3), 4)|
+--------------------------------+
|[4, 1, 2, 3]                    |
+--------------------------------+

+----------------------------------+
|array_insert(array(1, 2, 3), 1, 9)|
+----------------------------------+
|[9, 1, 2, 3]                      |
+----------------------------------+

+--------------------------------+
|array_compact(array(1, NULL, 3))|
+--------------------------------+
|[1, 3]                          |
+--------------------------------+



### PySpark Features

#### 1. PySpark UDF 실행방식 변경
> spark-connect 사용 시에는 udf 통한 사용은 문제가 되는 것 같다

In [35]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def square(x):
    return x * x

square_udf = udf(square, IntegerType())
df = spark.sql("select 2 as value")

In [40]:
df.withColumn("square", square_udf(df["value"])).show()

RuntimeError: SparkContext or SparkSession should be created first.

In [3]:
import os
import pandas as pd

os.makedirs("/data/input", exist_ok=True)
pdf = pd.DataFrame({
    "user_id": [1,2,3,4,5,6,7,8,9,10],
    "age": [23,45,31,55,29,40,33,27,48,36],
    "city": ["Seoul","Busan","Seoul","Incheon","Daegu","Seoul","Busan","Daejeon","Gwangju","Seoul"],
    "amount": [100,200,150,300,120,180,220,80,90,260]
})
pdf.to_csv("/data/input/users.csv", index=False)

In [4]:
df = (
    spark.read
         .option("header", True)
         .option("inferSchema", True)
         .csv("/data/input/users.csv")
)

df.printSchema()
df.show(5)

# 필터 + 집계 + explain
res = (df
       .select("city", "amount")
       .where("amount >= 150")
       .groupBy("city")
       .sum("amount")
       .orderBy("sum(amount)", ascending=False))

print(res.explain())   # 실행 계획 확인 (필터/프로젝션 푸시다운, AQE 등 확인)
res.show()

root
 |-- user_id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- amount: integer (nullable = true)

+-------+---+-------+------+
|user_id|age|   city|amount|
+-------+---+-------+------+
|      1| 23|  Seoul|   100|
|      2| 45|  Busan|   200|
|      3| 31|  Seoul|   150|
|      4| 55|Incheon|   300|
|      5| 29|  Daegu|   120|
+-------+---+-------+------+
only showing top 5 rows

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [sum(amount)#174L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(sum(amount)#174L DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [plan_id=146]
      +- HashAggregate(keys=[city#167], functions=[sum(amount#168)])
         +- Exchange hashpartitioning(city#167, 200), ENSURE_REQUIREMENTS, [plan_id=143]
            +- HashAggregate(keys=[city#167], functions=[partial_sum(amount#168)])
               +- Filter (isnotnull(amount#168) AND (amount#168 >= 150))
                  +- FileSc

In [5]:
# 읽기(RO)
df = (
    spark.read.option("header", True).option("inferSchema", True)
    .csv("file:///data/input/users.csv")
)
df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- amount: integer (nullable = true)



In [6]:
# 처리 & 쓰기(RW, 서버만 씀)
out_path = "file:///data/output/users_by_city"
(
    df.groupBy("city").sum("amount")
   .write.mode("overwrite").parquet(out_path)
)
output = spark.read.parquet(out_path)
output.printSchema()

root
 |-- city: string (nullable = true)
 |-- sum(amount): long (nullable = true)



In [7]:
# 테이블로도 저장 (서버 전용 웨어하우스)
spark.sql("CREATE DATABASE IF NOT EXISTS demo")
(
    df.write.mode("overwrite").saveAsTable("demo.users")
)
spark.sql("SELECT city, AVG(amount) avg_amt FROM demo.users GROUP BY city").show()

+-------+-------+
|   city|avg_amt|
+-------+-------+
|Gwangju|   90.0|
|  Daegu|  120.0|
|Incheon|  300.0|
|  Busan|  210.0|
|Daejeon|   80.0|
|  Seoul|  172.5|
+-------+-------+

