## 5-3. Liquid Clustering


In [1]:
import findspark
findspark.init()

import os
print(os.environ['JAVA_HOME'])
print(os.environ['SPARK_HOME'])

/usr/lib/jvm/java-11-openjdk-amd64
/usr/local/spark


In [2]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON

from delta import *

# 공통 데이터 위치
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
work_dir=!pwd
work_dir = work_dir[0]
warehouse_dir = f"{work_dir}/spark-warehouse"

# Create spark session with hive enabled
builder = (
    SparkSession
    .builder
    .appName("pyspark-notebook")
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.sql.warehouse.dir", warehouse_dir)
    .enableHiveSupport()
)

In [3]:
# 델타 레이크 생성시에 반드시 `configure_spark_with_delta_pip` 구성을 통해 실행되어야 정상적인 델타 의존성이 로딩됩니다
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [4]:
# 노트북에서 테이블 형태로 데이터 프레임 출력을 위한 설정을 합니다
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # display enabled
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100) # display output columns size

# 로컬 환경 최적화
spark.conf.set("spark.sql.shuffle.partitions", 5) # the number of partitions to use when shuffling data for joins or aggregations.
spark.conf.set("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")
spark.conf.set("spark.sql.decimalOperations.allowPrecisionLoss", "true")
spark

In [5]:
def show(queries, num_rows = 20):
    for query in queries.split(";"):
        spark.sql(query).show(num_rows, truncate=False)

def sql(query):
    return spark.sql(query)

def history(dbName, tableName):
    return spark.sql("describe history {}.{}".format(dbName, tableName))

def table(dbName, tableName):
    return spark.read.format("delta").table("{}.{}".format(dbName, tableName))

def describe(dbName, tableName, extended = True, num_rows = 20):
    if extended:
        show("describe extended {}.{}".format(dbName, tableName), num_rows)
    else:
        show("describe {}.{}".format(dbName, tableName), num_rows)

def ls(target):
    !ls -al {target}

def ls_and_head(target, lineno):
    !ls -al {target} | grep -v 'crc' | head -{lineno}

def cat(filename):
    !cat {filename}

def grep(keyword, filename):
    !grep -i {keyword} {filename}

def grep_and_json(keyword, filename):
    !grep {keyword} {filename} | python -m json.tool

def grep_sed_json(keyword, lineno, filename):
    !grep {keyword} {filename} | sed -n {lineno}p | python -m json.tool


In [6]:
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType

def dropAndRemoveTable(dbName, tableName):
    location="/home/jovyan/work/spark-warehouse/{}".format(tableName)
    !rm -rf {location}
    sql("DROP TABLE IF EXISTS {}.{}".format(dbName, tableName))

### Q1. 파티션 없는 users (id, firstName, lastName) 테이블의 에서 (middleName) 컬럼을 추가한 상태에서 저장 시에 스키마는 어떻게 되는가?
```python
AnalysisException: A schema mismatch detected when writing to the Delta table (Table ID: 76246dde-b128-43df-8fca-605c2dbef282).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.
```

In [32]:
dropAndRemoveTable(dbName, tableName)

schema_v1 = StructType([
    StructField("id", IntegerType(), True),
    StructField("firstName", StringType(), True),
    StructField("lastName", StringType(), True)
])
rows_v1 = []
rows_v1.append(Row(1, "suhyuk", "park"))
rows_v1.append(Row(2, "youngmi", "kim"))

df_v1 = spark.createDataFrame(rows_v1, schema_v1)
df_v1.write.format("delta").mode("overwrite").saveAsTable("{}.{}".format(dbName, tableName))

sql("show tables")
show("describe extended {}.{}".format(dbName, tableName), 100)

+----------------------------+----------------------------------------------------------------+-------+
|col_name                    |data_type                                                       |comment|
+----------------------------+----------------------------------------------------------------+-------+
|id                          |int                                                             |       |
|firstName                   |string                                                          |       |
|lastName                    |string                                                          |       |
|                            |                                                                |       |
|# Partitioning              |                                                                |       |
|Not partitioned             |                                                                |       |
|                            |                                  

In [29]:
schema_v2 = StructType([
    StructField("id", IntegerType(), True),
    StructField("firstName", StringType(), True),
    StructField("middleName", StringType(), True),
    StructField("lastName", StringType(), True)
])
rows_v2 = []
rows_v2.append(Row(3, "sowon", "eva", "park"))
df_v2 = spark.createDataFrame(rows_v2, schema_v2)
df_v2.write.format("delta").mode("append").saveAsTable("{}.{}".format(dbName, tableName))

AnalysisException: A schema mismatch detected when writing to the Delta table (Table ID: 76246dde-b128-43df-8fca-605c2dbef282).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- id: integer (nullable = true)
-- firstName: string (nullable = true)
-- lastName: string (nullable = true)


Data schema:
root
-- id: integer (nullable = true)
-- firstName: string (nullable = true)
-- middleName: string (nullable = true)
-- lastName: string (nullable = true)

         

### Q2. `mergeSchema` 옵션을 주고 저장하면 어떻게 되는가?
> mergeSchema : 스키마가 추가되지만 기존 컬럼의 가장 마지막에 컬럼이 추가된다

In [30]:
df_v2.write.format("delta").mode("append").option("mergeSchema", True).saveAsTable("{}.{}".format(dbName, tableName))

sql("show tables")
show("describe extended {}.{}".format(dbName, tableName), 100)

+----------------------------+----------------------------------------------------------------+-------+
|col_name                    |data_type                                                       |comment|
+----------------------------+----------------------------------------------------------------+-------+
|id                          |int                                                             |       |
|firstName                   |string                                                          |       |
|lastName                    |string                                                          |       |
|middleName                  |string                                                          |       |
|                            |                                                                |       |
|# Partitioning              |                                                                |       |
|Not partitioned             |                                  

### Q3. `overwriteSchema` 옵션을 주고 저장하면 어떻게 되는가?
> overwriteSchema : 덮어쓰기 모드(overwrite)에서만 사용할 수 있으며 기존 데이터가 모두 사라짐에 주의해야 한다

In [33]:
sql("select * from {}.{}".format(dbName, tableName))

id,firstName,lastName
1,suhyuk,park
2,youngmi,kim


In [35]:
# Q1 예제를 다시 실행하고

schema_v3 = StructType([
    StructField("id", IntegerType(), True),
    StructField("firstName", StringType(), True),
    StructField("middleName", StringType(), True),
    StructField("lastName", StringType(), True)
])
rows_v3 = []
rows_v3.append(Row(3, "sowon", "eva", "park"))
rows_v3.append(Row(4, "sihun", "sean", "park"))
df_v3 = spark.createDataFrame(rows_v3, schema_v3)
df_v3.write.format("delta").mode("overwrite").option("overwriteSchema", True).saveAsTable("{}.{}".format(dbName, tableName))

sql("show tables")
show("describe extended {}.{}".format(dbName, tableName), 100)

+----------------------------+----------------------------------------------------------------+-------+
|col_name                    |data_type                                                       |comment|
+----------------------------+----------------------------------------------------------------+-------+
|id                          |int                                                             |       |
|firstName                   |string                                                          |       |
|middleName                  |string                                                          |       |
|lastName                    |string                                                          |       |
|                            |                                                                |       |
|# Partitioning              |                                                                |       |
|Not partitioned             |                                  

In [36]:
sql("select * from {}.{}".format(dbName, tableName))

id,firstName,middleName,lastName
4,sihun,sean,park
3,sowon,eva,park


### Q4. 파티션이 존재하는 상태에서 users 테이블에 다른 파티션에만 middleName 컬럼이 추가되는 경우?
> 어차피 델타 테이블의 경우 파티션 단위로 파일이 저장될 뿐, 특정 경로를 읽어내는 경우는 없으며, 스키마 또한 통합되어 관리되기 때문에 overwrite 혹은 merge 둘 중에 하나만 고민하면 된다

In [37]:
dropAndRemoveTable(dbName, tableName)

schema_v4 = StructType([
    StructField("id", IntegerType(), True),
    StructField("firstName", StringType(), True),
    StructField("lastName", StringType(), True)
])
rows_v4 = []
rows_v4.append(Row(1, "suhyuk", "park"))
rows_v4.append(Row(3, "sowon", "park"))
rows_v4.append(Row(4, "sean", "park"))

df_v4 = spark.createDataFrame(rows_v1, schema_v1)
df_v4.write.format("delta").mode("overwrite").partitionBy("lastName").saveAsTable("{}.{}".format(dbName, tableName))

show("describe extended {}.{}".format(dbName, tableName), 100)
sql("select * from {}.{}".format(dbName, tableName))

+----------------------------+----------------------------------------------------------------+-------+
|col_name                    |data_type                                                       |comment|
+----------------------------+----------------------------------------------------------------+-------+
|id                          |int                                                             |       |
|firstName                   |string                                                          |       |
|lastName                    |string                                                          |       |
|                            |                                                                |       |
|# Partitioning              |                                                                |       |
|Part 0                      |lastName                                                        |       |
|                            |                                  

id,firstName,lastName
2,youngmi,kim
1,suhyuk,park


In [39]:
schema_v4a = StructType([
    StructField("id", IntegerType(), True),
    StructField("firstName", StringType(), True),
    StructField("middleName", StringType(), True),
    StructField("lastName", StringType(), True)
])
rows_v4a = []
rows_v4a.append(Row(2, "youngmi", "kiki", "kim"))
df_v4a = spark.createDataFrame(rows_v4a, schema_v4a)
df_v4a.write.format("delta").mode("append").option("mergeSchema", True).partitionBy("lastName").saveAsTable("{}.{}".format(dbName, tableName))

In [42]:
show("describe extended {}.{}".format(dbName, tableName), 100)

+----------------------------+----------------------------------------------------------------+-------+
|col_name                    |data_type                                                       |comment|
+----------------------------+----------------------------------------------------------------+-------+
|id                          |int                                                             |       |
|firstName                   |string                                                          |       |
|lastName                    |string                                                          |       |
|middleName                  |string                                                          |       |
|                            |                                                                |       |
|# Partitioning              |                                                                |       |
|Part 0                      |lastName                          

### Q5. 데이터 변경이 존재하는 변경 시에 dataChange = false 주게 되는 경우?
> 최초 테이블 생성 시에 저장되는 히스토리 정보 확인 후, append 이후에 히스토리 내역을 보면 `islocationLevel` 이 `Serializable` 에서 `SnapshotIsolation` 으로 변경

In [48]:
dropAndRemoveTable(dbName, tableName)

schema_v5 = StructType([
    StructField("id", IntegerType(), True),
    StructField("firstName", StringType(), True),
    StructField("lastName", StringType(), True)
])
rows_v5 = []
rows_v5.append(Row(1, "suhyuk", "park"))

df_v5 = spark.createDataFrame(rows_v5, schema_v5)
df_v5.write.format("delta").mode("overwrite").partitionBy("lastName").saveAsTable("{}.{}".format(dbName, tableName))

show("describe extended {}.{}".format(dbName, tableName), 100)
history(dbName, tableName)

+----------------------------+----------------------------------------------------------------+-------+
|col_name                    |data_type                                                       |comment|
+----------------------------+----------------------------------------------------------------+-------+
|id                          |int                                                             |       |
|firstName                   |string                                                          |       |
|lastName                    |string                                                          |       |
|                            |                                                                |       |
|# Partitioning              |                                                                |       |
|Part 0                      |lastName                                                        |       |
|                            |                                  

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,2024-10-28 14:30:08.489,,,CREATE OR REPLACE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [""lastName""], properties -> {}}",,,,,Serializable,False,"{numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 716}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


In [49]:
rows_v5a = []
rows_v5a.append(Row(2, "youngmi", "kim"))
df_v5a = spark.createDataFrame(rows_v5a, schema_v5)
df_v5a.write.format("delta").mode("append").option("dataChange", True).partitionBy("lastName").saveAsTable("{}.{}".format(dbName, tableName))

sql("select * from {}.{}".format(dbName, tableName))
history(dbName, tableName)

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
1,2024-10-28 14:30:14.594,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,0.0,Serializable,True,"{numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 723}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
0,2024-10-28 14:30:08.489,,,CREATE OR REPLACE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [""lastName""], properties -> {}}",,,,,Serializable,False,"{numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 716}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


In [50]:
rows_v5b = []
rows_v5b.append(Row(3, "sowon", "park"))
rows_v5b.append(Row(4, "sean", "park"))
df_v5b = spark.createDataFrame(rows_v5b, schema_v5)
df_v5b.write.format("delta").mode("append").option("dataChange", False).partitionBy("lastName").saveAsTable("{}.{}".format(dbName, tableName))

sql("select * from {}.{}".format(dbName, tableName))
history(dbName, tableName)

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
2,2024-10-28 14:30:25.965,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,1.0,SnapshotIsolation,True,"{numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 1411}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
1,2024-10-28 14:30:14.594,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,0.0,Serializable,True,"{numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 723}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
0,2024-10-28 14:30:08.489,,,CREATE OR REPLACE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [""lastName""], properties -> {}}",,,,,Serializable,False,"{numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 716}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


### Q6. 데이터 변경이 존재하지 않는 경우에 dataChange = false 주는 경우?
> 최초 테이블 생성 시에 저장되는 히스토리 정보 확인 후, 마찬가지로 `islocationLevel` 이 `Serializable` 에서 `SnapshotIsolation` 으로 변경

In [54]:
df_v6 = table(dbName, tableName)
df_v6.printSchema()
df_v6.show(10, truncate=False)
history(dbName, tableName)

root
 |-- id: integer (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)

+---+---------+--------+
|id |firstName|lastName|
+---+---------+--------+
|2  |youngmi  |kim     |
|1  |suhyuk   |park    |
|3  |sowon    |park    |
|4  |sean     |park    |
+---+---------+--------+



version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
2,2024-10-28 14:30:25.965,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,1.0,SnapshotIsolation,True,"{numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 1411}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
1,2024-10-28 14:30:14.594,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,0.0,Serializable,True,"{numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 723}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
0,2024-10-28 14:30:08.489,,,CREATE OR REPLACE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [""lastName""], properties -> {}}",,,,,Serializable,False,"{numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 716}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


In [None]:
df_v6.repartition(1).write.option("dataChange", True).format("delta").mode("overwrite").saveAsTable("{}.{}".format(dbName, tableName))
history(dbName, tableName)

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
3,2024-10-28 14:36:58.225,,,CREATE OR REPLACE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [""lastName""], properties -> {}}",,,,2.0,Serializable,False,"{numFiles -> 2, numOutputRows -> 4, numOutputBytes -> 1443}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
2,2024-10-28 14:30:25.965,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,1.0,SnapshotIsolation,True,"{numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 1411}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
1,2024-10-28 14:30:14.594,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,0.0,Serializable,True,"{numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 723}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
0,2024-10-28 14:30:08.489,,,CREATE OR REPLACE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [""lastName""], properties -> {}}",,,,,Serializable,False,"{numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 716}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


In [60]:
df_v6.repartition(4).write.option("dataChange", False).format("delta").mode("overwrite").saveAsTable("{}.{}".format(dbName, tableName))
history(dbName, tableName)

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
4,2024-10-28 14:39:56.626,,,CREATE OR REPLACE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [""lastName""], properties -> {}}",,,,3.0,SnapshotIsolation,False,"{numFiles -> 4, numOutputRows -> 4, numOutputBytes -> 2850}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
3,2024-10-28 14:36:58.225,,,CREATE OR REPLACE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [""lastName""], properties -> {}}",,,,2.0,Serializable,False,"{numFiles -> 2, numOutputRows -> 4, numOutputBytes -> 1443}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
2,2024-10-28 14:30:25.965,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,1.0,SnapshotIsolation,True,"{numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 1411}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
1,2024-10-28 14:30:14.594,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,0.0,Serializable,True,"{numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 723}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
0,2024-10-28 14:30:08.489,,,CREATE OR REPLACE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [""lastName""], properties -> {}}",,,,,Serializable,False,"{numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 716}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


In [11]:
dbName="default"
tableName="pusan_popular_cluster_by"

In [12]:
pusan_popular_trip = spark.read.format("parquet").load("data/pusan_popular_trip")
pusan_popular_trip.write.option("overwrite", True).format("delta").saveAsTable("{}.{}".format(dbName, tableName))

In [13]:
sql("ALTER TABLE {}.{} CLUSTER BY (id, name)".format(dbName, tableName))
describe(dbName, tableName)

ParseException: 
no viable alternative at input 'ALTER TABLE default.pusan_popular_cluster_by CLUSTER'(line 1, pos 45)

== SQL ==
ALTER TABLE default.pusan_popular_cluster_by CLUSTER BY (id, name)
---------------------------------------------^^^


In [14]:
dropAndRemoveTable(dbName, tableName)

In [18]:
pusan_popular_trip = sql("select * from default.pusan_popular_trip")
pusan_popular_trip.count()

1956

In [27]:
show("show tables")
describe("default", "pusan_popular_trip")

+---------+--------------------+-----------+
|namespace|tableName           |isTemporary|
+---------+--------------------+-----------+
|default  |delta_v1            |false      |
|default  |delta_v2            |false      |
|default  |family              |false      |
|default  |pusan_popular_trip  |false      |
|default  |pusan_popular_zorder|false      |
|default  |users               |false      |
+---------+--------------------+-----------+

+----------------------------+----------------------------------------------------------------+-------+
|col_name                    |data_type                                                       |comment|
+----------------------------+----------------------------------------------------------------+-------+
|category                    |int                                                             |       |
|id                          |int                                                             |       |
|name                        

In [29]:
pusan_popular_trip = spark.read.table("default.pusan_popular_trip")
pusan_popular_trip.write.format("delta").mode("overwrite").clusterBy("id,name").saveAsTable("{}.{}".format(dbName, tableName))

AttributeError: 'DataFrameWriter' object has no attribute 'clusterBy'

In [31]:
# cluster-by 구문은 delta lake 3.x 부터 지원하며 현재는 2.x 임
createTable = (
    DeltaTable.create()
    .tableName("{}.{}".format(dbName, tableName))
    .addColumn("category", dataType = "int")
    .addColumn("id", dataType = "int")
    .addColumn("name", dataType = "string")
    .addColumn("address", dataType = "string")
    .addColumn("naddress", dataType = "string")
    .addColumn("tel", dataType = "string")
    .addColumn("tag", dataType = "string")
    .addColumn("exp", dataType = "decimal(38,0)")
    .clusterBy("id", "name")
)

AttributeError: 'DeltaTableBuilder' object has no attribute 'clusterBy'

In [None]:
createTable.execute()