## 5-2. ZORDER BY


In [1]:
import findspark
findspark.init()

import os
print(os.environ['JAVA_HOME'])
print(os.environ['SPARK_HOME'])

/usr/lib/jvm/java-11-openjdk-amd64
/usr/local/spark


In [2]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON

from delta import *

# 공통 데이터 위치
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
work_dir=!pwd
work_dir = work_dir[0]
warehouse_dir = f"{work_dir}/spark-warehouse"

# Create spark session with hive enabled
builder = (
    SparkSession
    .builder
    .appName("pyspark-notebook")
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.databricks.delta.retentionDurationCheck.enabled", "true")
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.sql.warehouse.dir", warehouse_dir)
    .enableHiveSupport()
)

In [3]:
# 델타 레이크 생성시에 반드시 `configure_spark_with_delta_pip` 구성을 통해 실행되어야 정상적인 델타 의존성이 로딩됩니다
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [4]:
# 노트북에서 테이블 형태로 데이터 프레임 출력을 위한 설정을 합니다
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # display enabled
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100) # display output columns size

# 로컬 환경 최적화
spark.conf.set("spark.sql.shuffle.partitions", 5) # the number of partitions to use when shuffling data for joins or aggregations.
spark.conf.set("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")
spark.conf.set("spark.sql.decimalOperations.allowPrecisionLoss", "true")
spark

In [24]:
def show(queries, num_rows = 20):
    for query in queries.split(";"):
        spark.sql(query).show(num_rows, truncate=False)

def sql(query):
    return spark.sql(query)

def history(dbName, tableName):
    return spark.sql("describe history {}.{}".format(dbName, tableName))

def table(dbName, tableName):
    return spark.read.format("delta").table("{}.{}".format(dbName, tableName))

def describe(dbName, tableName, extended = True, num_rows = 20):
    if extended:
        show("describe extended {}.{}".format(dbName, tableName), num_rows)
    else:
        show("describe {}.{}".format(dbName, tableName), num_rows)

def ls(target):
    !ls -al {target}

def ls_and_head(target, lineno):
    !ls -al {target} | grep -v 'crc' | head -{lineno}

def cat(filename):
    !cat {filename}

def grep(keyword, filename):
    !grep -i {keyword} {filename}

def grep_and_json(keyword, filename):
    !grep {keyword} {filename} | python -m json.tool

def grep_sed_json(keyword, lineno, filename):
    !grep {keyword} {filename} | sed -n {lineno}p | python -m json.tool


In [13]:
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType

def dropAndRemoveTable(dbName, tableName):
    location="/home/jovyan/work/spark-warehouse/{}".format(tableName)
    !rm -rf {location}
    sql("DROP TABLE IF EXISTS {}.{}".format(dbName, tableName))

In [15]:
dbName="default"
tableName="pusan_popular_trip"

pusan_popular_trip = spark.read.format("parquet").load("data/pusan_popular_trip")
pusan_popular_trip.write.option("overwrite", True).format("delta").saveAsTable("{}.{}".format(dbName, tableName))

In [16]:
sql("show tables")

namespace,tableName,isTemporary
default,delta_v1,False
default,delta_v2,False
default,family,False
default,pusan_popular_trip,False
default,users,False


In [20]:
sql("select count(1) from {}.{}".format(dbName, tableName))

count(1)
1956


In [46]:
tableName="pusan_popular_trip"
show("explain select * from {}.{} where id = 281".format(dbName, tableName), 100)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|plan                                                                                                                                                                                                                                                                                                                                                                                                

In [31]:
describe(dbName, tableName)
show("select id, name from {}.{}".format(dbName, tableName), 5)

+----------------------------+----------------------------------------------------------------+-------+
|col_name                    |data_type                                                       |comment|
+----------------------------+----------------------------------------------------------------+-------+
|category                    |int                                                             |       |
|id                          |int                                                             |       |
|name                        |string                                                          |       |
|address                     |string                                                          |       |
|naddress                    |string                                                          |       |
|tel                         |string                                                          |       |
|tag                         |string                            

In [47]:
dbName="default"
tableName="pusan_popular_zorder"

In [27]:
pusan_popular_trip = spark.read.format("parquet").load("data/pusan_popular_trip")
pusan_popular_trip.write.option("overwrite", True).format("delta").saveAsTable("{}.{}".format(dbName, tableName))

In [28]:
sql("show tables")

namespace,tableName,isTemporary
default,delta_v1,False
default,delta_v2,False
default,family,False
default,pusan_popular_trip,False
default,pusan_popular_zorder,False
default,users,False


In [32]:
ls("./spark-warehouse/{}/".format(tableName))

total 252
drwxrwxrwx 1 jovyan 1000    512 Nov  5 06:58 .
drwxrwxrwx 1 jovyan 1000    512 Nov  5 06:58 ..
drwxrwxrwx 1 jovyan 1000    512 Nov  5 06:58 _delta_log
-rwxrwxrwx 1 jovyan 1000 252567 Nov  5 06:58 part-00000-d635d5cc-6681-446b-a173-c39ab3285aef-c000.snappy.parquet
-rwxrwxrwx 1 jovyan 1000   1984 Nov  5 06:58 .part-00000-d635d5cc-6681-446b-a173-c39ab3285aef-c000.snappy.parquet.crc


In [33]:
sql("OPTIMIZE {}.{} ZORDER BY id, name".format(dbName, tableName))

path,metrics
file:/home/jovyan/work/spark-warehouse/pusan_popular_zorder,"{1, 1, {252567, 252567, 252567.0, 1, 252567}, {252567, 252567, 252567.0, 1, 252567}, 1, {all, {0,..."


In [34]:
ls("./spark-warehouse/{}/".format(tableName))

total 504
drwxrwxrwx 1 jovyan 1000    512 Nov  5 07:02 .
drwxrwxrwx 1 jovyan 1000    512 Nov  5 06:58 ..
drwxrwxrwx 1 jovyan 1000    512 Nov  5 07:02 _delta_log
-rwxrwxrwx 1 jovyan 1000 252567 Nov  5 07:02 part-00000-90eddbb6-565e-4d34-a4c0-9ecd828e5637-c000.snappy.parquet
-rwxrwxrwx 1 jovyan 1000   1984 Nov  5 07:02 .part-00000-90eddbb6-565e-4d34-a4c0-9ecd828e5637-c000.snappy.parquet.crc
-rwxrwxrwx 1 jovyan 1000 252567 Nov  5 06:58 part-00000-d635d5cc-6681-446b-a173-c39ab3285aef-c000.snappy.parquet
-rwxrwxrwx 1 jovyan 1000   1984 Nov  5 06:58 .part-00000-d635d5cc-6681-446b-a173-c39ab3285aef-c000.snappy.parquet.crc


In [44]:
show("explain select * from {}.{} where id = 281".format(dbName, tableName))

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|plan                                                                                                                                                                                                                                                                                                                                                                                            

In [19]:
sql("VACUUM {}".format(tableName))

path
file:/home/jovyan/work/spark-warehouse/delta_v1


In [20]:
ls("./spark-warehouse/{}/".format(tableName))

total 4
drwxrwxrwx 1 jovyan 1000 512 Oct 29  2024 .
drwxrwxrwx 1 jovyan 1000 512 Oct 29 04:55 ..
drwxrwxrwx 1 jovyan 1000 512 Oct 29 04:56 _delta_log
-rwxrwxrwx 1 jovyan 1000 382 Oct 29 04:56 part-00000-0e13db32-fe48-4793-826e-df135f4e0762-c000.snappy.parquet
-rwxrwxrwx 1 jovyan 1000  12 Oct 29 04:56 .part-00000-0e13db32-fe48-4793-826e-df135f4e0762-c000.snappy.parquet.crc
-rwxrwxrwx 1 jovyan 1000 701 Oct 29 04:56 part-00001-e66155f1-5323-4090-ab29-c3c571193ed3-c000.snappy.parquet
-rwxrwxrwx 1 jovyan 1000  16 Oct 29 04:56 .part-00001-e66155f1-5323-4090-ab29-c3c571193ed3-c000.snappy.parquet.crc


In [48]:
history(dbName, tableName)

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
1,2024-11-05 16:02:05.926,,,OPTIMIZE,"{predicate -> [], zOrderBy -> [""id"",""name""]}",,,,0.0,SnapshotIsolation,False,"{numRemovedFiles -> 1, numRemovedBytes -> 252567, p25FileSize -> 252567, minFileSize -> 252567, n...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
0,2024-11-05 15:58:39.034,,,CREATE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [], properties -> {}}",,,,,Serializable,True,"{numFiles -> 1, numOutputRows -> 1956, numOutputBytes -> 252567}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
