# Creating a Delta Table with SQL DDL

In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON

from delta import *

# Common data locations
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
# `!pwd` is an IPython shell capture; the result is indexable, and its first element is used as the working directory
work_dir=!pwd
work_dir = work_dir[0]
warehouse_dir = f"{work_dir}/spark-warehouse"

# Configure a Spark session builder with the Delta Lake extension/catalog and Hive support.
# Only the builder is prepared here; no session is created in this cell.
builder = (
    SparkSession
    .builder
    .appName("pyspark-notebook")
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.sql.warehouse.dir", warehouse_dir)
    .enableHiveSupport()
)

In [2]:
# The session must be created through `configure_spark_with_delta_pip` so that the Delta Lake dependencies are loaded correctly
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
# Runtime settings applied to the session, in the order listed.
session_settings = {
    # Render DataFrames as tables in the notebook
    "spark.sql.repl.eagerEval.enabled": True,    # display enabled
    "spark.sql.repl.eagerEval.truncate": 100,    # display output columns size
    # Local-environment tuning
    "spark.sql.shuffle.partitions": 5,           # partitions used when shuffling data for joins or aggregations
    "spark.sql.streaming.forceDeleteTempCheckpointLocation": "true",
}
for conf_key, conf_value in session_settings.items():
    spark.conf.set(conf_key, conf_value)

# Last expression of the cell: display the session object
spark

In [49]:
def sql(queries):
    """Run one or more semicolon-separated SQL statements and print each result.

    Args:
        queries: A string holding one statement, or several separated by ";".

    Each non-blank statement is passed to ``spark.sql`` and its result is shown
    untruncated. Blank fragments (for example after a trailing ";", or when the
    input is empty) are skipped rather than sent to Spark, which cannot parse
    an empty statement.

    Note: the split is a plain ``str.split(";")``, so a ";" inside a string
    literal or comment would also split the statement.
    """
    for fragment in queries.split(";"):
        statement = fragment.strip()
        if not statement:
            continue
        spark.sql(statement).show(truncate=False)

def ls(command):
    """Run `ls -al` in a shell on `command`; the value is interpolated unquoted into the command line."""
    !ls -al {command}

def cat(filename):
    """Run `cat` in a shell on `filename`; the value is interpolated unquoted into the command line."""
    !cat {filename}

def grep(keyword, filename):
    """Run `grep -i` (case-insensitive) for `keyword` in `filename`; both values are interpolated unquoted."""
    !grep -i {keyword} {filename}

def grep_and_json(keyword, filename):
    """Run `grep` (without `-i`, unlike the `grep` helper) for `keyword` in `filename` and pipe the output to `python -m json.tool`.

    Both values are interpolated unquoted into the command line.
    """
    !grep {keyword} {filename} | python -m json.tool

In [52]:
# Run two statements through the sql() helper: list the databases, then list the tables
sql("show databases ; show tables")

+---------+
|namespace|
+---------+
|default  |
|taxidb   |
+---------+

+---------+-----------+-----------+
|namespace|tableName  |isTemporary|
+---------+-----------+-----------+
|taxidb   |greentaxis |false      |
|taxidb   |ratecard   |false      |
|taxidb   |yellowtaxis|false      |
+---------+-----------+-----------+



In [20]:
# DDL for a Delta table addressed by path (delta.`<path>`) under work_data, rather than by a database-qualified name
creating_delta_table_with_sql_ddl = f"""
CREATE TABLE IF NOT EXISTS delta.`{work_data}/rateCard.delta` (
    rateCodeId INT,
    rateCodeDesc STRING
) USING DELTA
"""

In [21]:
# List the table directory on disk, then print the DDL with the path interpolated
!ls -al {work_data}/rateCard.delta
print(creating_delta_table_with_sql_ddl)

total 4
drwxrwxrwx 1 jovyan 1000 512 Aug 14 10:18 .
drwxrwxrwx 1 jovyan 1000 512 Aug 14 11:38 ..
drwxrwxrwx 1 jovyan 1000 512 Aug 14 10:18 _delta_log
-rwxrwxrwx 1 jovyan 1000 858 Aug 14 10:18 part-00000-6c22f7b9-a139-40a7-b5dd-086d9064e657-c000.snappy.parquet
-rwxrwxrwx 1 jovyan 1000  16 Aug 14 10:18 .part-00000-6c22f7b9-a139-40a7-b5dd-086d9064e657-c000.snappy.parquet.crc

CREATE TABLE IF NOT EXISTS delta.`/home/jovyan/work/data/rateCard.delta` (
    rateCodeId INT,
    rateCodeDesc STRING
) USING DELTA



In [22]:
# Running the command below updates the files under the metastore_db/tmp path
spark.sql(creating_delta_table_with_sql_ddl)

In [26]:
# Database DDL (nothing to interpolate, so a plain string is enough)
creating_database = """
CREATE DATABASE IF NOT EXISTS taxidb
"""

# Named-table DDL; the LOCATION path must be wrapped in single quotes
creating_delta_table_with_name = f"""
CREATE TABLE IF NOT EXISTS taxidb.rateCard (
    rateCodeId INT,
    rateCodeDesc STRING
) USING DELTA
LOCATION '{work_data}/rateCard.delta'
"""

# Echo both statements (database first) so the rendered SQL can be checked before it is run
for ddl_text in (creating_database, creating_delta_table_with_name):
    print(ddl_text)


CREATE DATABASE IF NOT EXISTS taxidb


CREATE TABLE IF NOT EXISTS taxidb.rateCard (
    rateCodeId INT,
    rateCodeDesc STRING
) USING DELTA
LOCATION '/home/jovyan/work/data/rateCard.delta'



In [27]:
# Create the database first, then the table inside it; each DDL goes through the sql() helper
for ddl_text in (creating_database, creating_delta_table_with_name):
    sql(ddl_text)

In [32]:
# Make taxidb the current database, then list the tables
sql("use taxidb; show tables")

++
||
++
++

+---------+-----------+-----------+
|namespace|tableName  |isTemporary|
+---------+-----------+-----------+
|taxidb   |greentaxis |false      |
|taxidb   |ratecard   |false      |
|taxidb   |yellowtaxis|false      |
+---------+-----------+-----------+



In [36]:
# List the _delta_log (transaction log) directory of the rateCard Delta table
ls("./data/rateCard.delta/_delta_log")

total 8
drwxrwxrwx 1 jovyan 1000 512 Aug 14 10:18 .
drwxrwxrwx 1 jovyan 1000 512 Aug 14 10:18 ..
-rwxrwxrwx 1 jovyan 1000 788 Aug 14 10:16 00000000000000000000.json
-rwxrwxrwx 1 jovyan 1000  16 Aug 14 10:16 .00000000000000000000.json.crc
-rwxrwxrwx 1 jovyan 1000 822 Aug 14 10:18 00000000000000000001.json
-rwxrwxrwx 1 jovyan 1000  16 Aug 14 10:18 .00000000000000000001.json.crc


In [39]:
# This metadata action is always written to the first transaction log entry created for our new table
# Print that first entry (00000000000000000000.json) in full
cat("./data/rateCard.delta/_delta_log/00000000000000000000.json")

{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
{"metaData":{"id":"a40a4266-af73-41ab-8d73-d7e7ab96e31f","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"rateCodeId\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"rateCodeDesc\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1723630609862}}
{"commitInfo":{"timestamp":1723630609999,"operation":"CREATE TABLE","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{}"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/3.2.1 Delta-Lake/2.0.0","txnId":"ee694a5e-eaf8-4b10-ab57-4b15562518f6"}}


In [41]:
# Show only the lines of the first log entry that contain "metadata"
grep("metadata", "./data/rateCard.delta/_delta_log/00000000000000000000.json")

{"metaData":{"id":"a40a4266-af73-41ab-8d73-d7e7ab96e31f","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"rateCodeId\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"rateCodeDesc\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1723630609862}}


In [50]:
# Delta Lake has written the schema of the table to the transaction log entry, together with some auditing and partitioning information
# Pretty-print the matching log line as JSON
grep_and_json("metadata", "./data/rateCard.delta/_delta_log/00000000000000000000.json")

{
    "metaData": {
        "id": "a40a4266-af73-41ab-8d73-d7e7ab96e31f",
        "format": {
            "provider": "parquet",
            "options": {}
        },
        "schemaString": "{\"type\":\"struct\",\"fields\":[{\"name\":\"rateCodeId\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"rateCodeDesc\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}",
        "partitionColumns": [],
        "configuration": {},
        "createdTime": 1723630609862
    }
}


In [10]:
# When you want to find the Delta Lake-specific attributes, you can also use the DESCRIBE TABLE EXTENDED command,
# which provides more detailed metadata information, including the following generic attributes:

# Show the extended description first, then the table contents, both untruncated
for ratecard_query in ("describe extended ratecard", "select * from ratecard"):
    spark.sql(ratecard_query).show(truncate=False)

+----------------------------+-------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                      |comment|
+----------------------------+-------------------------------------------------------------------------------+-------+
|rateCodeId                  |int                                                                            |       |
|rateCodeDesc                |string                                                                         |       |
|                            |                                                                               |       |
|# Partitioning              |                                                                               |       |
|Not partitioned             |                                                                               |       |
|                            |                  

In [12]:
# Read the rate-card CSV: the first line is the header and column types are inferred
rate_card_csv_path = f"{work_dir}/data/rateCard.csv"
csvRateCard = (
    spark.read
    .format("csv")
    .option("inferSchema", True)
    .option("header", True)
    .load(rate_card_csv_path)
)

# Inspect the inferred schema and the rows
csvRateCard.printSchema()
csvRateCard.show(truncate=False)

root
 |-- RateCodeID: integer (nullable = true)
 |-- RateCodeDesc: string (nullable = true)

+----------+---------------------+
|RateCodeID|RateCodeDesc         |
+----------+---------------------+
|1         |Standard Rate        |
|2         |JFK                  |
|3         |Newark               |
|4         |Nassau or Westchester|
|5         |Negotiated fare      |
|6         |Group ride           |
+----------+---------------------+



In [13]:
# Overwrite the taxidb.rateCard table with the CSV contents, written in Delta format
rate_card_writer = csvRateCard.write.format("delta").mode("overwrite")
rate_card_writer.saveAsTable("taxidb.rateCard")

In [14]:
# List the transaction log files, then print the second entry (00000000000000000001.json)
!ls {work_dir}/data/rateCard.delta/_delta_log
!cat {work_dir}/data/rateCard.delta/_delta_log/00000000000000000001.json

00000000000000000000.json  00000000000000000001.json
{"add":{"path":"part-00000-879dbb89-ee5a-47d4-ab0d-0e6445663607-c000.snappy.parquet","partitionValues":{},"size":858,"modificationTime":1723625669220,"dataChange":true,"stats":"{\"numRecords\":6,\"minValues\":{\"rateCodeId\":1,\"rateCodeDesc\":\"Group ride\"},\"maxValues\":{\"rateCodeId\":6,\"rateCodeDesc\":\"Standard Rate\"},\"nullCount\":{\"rateCodeId\":0,\"rateCodeDesc\":0}}"}}
{"commitInfo":{"timestamp":1723625668130,"operation":"CREATE OR REPLACE TABLE AS SELECT","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{}"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"1","numOutputRows":"6","numOutputBytes":"858"},"engineInfo":"Apache-Spark/3.2.1 Delta-Lake/2.0.0","txnId":"5e49e955-4dca-4948-89a1-32cf38bb834f"}}


In [30]:
# When the table is dropped, all metadata and data files are kept; only the table entry in the database is removed
spark.sql("DROP TABLE IF EXISTS taxidb.rateCard")

In [31]:
# List the transaction log files again and print the second entry (00000000000000000001.json)
!ls {work_dir}/data/rateCard.delta/_delta_log
!cat {work_dir}/data/rateCard.delta/_delta_log/00000000000000000001.json

00000000000000000000.json  00000000000000000001.json
{"add":{"path":"part-00000-879dbb89-ee5a-47d4-ab0d-0e6445663607-c000.snappy.parquet","partitionValues":{},"size":858,"modificationTime":1723625669220,"dataChange":true,"stats":"{\"numRecords\":6,\"minValues\":{\"rateCodeId\":1,\"rateCodeDesc\":\"Group ride\"},\"maxValues\":{\"rateCodeId\":6,\"rateCodeDesc\":\"Standard Rate\"},\"nullCount\":{\"rateCodeId\":0,\"rateCodeDesc\":0}}"}}
{"commitInfo":{"timestamp":1723625668130,"operation":"CREATE OR REPLACE TABLE AS SELECT","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{}"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"1","numOutputRows":"6","numOutputBytes":"858"},"engineInfo":"Apache-Spark/3.2.1 Delta-Lake/2.0.0","txnId":"5e49e955-4dca-4948-89a1-32cf38bb834f"}}


In [51]:
# Make taxidb the current database, then list the tables
sql("use taxidb ; show tables")

++
||
++
++

+---------+-----------+-----------+
|namespace|tableName  |isTemporary|
+---------+-----------+-----------+
|taxidb   |greentaxis |false      |
|taxidb   |ratecard   |false      |
|taxidb   |yellowtaxis|false      |
+---------+-----------+-----------+



In [None]:
# Generated Columns from spark 3.4.0
# NOTE(review): the transaction log entries recorded in this notebook report Apache-Spark/3.2.1,
# so this cell presumably needs a newer runtime — confirm before running.
#
# PickupYear / PickupMonth / PickupDay are derived from PickupTime by their GENERATED ALWAYS AS expressions.
# IF NOT EXISTS matches the other CREATE statements in this notebook and keeps the cell re-runnable
# when taxidb.YellowTaxis is already registered.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS taxidb.YellowTaxis (
    RideId INT COMMENT 'This is our primary Key column',
    VendorId INT,
    PickupTime TIMESTAMP,
    PickupYear INT GENERATED ALWAYS AS(YEAR (PickupTime)),
    PickupMonth INT GENERATED ALWAYS AS(MONTH (PickupTime)),
    PickupDay INT GENERATED ALWAYS AS(DAY (PickupTime)),
    DropTime TIMESTAMP,
    CabNumber STRING COMMENT 'Official Yellow Cab Number'
) USING DELTA
LOCATION '{work_dir}/data/YellowTaxis.delta'
COMMENT 'Table to store Yellow Taxi data'
""")

In [None]:
# Insert one ride; the generated columns are left out of the column list
insert_ride_sql = """
INSERT INTO taxidb.YellowTaxis (RideId, VendorId, PickupTime, DropTime, CabNumber)
VALUES (5, 101, '2021-7-1T8:43:28UTC+3', '2021-7-1T8:43:28UTC+3', '51-986')
"""
spark.sql(insert_ride_sql)

# The expression you use in GENERATED ALWAYS AS can be any Spark SQL function that always returns the same result
# when given the same argument values, with a few exceptions we will touch on soon
select_pickup_parts_sql = """
SELECT PickupTime, PickupYear, PickupMonth, PickupDay FROM taxidb.YellowTaxis
"""
spark.sql(select_pickup_parts_sql).show(truncate=False)

In [None]:
# Non deterministic function
# A non-deterministic expression such as uuid() cannot be used for a generated column.
# The same applies to the following kinds of expressions:
# - User-defined functions
# - Aggregate functions
# - Window functions
# - Functions returning multiple rows
#
# This statement is meant to be rejected because of UUID(). The column list needs its
# opening parenthesis so that the statement parses and the failure comes from the
# generated-column check rather than from a syntax error.

spark.sql(f"""
CREATE OR REPLACE TABLE default.dummy (
    ID STRING GENERATED ALWAYS AS (UUID()),
    Name STRING
) USING DELTA
""")