# Creating Delta Tables with the DataFrameWriter API

In [1]:
import os

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON

from delta import *

# Shared data locations
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
# Working directory of the kernel process as a plain string. This replaces the
# `!pwd` shell-out, which returned a list of output lines that then had to be
# indexed with [0].
work_dir = os.getcwd()
warehouse_dir = f"{work_dir}/spark-warehouse"

# Builder for a Hive-enabled Spark session with the Delta Lake extension and catalog.
# The session itself is not created here; only the builder is configured.
builder = (
    SparkSession
    .builder
    .appName("pyspark-notebook")
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.catalogImplementation", "hive")
    # Managed tables are stored under <work_dir>/spark-warehouse
    .config("spark.sql.warehouse.dir", warehouse_dir)
    .enableHiveSupport()
)

In [2]:
# The Delta Lake dependencies are only loaded correctly when the builder is passed
# through `configure_spark_with_delta_pip` before the session is created.
delta_builder = configure_spark_with_delta_pip(builder)
spark = delta_builder.getOrCreate()

In [3]:

# Runtime session settings, applied in the order listed.
for conf_key, conf_value in (
    # Render DataFrames as tables in the notebook.
    ("spark.sql.repl.eagerEval.enabled", True),   # display enabled
    ("spark.sql.repl.eagerEval.truncate", 100),   # display output columns size
    # Tuning for a local environment: number of partitions used when shuffling
    # data for joins or aggregations.
    ("spark.sql.shuffle.partitions", 5),
    ("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true"),
):
    spark.conf.set(conf_key, conf_value)

# Last expression of the cell: show the session summary.
spark

In [4]:
def sql(queries):
    """Run one or more semicolon-separated SQL statements and print each result.

    Each statement is executed with the notebook-level `spark` session and its
    result is printed with `show(truncate=False)`.

    Blank statements (for example those produced by a trailing or doubled `;`)
    are skipped instead of being sent to Spark, where an empty statement would
    fail to parse.

    Note: the input is split on every `;`, so a semicolon inside a string
    literal or comment will still split the statement.

    Args:
        queries: A string containing SQL statements separated by `;`.
    """
    for statement in queries.split(";"):
        statement = statement.strip()
        if not statement:
            # Nothing between two separators (or after the last one).
            continue
        spark.sql(statement).show(truncate=False)

def ls(command):
    """List directory contents in long format, including hidden entries.

    `command` is interpolated verbatim into the `ls -al` shell command line, so
    it can be a path or any further `ls` arguments.
    """
    !ls -al {command}

def cat(filename):
    """Print the contents of `filename` by running the shell `cat` command.

    `filename` is interpolated verbatim into the shell command line.
    """
    !cat {filename}

def grep(keyword, filename):
    """Print the lines of `filename` that match `keyword`, ignoring case.

    Both arguments are interpolated verbatim into a `grep -i` shell command line.
    """
    !grep -i {keyword} {filename}

def grep_and_json(keyword, filename):
    """Pretty-print the JSON found on the line(s) of `filename` matching `keyword`.

    The `grep` output (case-sensitive here, unlike `grep()`) is piped into
    `python -m json.tool`. Both arguments are interpolated verbatim into the
    shell command line.
    """
    # NOTE(review): json.tool parses its input as a single JSON document by
    # default, so this presumably relies on exactly one line matching — confirm.
    !grep {keyword} {filename} | python -m json.tool

In [7]:
# List the databases, switch to taxidb, then list its tables (one result per statement).
sql("show databases ; use taxidb ; show tables")

+---------+
|namespace|
+---------+
|default  |
|taxidb   |
+---------+

++
||
++
++

+---------+-----------+-----------+
|namespace|tableName  |isTemporary|
+---------+-----------+-----------+
|taxidb   |greentaxis |false      |
|taxidb   |ratecard   |false      |
|taxidb   |yellowtaxis|false      |
+---------+-----------+-----------+



In [8]:
# The DeltaTable API does not provide a drop operation, so the table is dropped via SQL.
# By default table names are case-insensitive (greenTaxis refers to greentaxis).
sql("""
DROP TABLE IF EXISTS taxidb.greenTaxis
""")

++
||
++
++



In [9]:
# Describe the managed table taxidb.greenTaxis with the DeltaTableBuilder API.
# Nothing is created until execute() is called on the builder.
greenTaxisBuilder = (
    DeltaTable.createIfNotExists(spark)
    .tableName("taxidb.greenTaxis")
    .addColumn("VendorId", "INT", comment = "Ride Vendor")
    .addColumn("lpep_pickup_datetime", "STRING")
    .addColumn("lpep_dropoff_datetime", "STRING")
    .addColumn("store_and_fwd_flag", "STRING")
    .addColumn("RatecodeID", "INT", comment = "Ref to RateCard")
    .addColumn("PULocationID", "INT")
    .addColumn("DOLocationID", "INT")
    .addColumn("passenger_count", "INT")
    .addColumn("trip_distance", "DOUBLE")
    .addColumn("fare_amount", "DOUBLE")
    .addColumn("extra", "DOUBLE")
    .addColumn("mta_tax", "DOUBLE")
    .addColumn("tip_amount", "DOUBLE")
    .addColumn("tolls_amount", "DOUBLE")
    .addColumn("ehail_fee", "STRING")
    .addColumn("improvement_surcharge", "DOUBLE")
    .addColumn("total_amount", "DOUBLE")
    .addColumn("payment_type", "INT")
    .addColumn("trip_type", "INT")
    .addColumn("congestion_surcharge", "DOUBLE")
)
# execute() creates the table (if it does not exist yet) and returns the DeltaTable
# handle; keep that handle under the name `deltaTable` rather than the builder.
deltaTable = greenTaxisBuilder.execute()
deltaTable

<delta.tables.DeltaTable at 0x7f0cd1795130>

In [12]:
# Inspect the transaction log of the managed table. The path is derived from
# `warehouse_dir`, the same value passed to spark.sql.warehouse.dir, so it
# follows the session configuration instead of repeating the directory name.
ls(f"{warehouse_dir}/taxidb.db/greentaxis/_delta_log")

total 4
drwxrwxrwx 1 jovyan 1000  512 Aug 28 05:07 .
drwxrwxrwx 1 jovyan 1000  512 Aug 28 05:07 ..
-rwxrwxrwx 1 jovyan 1000 2346 Aug 28 05:07 00000000000000000000.json
-rwxrwxrwx 1 jovyan 1000   28 Aug 28 05:07 .00000000000000000000.json.crc


In [14]:
# Load the green taxi CSV: first row is the header, column types are inferred.
csvGreenTaxi = (
    spark.read
    .format("csv")
    .options(inferSchema=True, header=True)
    .load(f"{work_dir}/data/greenTaxis.csv")
)

# Show the inferred schema and a single untruncated sample row.
csvGreenTaxi.printSchema()
csvGreenTaxi.show(1, truncate=False)

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: string (nullable = true)
 |-- lpep_dropoff_datetime: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: string (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- trip_type: integer (nullable = true)
 |-- congestion_surcharge: double (nullable = true)

+--------+--------------------+---------------------+------------------+----------+-------

In [15]:
# On disk the table is stored under the lower-case name (greentaxis).
(
    csvGreenTaxi.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("taxidb.greenTaxis")
)

In [20]:
# Transaction log directory of the managed table, derived from `warehouse_dir`
# (the value passed to spark.sql.warehouse.dir) and shared by both calls below.
greentaxis_log_dir = f"{warehouse_dir}/taxidb.db/greentaxis/_delta_log"

ls(greentaxis_log_dir)
# Pretty-print the log entry of commit 1 whose text contains "46087".
grep_and_json("46087", f"{greentaxis_log_dir}/00000000000000000001.json")

total 20
drwxrwxrwx 1 jovyan 1000   512 Aug 28 05:08 .
drwxrwxrwx 1 jovyan 1000   512 Aug 28 05:08 ..
-rwxrwxrwx 1 jovyan 1000  2346 Aug 28 05:07 00000000000000000000.json
-rwxrwxrwx 1 jovyan 1000    28 Aug 28 05:07 .00000000000000000000.json.crc
-rwxrwxrwx 1 jovyan 1000 15695 Aug 28 05:08 00000000000000000001.json
-rwxrwxrwx 1 jovyan 1000   132 Aug 28 05:08 .00000000000000000001.json.crc
{
    "add": {
        "path": "part-00000-f7d57c66-f63d-4b6f-b173-431f4b70c855-c000.snappy.parquet",
        "partitionValues": {},
        "size": 930701,
        "modificationTime": 1724821731766,
        "dataChange": true,
        "stats": "{\"numRecords\":46087,\"minValues\":{\"VendorId\":1,\"lpep_pickup_datetime\":\"2009-01-01 00:10:36\",\"lpep_dropoff_datetime\":\"2009-01-01 00:20:29\",\"store_and_fwd_flag\":\"N\",\"RatecodeID\":1,\"PULocationID\":2,\"DOLocationID\":1,\"passenger_count\":0,\"trip_distance\":0.0,\"fare_amount\":-52.0,\"extra\":-1.0,\"mta_tax\":-0.5,\"tip_amount\":-0.76,\"tolls_

In [21]:
# For an external table, DROP keeps all metadata and data files in place;
# only the table entry in the database is removed.
sql("DROP TABLE IF EXISTS taxidb.rateCard")

++
||
++
++



In [23]:
# Confirm the table entry is gone from taxidb, then list the table's directory.
sql("use taxidb ; show tables")
# NOTE(review): this path is relative and spelled `ratecard.delta`, while a later cell
# reads `{work_dir}/data/rateCard.delta` — confirm both refer to the intended directory.
ls("./data/ratecard.delta")

++
||
++
++

+---------+-----------+-----------+
|namespace|tableName  |isTemporary|
+---------+-----------+-----------+
|taxidb   |greentaxis |false      |
|taxidb   |yellowtaxis|false      |
+---------+-----------+-----------+

total 4
drwxrwxrwx 1 jovyan 1000 512 Aug 14 10:18 .
drwxrwxrwx 1 jovyan 1000 512 Aug 14 11:38 ..
drwxrwxrwx 1 jovyan 1000 512 Aug 14 10:18 _delta_log
-rwxrwxrwx 1 jovyan 1000 858 Aug 14 10:18 part-00000-6c22f7b9-a139-40a7-b5dd-086d9064e657-c000.snappy.parquet
-rwxrwxrwx 1 jovyan 1000  16 Aug 14 10:18 .part-00000-6c22f7b9-a139-40a7-b5dd-086d9064e657-c000.snappy.parquet.crc


In [31]:
# List the transaction log files of the rateCard Delta directory and print commit 1.
!ls {work_dir}/data/rateCard.delta/_delta_log
!cat {work_dir}/data/rateCard.delta/_delta_log/00000000000000000001.json

00000000000000000000.json  00000000000000000001.json
{"add":{"path":"part-00000-879dbb89-ee5a-47d4-ab0d-0e6445663607-c000.snappy.parquet","partitionValues":{},"size":858,"modificationTime":1723625669220,"dataChange":true,"stats":"{\"numRecords\":6,\"minValues\":{\"rateCodeId\":1,\"rateCodeDesc\":\"Group ride\"},\"maxValues\":{\"rateCodeId\":6,\"rateCodeDesc\":\"Standard Rate\"},\"nullCount\":{\"rateCodeId\":0,\"rateCodeDesc\":0}}"}}
{"commitInfo":{"timestamp":1723625668130,"operation":"CREATE OR REPLACE TABLE AS SELECT","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{}"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"1","numOutputRows":"6","numOutputBytes":"858"},"engineInfo":"Apache-Spark/3.2.1 Delta-Lake/2.0.0","txnId":"5e49e955-4dca-4948-89a1-32cf38bb834f"}}


In [16]:
# Switch the current database, then return the table list as the cell's last
# expression so the notebook renders it.
spark.sql("use taxidb")
spark.sql("show tables")

namespace,tableName,isTemporary
taxidb,greentaxis,False
taxidb,ratecard,False


In [18]:
# Preview the first five rows of the table without truncating column values.
spark.sql("""
select * from taxidb.greentaxis
""").show(5, truncate=False)

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorId|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|2       |2019-12-19 17:12:23 |2019-12-19 17:19:32  |N                 |1         |181         |25          |1              |1.28         |7.0        |1.0  |0.5   

In [19]:
# Total number of rows in the table.
spark.sql("select count(1) from taxidb.greentaxis")

count(1)
450627


In [23]:
# Average fare per vendor, keeping only vendors whose average exceeds 10.
# `ORDER BY 2` sorts by the second select column (AverageFare); at most 5 rows are returned.
spark.sql("""
SELECT VendorID, AVG(fare_amount) AS AverageFare
FROM taxidb.greentaxis
GROUP BY VendorID
HAVING AVG(fare_amount) > 10
ORDER BY 2 DESC
LIMIT 5
""")

VendorID,AverageFare
,28.830541636814186
1.0,13.217967034800123
2.0,12.054380250700095


In [24]:
# Read the Delta table through the PySpark DataFrame reader and report its size.
df = (
    spark.read
    .format("delta")
    .table("taxidb.greentaxis")
)
record_count = df.count()
print(f"Number of records: {record_count:,}")

Number of records: 450,627


In [25]:
# DataFrame version of the average-fare query: average per vendor, keep
# averages above 10, highest first.
fare_by_vendor = df.groupBy("VendorID").agg(avg("fare_amount").alias("AverageFare"))
res = (
    fare_by_vendor
    .where(col("AverageFare") > 10)
    .orderBy(col("AverageFare").desc())
)
# Fetch at most five rows to the driver.
res.take(5)

[Row(VendorID=None, AverageFare=28.830541636814186),
 Row(VendorID=1, AverageFare=13.217967034800125),
 Row(VendorID=2, AverageFare=12.054380250700097)]

In [27]:
# Create the external Delta table taxidb.YellowTaxis at an explicit LOCATION.
# IF NOT EXISTS keeps this cell re-runnable: once the table is registered in the
# metastore, a plain CREATE TABLE fails because the table already exists.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS taxidb.YellowTaxis (
    RideId INT,
    VendorId INT,
    PickupTime TIMESTAMP,
    DropTime TIMESTAMP,
    PickupLocationId INT,
    DropLocationId INT,
    CabNumber STRING,
    DriverLicenseNumber STRING,
    PassengerCount INT,
    TripDistance DOUBLE,
    RatecodeId INT,
    PaymentType INT,
    TotalAmount DOUBLE,
    FareAmount DOUBLE,
    Extra DOUBLE,
    MtaTax DOUBLE,
    TipAmount DOUBLE,
    TollsAmount DOUBLE,
    ImprovementSurcharge DOUBLE
) USING DELTA
LOCATION "{work_dir}/data/yellowTaxis.delta"
""")

In [28]:
# Insert a single ride, then count the rows with that RideId.
# NOTE(review): INSERT INTO appends, so every re-run of this cell adds another
# row with RideId 9999995 and the count returned below grows accordingly.
spark.sql("""
INSERT INTO taxidb.yellowtaxis (RideId, VendorId, PickupTime, DropTime, PickupLocationId,
    DropLocationId, CabNumber, DriverLicenseNumber, PassengerCount, TripDistance,
    RatecodeId, PaymentType, TotalAmount, FareAmount, Extra, 
    MtaTax, TipAmount, TollsAmount, ImprovementSurcharge)
VALUES (9999995, 1, '2019-11-01T00:00:00.000Z', '2019-11-01T00:02:23.573Z', 65, 
    71, 'TAC304', '453987', 2, 4.5, 
    1, 1, 20.34, 15.0, 0.5, 
    0.4, 2.0, 2.0, 1.1)
""")
spark.sql("SELECT count(RideId) AS count FROM taxidb.YellowTaxis WHERE RideId = 9999995")

count
1


### Generated Columns
> Generated columns are a special type of column whose values are automatically generated based on a user-specified function over other
> columns in the Delta table. When you write to a Delta table with generated columns
> and don’t explicitly provide values for them, Delta Lake automatically computes the values.

* 델타 테이블이 자동 생성해서 넣어주는 컬럼 값을 말하며, delta 최신 버전 (spark 3.5)이 필요하며, 현재 컨테이너에서는 사용할 수 없음

```sql
-- GENERATED 키워드로 기본값 함수를 제공
CREATE TABLE taxidb.YellowTaxis (
    RideId INT COMMENT 'This is our primary Key column',
    VendorId INT,
    PickupTime TIMESTAMP,
    PickupYear INT GENERATED ALWAYS AS(YEAR (PickupTime)),
    PickupMonth INT GENERATED ALWAYS AS(MONTH (PickupTime)),
    PickupDay INT GENERATED ALWAYS AS(DAY (PickupTime)),
    DropTime TIMESTAMP,
    CabNumber STRING COMMENT 'Official Yellow Cab Number'
) USING DELTA
LOCATION "/mnt/datalake/book/chapter03/YellowTaxis.delta"
COMMENT 'Table to store Yellow Taxi data'

-- 아래와 같이 필요한 값만 넣어주면 된다
INSERT INTO taxidb.YellowTaxis (RideId, VendorId, PickupTime, DropTime, CabNumber)
VALUES (5, 101, '2021-7-1T8:43:28UTC+3', '2021-7-1T8:43:28UTC+3', '51-986')
```

* 단, 아래와 같이 비결정적인 함수는 사용할 수 없다

```sql
CREATE OR REPLACE TABLE default.dummy (
    ID STRING GENERATED ALWAYS AS (UUID()),
    Name STRING
) USING DELTA

-- Error: Found uuid(). A generated column cannot use a non deterministic expression.
-- The following are likewise not allowed in a generated column expression:
--   * User-defined functions
--   * Aggregate functions
--   * Window functions
--   * Functions returning multiple rows
```