# Writing to a Delta Table

In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON

from delta import *

# Common data locations
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
# IPython shell capture: `!pwd` yields a list of output lines, so element 0 is the current directory
work_dir=!pwd
work_dir = work_dir[0]
# Spark SQL warehouse directory, placed under the notebook's working directory
warehouse_dir = f"{work_dir}/spark-warehouse"

# Create spark session with hive enabled
# Only the builder is configured here (Delta SQL extension, Delta catalog, Hive catalog,
# Asia/Seoul session time zone); the session itself is created in the next cell.
builder = (
    SparkSession
    .builder
    .appName("pyspark-notebook")
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.sql.warehouse.dir", warehouse_dir)
    .enableHiveSupport()
)

In [2]:
# The session must be built through `configure_spark_with_delta_pip` so that the Delta Lake dependencies are loaded correctly
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:

# Configure the notebook to render DataFrames as tables
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # display enabled
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100) # display output columns size

# Tuning for the local environment
spark.conf.set("spark.sql.shuffle.partitions", 5) # the number of partitions to use when shuffling data for joins or aggregations.
spark.conf.set("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")
# Bare last expression so the notebook displays the session object
spark

In [4]:
def sql(queries, num_rows=20):
    """Run one or more ';'-separated SQL statements and print each result.

    Args:
        queries: SQL text; several statements may be separated by ';'.
        num_rows: Maximum number of rows printed per statement.

    Blank fragments (for example the one left after a trailing ';') are
    skipped instead of being sent to Spark as an empty statement.

    Note: the split is purely textual, so a ';' inside a string literal
    still splits the statement.
    """
    for query in queries.split(";"):
        statement = query.strip()
        if not statement:
            # Nothing to execute between/after separators.
            continue
        spark.sql(statement).show(num_rows, truncate=False)

def ls(command):
    """Run `ls -al` through the IPython shell escape, with `command` interpolated as its argument."""
    !ls -al {command}

def cat(filename):
    """Print a file by running `cat` on `filename` through the IPython shell escape."""
    !cat {filename}

def grep(keyword, filename):
    """Run a case-insensitive `grep` (`-i`) for `keyword` in `filename` through the IPython shell escape."""
    !grep -i {keyword} {filename}

def grep_and_json(keyword, filename):
    """Grep `filename` for `keyword` and pipe the matching lines to `python -m json.tool`.

    Unlike `grep` above, this search is case-sensitive (no `-i` flag).
    """
    !grep {keyword} {filename} | python -m json.tool

In [5]:
# Make taxidb the current database, then list its tables
spark.sql("use taxidb")
spark.sql("show tables")

namespace,tableName,isTemporary
taxidb,greentaxis,False
taxidb,yellowtaxis,False


### Cleaning Out the YellowTaxis Table

In [6]:
# Row count of the greentaxis table
spark.sql("select count(1) from taxidb.greentaxis")

count(1)
450627


In [7]:
# Average fare per VendorID, keeping only vendors whose average exceeds 10,
# ordered by the second select column (AverageFare) descending, at most 5 rows
spark.sql("""
SELECT VendorID, AVG(fare_amount) AS AverageFare
FROM taxidb.greentaxis
GROUP BY VendorID
HAVING AVG(fare_amount) > 10
ORDER BY 2 DESC
LIMIT 5
""")

VendorID,AverageFare
,28.830541636814186
1.0,13.217967034800123
2.0,12.054380250700095


In [8]:
# Reading a Table with PySpark
# Load taxidb.greentaxis as a DataFrame and print its row count with thousands separators
df = spark.read.format("delta").table("taxidb.greentaxis")
print(f"Number of records: {df.count():,}")

Number of records: 450,627


In [9]:
# Average fare per VendorID via the DataFrame API: keep averages above 10,
# highest average first, then fetch up to five rows to the driver.
res = (
    df.groupBy("VendorID")
    .agg(avg("fare_amount").alias("AverageFare"))
    .where(col("AverageFare") > 10)
    .orderBy(col("AverageFare").desc())
)
res.take(5)

[Row(VendorID=None, AverageFare=28.830541636814186),
 Row(VendorID=1, AverageFare=13.217967034800125),
 Row(VendorID=2, AverageFare=12.054380250700097)]

In [15]:
# Row count of the YellowTaxis table, printed through the sql() helper
sql("select count(1) from taxidb.YellowTaxis")

+--------+
|count(1)|
+--------+
|2       |
+--------+



In [13]:
# Create the YellowTaxis Delta table at an explicit LOCATION under work_dir.
# IF NOT EXISTS makes re-running this cell safe when the table is already present.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS taxidb.YellowTaxis (
    RideId INT,
    VendorId INT,
    PickupTime TIMESTAMP,
    DropTime TIMESTAMP,
    PickupLocationId INT,
    DropLocationId INT,
    CabNumber STRING,
    DriverLicenseNumber STRING,
    PassengerCount INT,
    TripDistance DOUBLE,
    RatecodeId INT,
    PaymentType INT,
    TotalAmount DOUBLE,
    FareAmount DOUBLE,
    Extra DOUBLE,
    MtaTax DOUBLE,
    TipAmount DOUBLE,
    TollsAmount DOUBLE,
    ImprovementSurcharge DOUBLE
) USING DELTA
LOCATION "{work_dir}/data/yellowTaxis.delta"
""")

In [14]:
# Insert one row (RideId 9999995, VendorId 1) and count the rows carrying that RideId.
# INSERT INTO appends, so every re-run of this cell adds another copy of the row.
spark.sql("""
INSERT INTO taxidb.yellowtaxis (RideId, VendorId, PickupTime, DropTime, PickupLocationId,
    DropLocationId, CabNumber, DriverLicenseNumber, PassengerCount, TripDistance,
    RatecodeId, PaymentType, TotalAmount, FareAmount, Extra, 
    MtaTax, TipAmount, TollsAmount, ImprovementSurcharge)
VALUES (9999995, 1, '2019-11-01T00:00:00.000Z', '2019-11-01T00:02:23.573Z', 65, 
    71, 'TAC304', '453987', 2, 4.5, 
    1, 1, 20.34, 15.0, 0.5, 
    0.4, 2.0, 2.0, 1.1)
""")
spark.sql("SELECT count(RideId) AS count FROM taxidb.YellowTaxis WHERE RideId = 9999995")

count
2


### Appending a DataFrame to a Table


In [17]:
# The table path is created as soon as the table is created
spark.sql(f"""
CREATE TABLE IF NOT EXISTS taxidb.YellowTaxis_append (
    RideId INT,
    VendorId INT,
    PickupTime TIMESTAMP,
    DropTime TIMESTAMP,
    PickupLocationId INT,
    DropLocationId INT,
    CabNumber STRING,
    DriverLicenseNumber STRING,
    PassengerCount INT,
    TripDistance DOUBLE,
    RatecodeId INT,
    PaymentType INT,
    TotalAmount DOUBLE,
    FareAmount DOUBLE,
    Extra DOUBLE,
    MtaTax DOUBLE,
    TipAmount DOUBLE,
    TollsAmount DOUBLE,
    ImprovementSurcharge DOUBLE
) USING DELTA
LOCATION "{work_dir}/data/yellowTaxis_append.delta"
""")

# Insert one row with VendorId 2 into the append-source table and count the VendorId = 2 rows.
# INSERT INTO appends, so every re-run of this cell adds another copy of the row.
spark.sql("""
INSERT INTO taxidb.yellowtaxis_append (RideId, VendorId, PickupTime, DropTime, PickupLocationId,
    DropLocationId, CabNumber, DriverLicenseNumber, PassengerCount, TripDistance,
    RatecodeId, PaymentType, TotalAmount, FareAmount, Extra, 
    MtaTax, TipAmount, TollsAmount, ImprovementSurcharge)
VALUES (9999995, 2, '2019-11-01T00:00:00.000Z', '2019-11-01T00:02:23.573Z', 65, 
    71, 'TAC304', '453987', 2, 4.5, 
    1, 1, 20.34, 15.0, 0.5, 
    0.4, 2.0, 2.0, 1.1)
""")
spark.sql("SELECT count(RideId) AS count FROM taxidb.yellowtaxis_append WHERE VendorId = 2")

count
1


In [18]:
# Load the append-source table as a DataFrame and print its schema
df_for_append = spark.read.format("delta").table("taxidb.yellowtaxis_append")
df_for_append.printSchema()

root
 |-- RideId: integer (nullable = true)
 |-- VendorId: integer (nullable = true)
 |-- PickupTime: timestamp (nullable = true)
 |-- DropTime: timestamp (nullable = true)
 |-- PickupLocationId: integer (nullable = true)
 |-- DropLocationId: integer (nullable = true)
 |-- CabNumber: string (nullable = true)
 |-- DriverLicenseNumber: string (nullable = true)
 |-- PassengerCount: integer (nullable = true)
 |-- TripDistance: double (nullable = true)
 |-- RatecodeId: integer (nullable = true)
 |-- PaymentType: integer (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- FareAmount: double (nullable = true)
 |-- Extra: double (nullable = true)
 |-- MtaTax: double (nullable = true)
 |-- TipAmount: double (nullable = true)
 |-- TollsAmount: double (nullable = true)
 |-- ImprovementSurcharge: double (nullable = true)



In [22]:
# Append the rows of df_for_append to the Delta files behind taxidb.yellowtaxis
# (written by path), then re-read the table by name to confirm the new row count.
(
    df_for_append.write
    .format("delta")
    .mode("append")
    .save(f"{work_dir}/data/yellowTaxis.delta")
)
spark.read.format("delta").table("taxidb.yellowtaxis").count()

4

In [23]:
# Print the table contents after the append
sql("select * from taxidb.yellowtaxis")

+-------+--------+-------------------+-----------------------+----------------+--------------+---------+-------------------+--------------+------------+----------+-----------+-----------+----------+-----+------+---------+-----------+--------------------+
|RideId |VendorId|PickupTime         |DropTime               |PickupLocationId|DropLocationId|CabNumber|DriverLicenseNumber|PassengerCount|TripDistance|RatecodeId|PaymentType|TotalAmount|FareAmount|Extra|MtaTax|TipAmount|TollsAmount|ImprovementSurcharge|
+-------+--------+-------------------+-----------------------+----------------+--------------+---------+-------------------+--------------+------------+----------+-----------+-----------+----------+-----+------+---------+-----------+--------------------+
|9999995|1       |2019-11-01 09:00:00|2019-11-01 09:02:23.573|65              |71            |TAC304   |453987             |2             |4.5         |1         |1          |20.34      |15.0      |0.5  |0.4   |2.0      |2.0        |1.

### Using the OverWrite Mode When Writing to a Delta Table

In [None]:
# The code below needs a data download first, so testing it is deferred

# Appending a DataFrame to a Table @ https://drive.google.com/file/d/1--SfboYb-KyEug4U89m2pnjq0MyUWbvJ/view?usp=sharing
yellowTaxi = spark.read.format("delta").table("taxidb.YellowTaxis")
yellowTaxiSchema = yellowTaxi.schema
print(yellowTaxiSchema)

# The source is a CSV file, so it must be read with the CSV reader (the Parquet
# reader cannot parse it and ignores the "header" option). The target table's
# schema is applied so the columns get the table's types instead of all strings.
df_for_append = (
    spark.read
    .option("header", True)
    .schema(yellowTaxiSchema)
    .csv("/mnt/datalake/book/data files/YellowTaxis_append.csv")
)

In [29]:
# Load the yellow-taxi Parquet file from the work directory and print its schema
df = spark.read.format("parquet").load(f"{work_dir}/data/yellowTaxis.parquet")
df.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [30]:
# Preview the first three rows without truncating column values
df.show(3, truncate=False)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|1       |2021-01-01 09:30:10 |2021-01-01 09:36:12  |1.0            |2.1          |1.0       |N                 |142         |43          |2           |8.0        |3.0  |0.5    |0.0      

In [37]:
# Keep only the trips of vendor 1 from pickup location 142 to drop-off location 43
filtered = df.filter("VendorID = 1 and PULocationID = 142 and DOLocationID = 43")

In [40]:
# Show one sample row, then the number of filtered trips per payment_type
filtered.show(1, truncate=False)
filtered.groupBy("payment_type").count()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|1       |2021-01-01 09:30:10 |2021-01-01 09:36:12  |1.0            |2.1          |1.0       |N                 |142         |43          |2           |8.0        |3.0  |0.5    |0.0      

payment_type,count
4,3
0,19
3,8
2,143
1,604


In [41]:
# Show the filtered trips whose payment_type is 3
filtered.filter("payment_type = 3").show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2021-01-11 01:25:02|  2021-01-11 01:26:18|            1.0|          0.4|       1.0|                 N|         142|          43|           3|        3.5|  3.5|    0.5|       0.

In [42]:
# Take the payment_type = 3 trips and replace payment_type with the literal 30.
# The schema printed below reports the replaced column as a non-nullable integer,
# whereas the source column is a nullable long.
payment_type_3 = filtered.filter("payment_type = 3")
withColumn = payment_type_3.withColumn("payment_type", lit(30))
withColumn.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [43]:
# Show the rows after payment_type was replaced with 30
withColumn.show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2021-01-11 01:25:02|  2021-01-11 01:26:18|            1.0|          0.4|       1.0|                 N|         142|          43|          30|        3.5|  3.5|    0.5|       0.

In [None]:
# Inserting Data with the SQL COPY INTO Command
# The SELECT ... FROM '<path>' sub-clause must be enclosed as a whole in the
# parentheses after COPY INTO ... FROM; closing the parenthesis before the path is
# a syntax error.
# NOTE(review): COPY INTO appears to be a Databricks SQL command - confirm it is
# available in the runtime this notebook targets.
spark.sql("""
COPY INTO taxidb.yellowtaxis
FROM (
    SELECT RideId::Int
    , VendorId::Int
    , PickupTime::Timestamp
    , DropTime::Timestamp
    , PickupLocationId::Int
    , DropLocationId::Int
    , CabNumber::String
    , DriverLicenseNumber::String
    , PassengerCount::Int
    , TripDistance::Double
    , RateCodeId::Int
    , PaymentType::Int
    , TotalAmount::Double
    , FareAmount::Double
    , Extra::Double
    , MtaTax::Double
    , TipAmount::Double
    , TollsAmount::Double
    , ImprovementSurcharge::Double
    FROM '/mnt/datalake/book/DataFiles/YellowTaxisLargeAppend.csv'
)
FILEFORMAT = CSV
FORMAT_OPTIONS ("header" = "true")
""")

spark.sql("SELECT COUNT(*) FROM taxidb.YellowTaxis")

### Partitions
> These use cases lend themselves well to partitioning. Partitioning your data to align
with your query patterns can dramatically speed up query performance, especially
when combined with other performance optimizations, such as Z-ordering. A Delta
table partition is composed of a folder with a subset of data rows that share the same
value for one or more column(s).

In [None]:
# Create the partitioned Delta table up front
# (the column list needs its opening parenthesis; without it the statement does not parse)
spark.sql("""
CREATE TABLE taxidb.YellowTaxisPartitioned (
    RideId INT,
    VendorId INT,
    PickupTime TIMESTAMP,
    DropTime TIMESTAMP,
    PickupLocationId INT,
    DropLocationId INT,
    CabNumber STRING,
    DriverLicenseNumber STRING,
    PassengerCount INT,
    TripDistance DOUBLE,
    RatecodeId INT,
    PaymentType INT,
    TotalAmount DOUBLE,
    FareAmount DOUBLE,
    Extra DOUBLE,
    MtaTax DOUBLE,
    TipAmount DOUBLE,
    TollsAmount DOUBLE,
    ImprovementSurcharge DOUBLE
) USING DELTA
PARTITIONED BY(VendorId)
LOCATION "/mnt/datalake/book/chapter03/YellowTaxisDeltaPartitioned"
""")

In [None]:
# Read the existing taxidb.YellowTaxis table
# NOTE(review): the original note called this a managed table, but the notebook creates it with an explicit LOCATION - confirm
df = spark.read.format("delta").table("taxidb.YellowTaxis")
# and overwrite the Delta table stored at the partitioned location with it
df.write.format("delta").mode("overwrite").save("/mnt/datalake/book/chapter03/YellowTaxisDeltaPartitioned")

In [None]:
# List the distinct VendorId values present in the partitioned table
spark.sql("SELECT DISTINCT(VendorId) FROM taxidb.YellowTaxisPartitioned")

In [None]:
# Partitioning by multiple columns -- Partition by VendorId AND rateCodeId
# (the column list needs its opening parenthesis; without it the statement does not parse)
# NOTE(review): this reuses the table name and LOCATION of the single-column
# partitioned table; presumably that table and its directory must be removed
# first, since there is no IF NOT EXISTS here - confirm before running.
spark.sql("""
CREATE TABLE taxidb.YellowTaxisPartitioned (
    RideId INT,
    VendorId INT,
    PickupTime TIMESTAMP,
    DropTime TIMESTAMP,
    PickupLocationId INT,
    DropLocationId INT,
    CabNumber STRING,
    DriverLicenseNumber STRING,
    PassengerCount INT,
    TripDistance DOUBLE,
    RatecodeId INT,
    PaymentType INT,
    TotalAmount DOUBLE,
    FareAmount DOUBLE,
    Extra DOUBLE,
    MtaTax DOUBLE,
    TipAmount DOUBLE,
    TollsAmount DOUBLE,
    ImprovementSurcharge DOUBLE
) USING DELTA
PARTITIONED BY(VendorId, RatecodeId)
LOCATION "/mnt/datalake/book/chapter03/YellowTaxisDeltaPartitioned"
""")

In [None]:
# Returns a single boolean column: true when at least one row has VendorId = 2 and RateCodeId = 99
spark.sql("SELECT COUNT(*) > 0 AS `Partition exists` FROM taxidb.YellowTaxisPartitioned WHERE VendorId = 2 AND RateCodeId = 99")

### Selectively updating Delta partitions with replaceWhere

In [None]:
# Look at up to five rows for VendorID = 1 and RatecodeId = 99 to see their PaymentType values
spark.sql("SELECT RideId, VendorId, PaymentType FROM taxidb.yellowtaxispartitioned WHERE VendorID = 1 AND RatecodeId = 99 LIMIT 5")

In [None]:
# Let’s assume that we have a business reason that states that all PaymentTypes for
# VendorId = 1 and RatecodeId = 99 should be 3. We can use the following PySpark
# expression with replaceWhere to achieve that result

# Build the replacement rows first: the rows matching the condition, with the
# already existing PaymentType column set to 3.
# (`col` and `lit` come from the pyspark.sql.functions import at the top of the notebook.)
businessCondition = (
    spark.read.format("delta")
    .load("/mnt/datalake/book/chapter03/YellowTaxisDeltaPartitioned")
    .where((col("VendorId") == 1) & (col("RatecodeId") == 99))
    .withColumn("PaymentType", lit(3))
)

# Overwrite with that DataFrame, restricted by the replaceWhere predicate,
# which uses the same condition the rows were selected with
replaceWhere = (
    businessCondition.write.format("delta")
    .option("replaceWhere", "VendorId = 1 AND RateCodeId = 99")
    .mode("overwrite")
)
replaceWhere.save("/mnt/datalake/book/chapter03/YellowTaxisDeltaPartitioned")

In [None]:
# List the distinct PaymentType values for VendorID = 1 and RatecodeId = 99
spark.sql("""
SELECT DISTINCT(PaymentType)
FROM taxidb.yellowtaxispartitioned
WHERE VendorID = 1 AND RatecodeId = 99
""")

### User-Defined Metadata
> Let’s look at the SparkSession configuration first. Assume that we have an INSERT
operation, to which we want to assign a GDPR tag for auditing purposes.

In [None]:
# Using SparkSession to Set Custom Metadata
# This must be a plain (non-f) string: in an f-string the JSON braces
# `{ "GDPR": ... }` are parsed as a replacement field with a format spec and
# raise ValueError before the statement ever reaches Spark.
spark.sql("""
SET spark.databricks.delta.commitInfo.userMetadata=my-custom-metadata= { "GDPR": "INSERT Request 1x965383" }
""")

# The GDPR tag will automatically be applied to the commit info in the transaction log.
spark.sql("""
INSERT INTO taxidb.yellowtaxisPartitioned (RideId, VendorId, PickupTime, DropTime,
    PickupLocationId, DropLocationId, CabNumber,
    DriverLicenseNumber, PassengerCount, TripDistance,
    RatecodeId, PaymentType, TotalAmount,
    FareAmount, Extra, MtaTax, TipAmount,
    TollsAmount, ImprovementSurcharge)
VALUES(10000000, 3, '2019-11-01T00:00:00.000Z',
    '2019-11-01T00:02:23.573Z', 65, 71, 'TAC304',
    '453987', 2, 4.5, 1, 1, 20.34, 15.0, 0.5,
    0.4, 2.0, 2.0, 1.1)
""")

In [None]:
# Using the DataFrameWriter to Set Custom Metadata
# Configure an append writer carrying the userMetadata option, then write it
# to the partitioned table's location.
customMetadata = (
    df.write
    .format("delta")
    .mode("append")
    .option("userMetadata", '{"PII": "Confidential XYZ"}')
)
customMetadata.save("/mnt/datalake/book/chapter03/YellowTaxisDeltaPartitioned")

### Conclusion
> 기본적인 연산 수행 시에, 다양한 방식(SQL DDL/DML, DataFrameWriter API, DeltaTable Builder API 등)으로 수행이 가능하며
테이블 생성 시에도 경로를 직접 명시하여 비관리 테이블로 사용할 수도 있고, 테이블 수준으로 관리 테이블로 활용이 가능하다
파티셔닝을 통해 관리 효율성 및 성능 향상을 기대할 수 있으며, replaceWhere 같은 구문을 통해서 특정 파티션 혹은 조건의 데이터를 효과적으로 업데이트 할 수 있다
또한 이용자 정의 메타데이터를 활용하여 특정 태그를 가진 statements 혹은 operations 목록을 확인할 수 있고, 또한 감사 혹은 규제를 목적으로 활용할 수도 있다 