# Deleting Data from a Delta Table

In [1]:
import os

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON

from delta import *

# Common data locations
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
# Read the kernel's working directory directly instead of shelling out to `pwd`;
# this keeps `work_dir` a plain string from the start rather than first binding
# it to the shell-output list and then rebinding it to its first element.
work_dir = os.getcwd()
warehouse_dir = f"{work_dir}/spark-warehouse"

# Create spark session with hive enabled
# Only the builder is configured here; nothing in this statement calls getOrCreate().
builder = (
    SparkSession
    .builder
    .appName("pyspark-notebook")
    # Session time zone used by Spark SQL.
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    # Delta Lake SQL extension and catalog classes.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): this setting presumably overlaps with enableHiveSupport() below — confirm.
    .config("spark.sql.catalogImplementation", "hive")
    # Warehouse directory computed above as "<work_dir>/spark-warehouse".
    .config("spark.sql.warehouse.dir", warehouse_dir)
    .enableHiveSupport()
)

In [2]:
# When creating a Delta Lake session, the builder must be passed through
# `configure_spark_with_delta_pip` so that the Delta dependencies are loaded correctly.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:

# Settings for rendering DataFrames as tables in the notebook
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # display enabled
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100) # display output columns size

# Tuning for the local environment
spark.conf.set("spark.sql.shuffle.partitions", 5) # the number of partitions to use when shuffling data for joins or aggregations.
spark.conf.set("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")
# Bare expression: the session object is the cell's displayed output.
spark

In [4]:
def sql(queries, num_rows = 20):
    """Run one or more semicolon-separated SQL statements and print each result.

    Args:
        queries: SQL text; individual statements are separated by ``;``.
        num_rows: Maximum number of rows printed per statement (passed to ``show``).

    Blank fragments (a trailing ``;``, ``;;``, or whitespace-only text) are
    skipped instead of being sent to ``spark.sql`` as empty statements.
    The split is purely textual, so a ``;`` inside a SQL string literal also
    splits the statement.
    """
    for query in queries.split(";"):
        statement = query.strip()
        if not statement:
            # Nothing to execute between two separators or after the last one.
            continue
        spark.sql(statement).show(num_rows, truncate=False)

def ls(command):
    """Run ``ls -al`` through the notebook shell and show its output.

    Args:
        command: Text interpolated after ``ls -al`` (e.g. a path or glob pattern).
    """
    !ls -al {command}

def cat(filename):
    """Print a file's contents by running ``cat`` through the notebook shell.

    Args:
        filename: Path interpolated after ``cat``.
    """
    !cat {filename}

def grep(keyword, filename):
    """Print the lines of ``filename`` that match ``keyword``, ignoring case (``grep -i``).

    Args:
        keyword: Pattern interpolated into the ``grep`` command.
        filename: Path interpolated into the ``grep`` command.
    """
    !grep -i {keyword} {filename}

def grep_and_json(keyword, filename):
    !grep {keyword} {filename} | python -m json.tool

def grep_sed_json(keyword, lineno, filename):
    """Pretty-print, as JSON, a single matching line of ``filename``.

    The pipeline greps for ``keyword``, keeps only line number ``lineno`` of the
    matches (``sed -n {lineno}p``), and formats it with ``python -m json.tool``.

    Args:
        keyword: Pattern interpolated into the ``grep`` command (case-sensitive).
        lineno: 1-based position, among the matching lines, of the line to keep.
        filename: Path interpolated into the ``grep`` command.
    """
    !grep {keyword} {filename} | sed -n {lineno}p | python -m json.tool

In [5]:
# List the databases, switch to taxidb, and list its tables.
sql("show databases ; use taxidb ; show tables")

+---------+
|namespace|
+---------+
|default  |
|taxidb   |
+---------+

++
||
++
++

+---------+------------------+-----------+
|namespace|tableName         |isTemporary|
+---------+------------------+-----------+
|taxidb   |greentaxis        |false      |
|taxidb   |yellowtaxis       |false      |
|taxidb   |yellowtaxis_append|false      |
+---------+------------------+-----------+



In [6]:
# Row count of greentaxis (resolved against the current database).
sql("select count(1) from greentaxis")

+--------+
|count(1)|
+--------+
|450625  |
+--------+



In [9]:
# List the JSON commit files in the greentaxis _delta_log directory.
ls("/home/jovyan/work/spark-warehouse/taxidb.db/greentaxis/_delta_log/*.json")

-rwxrwxrwx 1 jovyan 1000  2346 Aug 28 05:07 /home/jovyan/work/spark-warehouse/taxidb.db/greentaxis/_delta_log/00000000000000000000.json
-rwxrwxrwx 1 jovyan 1000 15695 Aug 28 05:08 /home/jovyan/work/spark-warehouse/taxidb.db/greentaxis/_delta_log/00000000000000000001.json
-rwxrwxrwx 1 jovyan 1000  4163 Aug 28 07:47 /home/jovyan/work/spark-warehouse/taxidb.db/greentaxis/_delta_log/00000000000000000002.json


In [12]:
# Show the table history, then the column description (up to 30 rows each).
sql("describe history greentaxis ; describe greentaxis", 30)

+-------+-----------------------+------+--------+---------------------------------+------------------------------------------------------------------------------------------------------------------------+----+--------+---------+-----------+--------------+-------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp              |userId|userName|operation                        |operationParameters                                                                                                     |job |notebook|clusterId|readVersion|isolationLevel|isBlindAppend|operationMetrics                                                                                                                                                                    |userMetadata|engineInfo                         |


In [21]:
# Show which data file (INPUT_FILE_NAME()) holds each row with VendorId = 2 and RatecodeID = 6.
sql("""
SELECT
    INPUT_FILE_NAME() as filename_record_is_located,
    VendorId,
    RatecodeID,
    DOLocationID,
    passenger_count
FROM
    taxidb.greentaxis
WHERE
    VendorId = 2 and RatecodeID = 6
""")

+-------------------------------------------------------------------------------------------------------------------------------+--------+----------+------------+---------------+
|filename_record_is_located                                                                                                     |VendorId|RatecodeID|DOLocationID|passenger_count|
+-------------------------------------------------------------------------------------------------------------------------------+--------+----------+------------+---------------+
|file:/home/jovyan/work/spark-warehouse/taxidb.db/greentaxis/part-00005-b7668c11-3775-4b26-8cce-6ca59b79a44c-c000.snappy.parquet|2       |6         |193         |1              |
|file:/home/jovyan/work/spark-warehouse/taxidb.db/greentaxis/part-00006-cb453028-6940-422f-8622-596782a99a75-c000.snappy.parquet|2       |6         |55          |1              |
+--------------------------------------------------------------------------------------------------------

In [22]:
# Set passenger_count to 10 for rows with VendorId = 2, RatecodeID = 6 and DOLocationID = 193.
sql("""
UPDATE taxidb.greentaxis
SET passenger_count = 10
WHERE
    VendorId = 2 and RatecodeID = 6 and DOLocationID = 193
""")

++
||
++
++



In [25]:
# List the commit files, dump commit 00000000000000000003.json, and pretty-print
# its first line containing "add".
# The location is derived from `warehouse_dir` instead of repeating a hard-coded
# absolute path three times.
delta_log_dir = f"{warehouse_dir}/taxidb.db/greentaxis/_delta_log"
commit_file = f"{delta_log_dir}/00000000000000000003.json"
ls(f"{delta_log_dir}/*.json")
cat(commit_file)
grep_sed_json("add", 1, commit_file)

-rwxrwxrwx 1 jovyan 1000  2346 Aug 28 05:07 /home/jovyan/work/spark-warehouse/taxidb.db/greentaxis/_delta_log/00000000000000000000.json
-rwxrwxrwx 1 jovyan 1000 15695 Aug 28 05:08 /home/jovyan/work/spark-warehouse/taxidb.db/greentaxis/_delta_log/00000000000000000001.json
-rwxrwxrwx 1 jovyan 1000  4163 Aug 28 07:47 /home/jovyan/work/spark-warehouse/taxidb.db/greentaxis/_delta_log/00000000000000000002.json
-rwxrwxrwx 1 jovyan 1000  2332 Aug 28 08:26 /home/jovyan/work/spark-warehouse/taxidb.db/greentaxis/_delta_log/00000000000000000003.json
{"remove":{"path":"part-00005-b7668c11-3775-4b26-8cce-6ca59b79a44c-c000.snappy.parquet","deletionTimestamp":1724833589468,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":1075685}}
{"add":{"path":"part-00000-b43f38af-57c8-415c-ada4-c17fb8195df6-c000.snappy.parquet","partitionValues":{},"size":1076081,"modificationTime":1724833588815,"dataChange":true,"stats":"{\"numRecords\":46029,\"minValues\":{\"VendorId\":1,\"lpep_pickup_dat

In [23]:
# Row count, extended table description, and table history (up to 50 rows each).
sql("select count(1) from taxidb.greentaxis ; describe extended greentaxis ; describe history greentaxis", 50)

+--------+
|count(1)|
+--------+
|450627  |
+--------+

+----------------------------+----------------------------------------------------------------+---------------+
|col_name                    |data_type                                                       |comment        |
+----------------------------+----------------------------------------------------------------+---------------+
|VendorId                    |int                                                             |Ride Vendor    |
|lpep_pickup_datetime        |string                                                          |               |
|lpep_dropoff_datetime       |string                                                          |               |
|store_and_fwd_flag          |string                                                          |               |
|RatecodeID                  |int                                                             |Ref to RateCard|
|PULocationID                |int               

In [25]:
# List the JSON commit files in the greentaxis _delta_log directory.
ls("/home/jovyan/work/spark-warehouse/taxidb.db/greentaxis/_delta_log/*.json")

-rwxrwxrwx 1 jovyan 1000  2346 Aug 28 05:07 /home/jovyan/work/spark-warehouse/taxidb.db/greentaxis/_delta_log/00000000000000000000.json
-rwxrwxrwx 1 jovyan 1000 15695 Aug 28 05:08 /home/jovyan/work/spark-warehouse/taxidb.db/greentaxis/_delta_log/00000000000000000001.json


### Performing the DELETE Operation

In [28]:
# Look up RideId 9999995 in taxidb.YellowTaxis (up to 50 rows shown).
sql("""SELECT
RideId,
VendorId,
CabNumber,
TotalAmount
FROM
taxidb.YellowTaxis
WHERE
RideId = 9999995""", 50)

+-------+--------+---------+-----------+
|RideId |VendorId|CabNumber|TotalAmount|
+-------+--------+---------+-----------+
|9999995|1       |TAC304   |20.34      |
|9999995|1       |TAC304   |20.34      |
|9999995|2       |TAC304   |20.34      |
|9999995|2       |TAC304   |20.34      |
+-------+--------+---------+-----------+



In [34]:
# Describe the table, then count rows per (VendorId, RatecodeID) pair.
sql("describe greentaxis")
sql("select VendorId, RatecodeID, count(1) from greentaxis group by VendorId, RatecodeID")

+---------------------+---------+---------------+
|col_name             |data_type|comment        |
+---------------------+---------+---------------+
|VendorId             |int      |Ride Vendor    |
|lpep_pickup_datetime |string   |               |
|lpep_dropoff_datetime|string   |               |
|store_and_fwd_flag   |string   |               |
|RatecodeID           |int      |Ref to RateCard|
|PULocationID         |int      |               |
|DOLocationID         |int      |               |
|passenger_count      |int      |               |
|trip_distance        |double   |               |
|fare_amount          |double   |               |
|extra                |double   |               |
|mta_tax              |double   |               |
|tip_amount           |double   |               |
|tolls_amount         |double   |               |
|ehail_fee            |string   |               |
|improvement_surcharge|double   |               |
|total_amount         |double   |               |


In [37]:
# Show the rows with vendorid = 1 and ratecodeid = 6, then the table history.
sql("select * from greentaxis where vendorid = 1 and ratecodeid = 6 ; describe history greentaxis")

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorId|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|1       |2019-12-12 14:01:31 |2019-12-12 14:14:25  |N                 |6         |243         |244         |1              |1.4          |3.5        |0.0  |0.5   

In [38]:
# Delete the rows with vendorid = 1 and ratecodeid = 6, then show the table history.
sql("delete from greentaxis where vendorid = 1 and ratecodeid = 6 ; describe history greentaxis")

++
||
++
++

+-------+-----------------------+------+--------+---------------------------------+------------------------------------------------------------------------------------------------------------------------+----+--------+---------+-----------+--------------+-------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp              |userId|userName|operation                        |operationParameters                                                                                                     |job |notebook|clusterId|readVersion|isolationLevel|isBlindAppend|operationMetrics                                                                                                                                                                    |userMetadata|engineInfo              

In [43]:
# The two most recently updated files are the newly created file blocks
ls("./spark-warehouse/taxidb.db/greentaxis/*.parquet")

-rwxrwxrwx 1 jovyan 1000 1013050 Aug 28 07:47 ./spark-warehouse/taxidb.db/greentaxis/part-00000-5ea81464-7c10-4158-8a76-48e0df3b1723-c000.snappy.parquet
-rwxrwxrwx 1 jovyan 1000  930701 Aug 28 05:08 ./spark-warehouse/taxidb.db/greentaxis/part-00000-f7d57c66-f63d-4b6f-b173-431f4b70c855-c000.snappy.parquet
-rwxrwxrwx 1 jovyan 1000 1074475 Aug 28 05:08 ./spark-warehouse/taxidb.db/greentaxis/part-00001-0fd55311-d8bc-486d-9b83-8ece15716a3a-c000.snappy.parquet
-rwxrwxrwx 1 jovyan 1000  917751 Aug 28 07:47 ./spark-warehouse/taxidb.db/greentaxis/part-00001-457f8738-0fe5-47c6-88fa-d4eb8d67a36a-c000.snappy.parquet
-rwxrwxrwx 1 jovyan 1000 1013002 Aug 28 05:08 ./spark-warehouse/taxidb.db/greentaxis/part-00002-98ee1bf8-630d-48c1-accc-737ae1f617fc-c000.snappy.parquet
-rwxrwxrwx 1 jovyan 1000 1068779 Aug 28 05:08 ./spark-warehouse/taxidb.db/greentaxis/part-00003-921bdcd3-8744-42b2-93de-d4cb31a5f600-c000.snappy.parquet
-rwxrwxrwx 1 jovyan 1000  934450 Aug 28 05:08 ./spark-warehouse/taxidb.db/greentax

In [46]:
# List the JSON commit files in the greentaxis _delta_log directory (path relative to the working directory).
ls("./spark-warehouse/taxidb.db/greentaxis/_delta_log/*.json")

-rwxrwxrwx 1 jovyan 1000  2346 Aug 28 05:07 ./spark-warehouse/taxidb.db/greentaxis/_delta_log/00000000000000000000.json
-rwxrwxrwx 1 jovyan 1000 15695 Aug 28 05:08 ./spark-warehouse/taxidb.db/greentaxis/_delta_log/00000000000000000001.json
-rwxrwxrwx 1 jovyan 1000  4163 Aug 28 07:47 ./spark-warehouse/taxidb.db/greentaxis/_delta_log/00000000000000000002.json


In [50]:
# We can see that the add and remove actions each occurred twice
cat("./spark-warehouse/taxidb.db/greentaxis/_delta_log/00000000000000000002.json")

{"remove":{"path":"part-00002-98ee1bf8-630d-48c1-accc-737ae1f617fc-c000.snappy.parquet","deletionTimestamp":1724831229056,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":1013002}}
{"remove":{"path":"part-00007-619c7fd9-f812-4882-a16c-360c22f70798-c000.snappy.parquet","deletionTimestamp":1724831229056,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":917764}}
{"add":{"path":"part-00000-5ea81464-7c10-4158-8a76-48e0df3b1723-c000.snappy.parquet","partitionValues":{},"size":1013050,"modificationTime":1724831228405,"dataChange":true,"stats":"{\"numRecords\":46035,\"minValues\":{\"VendorId\":1,\"lpep_pickup_datetime\":\"2008-12-31 22:34:56\",\"lpep_dropoff_datetime\":\"2008-12-31 22:42:10\",\"store_and_fwd_flag\":\"N\",\"RatecodeID\":1,\"PULocationID\":1,\"DOLocationID\":1,\"passenger_count\":0,\"trip_distance\":-10.75,\"fare_amount\":-60.0,\"extra\":-1.0,\"mta_tax\":-0.5,\"tip_amount\":-90.5,\"tolls_amount\":0.0,\"improvement_surcharge\":-0.3