In [1]:
import os
import glob
import json
from helpers.paths import PathMerger
from pyspark.sql import SparkSession

In [2]:
# Session settings: fetch the Delta Lake package, enable its SQL extension
# and catalog, and pin the session time zone to UTC.
spark_settings = {
    "spark.jars.packages": "io.delta:delta-core_2.12:1.0.0",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.sql.session.timeZone": "UTC",
}

# Apply the settings in declaration order, then create (or reuse) the session.
builder = SparkSession.builder.appName("VacuumApp")
for setting_key, setting_value in spark_settings.items():
    builder = builder.config(setting_key, setting_value)
spark = builder.getOrCreate()


# This cannot be imported before initializing the SparkSession.
from delta import DeltaTable

## Load Table

In [3]:
# Params
db, table = "devices", "device_models"
all_pks = ["id"]

# Init
# NOTE(review): PathMerger is a project helper; it presumably derives the
# storage path (`pm.bronze`) and the table identifier (`pm.hive`) from the
# database and table names -- confirm against helpers.paths.
pm = PathMerger(db, table)

# Create the bronze database if it is not there yet
spark.sql("CREATE DATABASE IF NOT EXISTS bronze")

# Even on Windows, Spark SQL requires a POSIX path with /-symbol as path separator.
abs_path = os.path.abspath(pm.bronze).replace("\\", "/")

# Register the existing Delta files at `abs_path` as a table.
# IF NOT EXISTS keeps this cell re-runnable: without it, a second execution
# fails because the table is already registered. It also matches the
# CREATE DATABASE statement above.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {pm.hive}
  USING DELTA
  LOCATION '{abs_path}'
""")

DataFrame[]

In [4]:
# Preview the current contents of the registered table.
select_all_query = f"SELECT * FROM {pm.hive}"
spark.sql(select_all_query).show()

+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+------------+
|      dms_timestamp| id|release_date|            name|color|description|            created|           modified|src_batch_id|
+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+------------+
|2021-09-11 11:39:29|  2|  2010-05-15|Super Gadget 100|Black|update id 2|2010-03-21 12:00:02|2021-09-11 11:39:29|           0|
|2021-09-11 11:37:17|  6|  2021-12-31|Super Gadget 300| Pink| new device|2021-09-11 11:37:17|2021-09-11 11:37:17|           0|
|2021-09-11 11:37:47|  1|  2010-05-15|Super Gadget 100|  Red|update id 1|2010-03-21 12:00:01|2021-09-11 11:37:47|           0|
|2021-09-11 11:30:04|  4|  2018-05-13|Super Gadget 200|White|lorem ipsum|2018-03-20 12:01:01|2018-03-20 12:01:01|        null|
|2021-09-11 11:30:04|  3|  2010-11-01|Super Gadget 100| Pink|lorem ipsum|2010-08-05 07:00:00|2010-08-05 07:00:0

# Vacuum Configuration

By default, the VACUUM operation deletes files that are no longer referenced by the table and are older than 168 hours (7 days). Setting a retention duration shorter than this raises an exception unless we disable `retentionDurationCheck` in the configuration.

In [5]:
# Turn off Delta's retention safety check so that a VACUUM with a retention
# window shorter than the default is accepted instead of raising.
retention_check_key = "spark.databricks.delta.retentionDurationCheck.enabled"
spark.conf.set(retention_check_key, "false")

# Check what files we have

### According to the table itself

Notice that there is one rogue file. It is not part of the `SELECT *` query. If you were to read the file directly, you would notice that it is empty.

In [6]:
# Ask Spark which physical file each row was read from (one row per distinct file).
files_in_use = spark.sql(f"SELECT DISTINCT input_file_name() as path FROM {pm.hive}").collect()

# Name of the directory this notebook is running from
cwd = os.path.basename(os.getcwd())

# Paths of the files backing the query, relative to the working directory
files_currently_used = []

for row in files_in_use:
    # Spark reports "/"-separated paths regardless of the OS
    segments = row.path.split("/")

    # Position of the working directory within the path
    idx = segments.index(cwd)

    # Keep everything below the working directory, joined with the OS separator
    # so the result is comparable with what glob returns
    files_currently_used.append(os.path.sep.join(segments[idx + 1:]))

In [7]:
print("[INFO] Mark files that are NOT part of the query: ")

# Every parquet file under the table directory, at any depth
parquet_pattern = pm.bronze + os.sep + "**/*.parquet"

for f in glob.glob(parquet_pattern, recursive=True):
    # Files that back the current query stay unmarked
    if f in files_currently_used:
        check = ' '
    else:
        check = 'x'
    print(f"[{check}] {f}")

[INFO] Mark files that are NOT part of the query: 
[x] S3\bronze\abc\devices\device_models\part-00000-5a0ae9da-f510-4400-8377-9a1ed88cb259-c000.parquet
[x] S3\bronze\abc\devices\device_models\part-00000-d5cf549d-bdb9-4d07-9eb6-a84aeb5a4aaa-c000.snappy.parquet
[ ] S3\bronze\abc\devices\device_models\part-00045-2193ab89-242b-413d-811c-9bd3f6d19bdc-c000.snappy.parquet
[ ] S3\bronze\abc\devices\device_models\part-00069-78a281b9-e5c6-4d90-abf1-3a9fe31afb59-c000.snappy.parquet
[ ] S3\bronze\abc\devices\device_models\part-00107-513e811f-e296-4c9b-89f8-c204a3198269-c000.snappy.parquet
[ ] S3\bronze\abc\devices\device_models\part-00128-2d0c9196-a633-4efb-9645-758b76739c26-c000.snappy.parquet
[ ] S3\bronze\abc\devices\device_models\part-00140-8a607eac-4b5f-4b41-bab3-98e085012dac-c000.snappy.parquet


### According to the latest commit in delta log

Notice that the log also lists the empty file as part of this Delta table version.

In [8]:
# Locate the most recent commit file. The max is taken over the path strings,
# which picks the newest commit as long as the file names are zero-padded.
delta_log_dir = os.path.join(pm.bronze, '_delta_log')
all_json_paths = glob.glob(delta_log_dir + os.sep + '*.json')
latest_json_path = max(all_json_paths)

# Data-file paths collected from the commit, grouped by the recorded action
files_added_in_log = []
files_removed_in_log = []

print(f"[INFO] Opening: {os.path.basename(latest_json_path)}")

with open(latest_json_path, 'r') as log_file:
    # The commit file holds one JSON document per line
    for log_line in log_file:
        action = json.loads(log_line)

        # Route the file path of each 'add' / 'remove' action to its list
        for action_name, collected in (('add', files_added_in_log),
                                       ('remove', files_removed_in_log)):
            if action_name in action:
                collected.append(os.path.join(pm.bronze, action[action_name]['path']))

[INFO] Opening: 00000000000000000001.json


In [9]:
print("[INFO] Mark files that were removed from delta table during commit: ")

# Every parquet file under the table directory, at any depth
parquet_pattern = pm.bronze + os.sep + "**/*.parquet"

for f in glob.glob(parquet_pattern, recursive=True):
    # Flag only the files the latest commit recorded as removed
    if f in files_removed_in_log:
        check = 'x'
    else:
        check = ' '

    # File size in KB, two decimals
    size = round(os.path.getsize(f) / 1024, 2)
    file_name = os.path.basename(f)
    print(f"[{check}] {file_name:<69} {size} KB")

[INFO] Mark files that were removed from delta table during commit: 
[x] part-00000-5a0ae9da-f510-4400-8377-9a1ed88cb259-c000.parquet          2.51 KB
[ ] part-00000-d5cf549d-bdb9-4d07-9eb6-a84aeb5a4aaa-c000.snappy.parquet   0.98 KB
[ ] part-00045-2193ab89-242b-413d-811c-9bd3f6d19bdc-c000.snappy.parquet   2.66 KB
[ ] part-00069-78a281b9-e5c6-4d90-abf1-3a9fe31afb59-c000.snappy.parquet   2.66 KB
[ ] part-00107-513e811f-e296-4c9b-89f8-c204a3198269-c000.snappy.parquet   2.62 KB
[ ] part-00128-2d0c9196-a633-4efb-9645-758b76739c26-c000.snappy.parquet   2.68 KB
[ ] part-00140-8a607eac-4b5f-4b41-bab3-98e085012dac-c000.snappy.parquet   2.63 KB


## Check what files VACUUM would delete

Note that we will perform a dry run. It will not delete any files; it only shows what would be deleted if the `DRY RUN` clause were removed.

In [10]:
# DRY RUN only reports the files a real VACUUM would delete; nothing is removed.
vacuum_preview = spark.sql(f"VACUUM {pm.hive} RETAIN 24 HOURS DRY RUN").collect()
dry_run_files = [row.path for row in vacuum_preview]

In [11]:
print("[INFO] The files that would be deleted by VACUUM")

# Work out the offset for each path inside this cell instead of reusing a loop
# variable left behind by an earlier cell. That variable is undefined if the
# earlier loop never ran, and wrong if these paths carry a different prefix.
workdir_name = os.path.basename(os.getcwd())

for f in dry_run_files:
    parts = f.split("/")

    if workdir_name in parts:
        # Keep only the components below the working directory
        f_path = os.path.sep.join(parts[parts.index(workdir_name) + 1:])
    else:
        # Path lies outside the working directory: show it unchanged
        f_path = f

    print(f_path)

[INFO] The files that would be deleted by VACUUM
S3\bronze\abc\devices\device_models\part-00000-5a0ae9da-f510-4400-8377-9a1ed88cb259-c000.parquet
