In [2]:
from pathlib import Path

import polars as pl

# Root directory of the users Delta table; matches the "./delta/users" path
# that every cell in this notebook reads from and writes to.
delta_path = "./delta/users"

In [3]:
# Seed the Delta table: load the source Parquet file, then write it out as a
# Delta table in overwrite mode.
users_parquet = pl.read_parquet("./data/users.parquet")
users_parquet.write_delta("./delta/users", mode="overwrite")

In [4]:
# Read the Delta table back to confirm the initial load.
df = pl.read_delta("./delta/users")
df

user_id,name,age,score
i64,str,i64,f64
1,"""Alice""",25,87.5
2,"""Bob""",32,91.2
3,"""Charlie""",29,78.4
4,"""David""",41,85.9
5,"""Eve""",36,92.3


# Mode Append - Append New Row to Delta Table

> Saves newly generated data

#### Traditional Method
1. Read all existing data.
2. Add the new data and rewrite the *entire* dataset.

→ Result: Original Parquet1: 5 ROWS, New Parquet2: 7 ROWS

#### Delta Lake Method
1. Read the table's current state from the Delta Log (schema and latest version); the existing data files are not read or rewritten.
2. Create a *new* Parquet file containing *only* the new data.
3. Update the Delta Log.

→ Result: Original Parquet1: 5 ROWS, New Parquet2: 2 ROWS

In [5]:
# Two new users to append. The columns are listed in a different order than in
# the table (name before user_id); the recorded output of the appended file
# further down shows them in the table's column order.
df_new_row = pl.DataFrame({
    "name": ["Sumin", "Lee"],
    "user_id": [6, 7],
    "age": [29, 31],
    "score": [88.5, 92.0],
})

In [6]:
# Append mode: add the new rows to the existing Delta table.
df_new_row.write_delta("./delta/users", mode="append")

In [7]:
# The part file written by the append gets a fresh random name on every run,
# so a hard-coded file name breaks on Restart & Run All. Pick the most
# recently written Parquet file in the table directory instead.
newest_part_file = max(
    Path("./delta/users").glob("*.parquet"),
    key=lambda part_file: part_file.stat().st_mtime,
)
created_parquet = pl.read_parquet(newest_part_file)

In [8]:
# Show the contents of the part file created by the append.
created_parquet

user_id,name,age,score
i64,str,i64,f64
6,"""Sumin""",29,88.5
7,"""Lee""",31,92.0


In [9]:
# Reading the table as a whole returns the original rows plus the appended ones.
result_df = pl.read_delta("./delta/users")
result_df

user_id,name,age,score
i64,str,i64,f64
6,"""Sumin""",29,88.5
7,"""Lee""",31,92.0
1,"""Alice""",25,87.5
2,"""Bob""",32,91.2
3,"""Charlie""",29,78.4
4,"""David""",41,85.9
5,"""Eve""",36,92.3


# Mode Overwrite - Overwrite Delta Table with Filtered Data

> Overwrites all existing data.

This method can be used to delete or modify data when using a Copy-on-Write strategy (without Deletion Vectors).

#### How It Works
1. Read all existing data.
2. Create a new Apache Arrow (in memory) table with the filtered data (age > 30).
3. Overwrite the old data with the new Parquet file(s).

#### READ DELTA

```
BEFORE OVERWRITE
READ VERSION 0
READ VERSION 1 <-- append 2 rows

RESULT: 7 ROWS

---------------------------

AFTER OVERWRITE
SKIP VERSION 0
SKIP VERSION 1
READ VERSION 2 <-- overwrite with filtered rows

RESULT: ONLY ROWS WHERE age > 30
```

In [10]:
# Time travel: version=0 loads the table's first version. In the recorded
# output that is the five original rows, so the two appended users are not
# part of the data filtered in the next cell.
df = pl.read_delta("delta/users", version=0)
df

user_id,name,age,score
i64,str,i64,f64
1,"""Alice""",25,87.5
2,"""Bob""",32,91.2
3,"""Charlie""",29,78.4
4,"""David""",41,85.9
5,"""Eve""",36,92.3


In [11]:
# Keep only the users older than 30.
older_than_30 = pl.col("age") > 30
filtered_df = df.filter(older_than_30)
filtered_df

user_id,name,age,score
i64,str,i64,f64
2,"""Bob""",32,91.2
4,"""David""",41,85.9
5,"""Eve""",36,92.3


In [12]:
# Overwrite mode: the filtered rows become the table's new contents.
filtered_df.write_delta("delta/users", mode="overwrite")

In [13]:
# The overwrite created a new version backed by a new Parquet file, so a plain
# read now returns only the filtered rows.
result_df = pl.read_delta("delta/users")
result_df

user_id,name,age,score
i64,str,i64,f64
2,"""Bob""",32,91.2
4,"""David""",41,85.9
5,"""Eve""",36,92.3


# Mode Merge - Merge (Upsert) Source Rows into the Delta Table

> This operation performs a merge between the source (df) and the target (files).

This may look like the same operation as `overwrite`, but when the table is split across multiple files or partitions, `merge` only rewrites the ones that contain affected rows instead of replacing the whole table.

For example, if the table consists of the data files `part_00000`, `part_00001`, and `part_00002`, a merge that only touches rows in `part_00001` writes a new file for just that part, while `part_00000` and `part_00002` continue to be referenced as-is.

#### MERGE ACTIONS

```python
df.write_delta(
    "./delta/users", # target
    mode="merge",
    delta_merge_options={
        "predicate": "target.user_id = source.user_id",
        "source_alias": "source",
        "target_alias": "target"
    }
).action().execute()  # action() is a placeholder for one or more of the merge actions listed below
```

- **source**: New data (DataFrame)
- **target**: Existing data (Delta Table)

- `when_not_matched_by_source_delete`: Deletes target rows that are not in the source.
- `when_not_matched_by_source_update`: Updates target rows that are not in the source.
- `when_matched_update_all`: Updates every matching target row with the values from the source (if this clause is omitted, matching rows are left unchanged).
- `when_not_matched_insert_all`: Inserts rows from the source that do not match any target row based on the predicate.
- `when_matched_delete`: Deletes matching rows.

In [14]:
import polars as pl


def clean_df():
    """Reset the users Delta table to its original contents.

    Overwrites ``./delta/users`` with the rows from ``./data/users.parquet``
    and reads the table back, so each merge example can start from the same
    table.

    :return: The freshly reset Delta table as a Polars DataFrame.
    """
    original_users = pl.read_parquet("./data/users.parquet")
    original_users.write_delta("./delta/users", mode="overwrite")
    reset_table = pl.read_delta("./delta/users")
    return reset_table

### ACTION - WHEN MATCHED UPDATE ALL

In [15]:
# Reset the table to the original users before running the merge.
df = clean_df()

In [16]:
# Source rows for the merge: user_id 1 and 4 already exist in the target table,
# user_id 10 does not. Note that the source has no `score` column.
new_users_df = pl.DataFrame({
    "user_id": [1, 10, 4],
    "name": ["Alice2", "Bob2", "Charlie2"],
    "age": [30, 25, 40]
})

new_users_df

user_id,name,age
i64,str,i64
1,"""Alice2""",30
10,"""Bob2""",25
4,"""Charlie2""",40


In [17]:
# Target table before the merge, for comparison with the result below.
df

user_id,name,age,score
i64,str,i64,f64
1,"""Alice""",25,87.5
2,"""Bob""",32,91.2
3,"""Charlie""",29,78.4
4,"""David""",41,85.9
5,"""Eve""",36,92.3


In [18]:
# Source and target rows are paired on user_id.
merge_options = {
    "predicate": "target.user_id = source.user_id",
    "source_alias": "source",
    "target_alias": "target",
}
merger = new_users_df.write_delta(
    "./delta/users",
    mode="merge",
    delta_merge_options=merge_options,
)
# Only rows that match on user_id are updated. Target rows without a match are
# kept as they are, and source rows without a match (user_id 10) are not
# inserted because there is no insert clause.
merger.when_matched_update_all().execute()

{'num_source_rows': 3,
 'num_target_rows_inserted': 0,
 'num_target_rows_updated': 2,
 'num_target_rows_deleted': 0,
 'num_target_rows_copied': 3,
 'num_output_rows': 5,
 'num_target_files_scanned': 1,
 'num_target_files_skipped_during_scan': 0,
 'num_target_files_added': 1,
 'num_target_files_removed': 1,
 'execution_time_ms': 46,
 'scan_time_ms': 13,
 'rewrite_time_ms': 0}

In [19]:
# Read the table back to see the effect of the merge.
result_df = pl.read_delta("./delta/users")
result_df

user_id,name,age,score
i64,str,i64,f64
1,"""Alice2""",30,87.5
4,"""Charlie2""",40,85.9
2,"""Bob""",32,91.2
3,"""Charlie""",29,78.4
5,"""Eve""",36,92.3


### ACTION - WHEN NOT MATCHED INSERT

In [20]:
# Start again from the original table.
df = clean_df()

# Source rows for the merge: user_id 1 and 4 exist in the target, user_id 10 does not.
new_users_df = pl.DataFrame({
    "user_id": [1, 10, 4],
    "name": ["Alice2", "Bob2", "Charlie2"],
    "age": [30, 25, 40]
})

new_users_df


user_id,name,age
i64,str,i64
1,"""Alice2""",30
10,"""Bob2""",25
4,"""Charlie2""",40


In [21]:
# Target table before the merge, for comparison with the result below.
df

user_id,name,age,score
i64,str,i64,f64
1,"""Alice""",25,87.5
2,"""Bob""",32,91.2
3,"""Charlie""",29,78.4
4,"""David""",41,85.9
5,"""Eve""",36,92.3


In [22]:
# Source and target rows are paired on user_id.
merge_options = {
    "predicate": "target.user_id = source.user_id",
    "source_alias": "source",
    "target_alias": "target",
}
merger = new_users_df.write_delta(
    "./delta/users",
    mode="merge",
    delta_merge_options=merge_options,
)
# Inserts the source row that has no match in the target (user_id 10). Since
# there is no update clause, the matched rows (user_id 1 and 4) keep their
# existing names.
merger.when_not_matched_insert_all().execute()

{'num_source_rows': 3,
 'num_target_rows_inserted': 1,
 'num_target_rows_updated': 0,
 'num_target_rows_deleted': 0,
 'num_target_rows_copied': 0,
 'num_output_rows': 1,
 'num_target_files_scanned': 1,
 'num_target_files_skipped_during_scan': 0,
 'num_target_files_added': 1,
 'num_target_files_removed': 0,
 'execution_time_ms': 7,
 'scan_time_ms': 1,
 'rewrite_time_ms': 0}

In [23]:
# Read the table back to see the effect of the merge; the inserted row has no
# score because the source has no `score` column.
result_df = pl.read_delta("./delta/users")
result_df

user_id,name,age,score
i64,str,i64,f64
10,"""Bob2""",25,
1,"""Alice""",25,87.5
2,"""Bob""",32,91.2
3,"""Charlie""",29,78.4
4,"""David""",41,85.9
5,"""Eve""",36,92.3


### ACTION - WHEN NOT MATCHED BY SOURCE UPDATE

In [24]:
# Start again from the original table.
df = clean_df()

# Source rows for the merge: user_id 1 and 4 exist in the target, user_id 10 does not.
new_users_df = pl.DataFrame({
    "user_id": [1, 10, 4],
    "name": ["Alice2", "Bob2", "Charlie2"],
    "age": [30, 25, 40]
})

new_users_df

user_id,name,age
i64,str,i64
1,"""Alice2""",30
10,"""Bob2""",25
4,"""Charlie2""",40


In [25]:
# Target table before the merge, for comparison with the result below.
df

user_id,name,age,score
i64,str,i64,f64
1,"""Alice""",25,87.5
2,"""Bob""",32,91.2
3,"""Charlie""",29,78.4
4,"""David""",41,85.9
5,"""Eve""",36,92.3


In [26]:
# Source and target rows are paired on user_id.
merge_options = {
    "predicate": "target.user_id = source.user_id",
    "source_alias": "source",
    "target_alias": "target",
}
merger = new_users_df.write_delta(
    "./delta/users",
    mode="merge",
    delta_merge_options=merge_options,
)
# The update value is an expression, so a string literal must be wrapped in
# single quotes.
unknown_name = {"name": "'Unknown'"}
# Sets the name to 'Unknown' for target rows whose user_id is not in the source
# (user_id 2, 3 and 5). Because there is no 'when_not_matched_insert_all'
# clause, the source-only row (user_id 10) is not inserted.
merger.when_not_matched_by_source_update(updates=unknown_name).execute()

{'num_source_rows': 3,
 'num_target_rows_inserted': 0,
 'num_target_rows_updated': 3,
 'num_target_rows_deleted': 0,
 'num_target_rows_copied': 2,
 'num_output_rows': 5,
 'num_target_files_scanned': 1,
 'num_target_files_skipped_during_scan': 0,
 'num_target_files_added': 1,
 'num_target_files_removed': 1,
 'execution_time_ms': 7,
 'scan_time_ms': 1,
 'rewrite_time_ms': 0}

In [27]:
# Read the table back to see the effect of the merge.
result_df = pl.read_delta("./delta/users")
result_df

user_id,name,age,score
i64,str,i64,f64
1,"""Alice""",25,87.5
4,"""David""",41,85.9
2,"""Unknown""",32,91.2
3,"""Unknown""",29,78.4
5,"""Unknown""",36,92.3


### Feature - Set `name` to 'Unknown' for target rows missing from the source and UPSERT the source rows (update user_id 1 & 4, insert user_id 10)

In [28]:
# Start again from the original table.
df = clean_df()

# Source rows for the merge: user_id 1 and 4 exist in the target, user_id 10 does not.
new_users_df = pl.DataFrame({
    "user_id": [1, 10, 4],
    "name": ["Alice2", "Bob2", "Charlie2"],
    "age": [30, 25, 40]
})

new_users_df

user_id,name,age
i64,str,i64
1,"""Alice2""",30
10,"""Bob2""",25
4,"""Charlie2""",40


In [29]:
# Target table before the merge, for comparison with the result below.
df

user_id,name,age,score
i64,str,i64,f64
1,"""Alice""",25,87.5
2,"""Bob""",32,91.2
3,"""Charlie""",29,78.4
4,"""David""",41,85.9
5,"""Eve""",36,92.3


In [30]:
# Source and target rows are paired on user_id.
merge_options = {
    "predicate": "target.user_id = source.user_id",
    "source_alias": "source",
    "target_alias": "target",
}
merger = new_users_df.write_delta(
    "./delta/users",
    mode="merge",
    delta_merge_options=merge_options,
)
# Three clauses in one merge:
#   - target rows missing from the source get the name 'Unknown',
#   - source rows missing from the target are inserted,
#   - rows present on both sides are updated from the source.
(
    merger
    .when_not_matched_by_source_update(updates={"name": "'Unknown'"})
    .when_not_matched_insert_all()
    .when_matched_update_all()
    .execute()
)

{'num_source_rows': 3,
 'num_target_rows_inserted': 1,
 'num_target_rows_updated': 5,
 'num_target_rows_deleted': 0,
 'num_target_rows_copied': 0,
 'num_output_rows': 6,
 'num_target_files_scanned': 1,
 'num_target_files_skipped_during_scan': 0,
 'num_target_files_added': 1,
 'num_target_files_removed': 1,
 'execution_time_ms': 8,
 'scan_time_ms': 1,
 'rewrite_time_ms': 0}

In [31]:
# Read the table back to see the combined effect of the three merge clauses.
result_df = pl.read_delta("./delta/users")
result_df

user_id,name,age,score
i64,str,i64,f64
10,"""Bob2""",25,
1,"""Alice2""",30,87.5
4,"""Charlie2""",40,85.9
2,"""Unknown""",32,91.2
3,"""Unknown""",29,78.4
5,"""Unknown""",36,92.3
