In [1]:
import findspark
findspark.init()

import os
print(os.environ['JAVA_HOME'])
print(os.environ['SPARK_HOME'])

/usr/lib/jvm/java-11-openjdk-amd64
/usr/local/spark


In [2]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON

from delta import *

# 공통 데이터 위치
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
work_dir=!pwd
work_dir = work_dir[0]
warehouse_dir = f"{work_dir}/spark-warehouse"

# Create spark session with hive enabled
builder = (
    SparkSession
    .builder
    .appName("pyspark-notebook")
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.sql.warehouse.dir", warehouse_dir)
    .enableHiveSupport()
)

In [3]:
# 델타 레이크 생성시에 반드시 `configure_spark_with_delta_pip` 구성을 통해 실행되어야 정상적인 델타 의존성이 로딩됩니다
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [4]:
# 노트북에서 테이블 형태로 데이터 프레임 출력을 위한 설정을 합니다
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # display enabled
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100) # display output columns size

# 로컬 환경 최적화
spark.conf.set("spark.sql.shuffle.partitions", 5) # the number of partitions to use when shuffling data for joins or aggregations.
spark.conf.set("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")
spark.conf.set("spark.sql.decimalOperations.allowPrecisionLoss", "true")
spark

In [5]:
def show(queries, num_rows = 20):
    for query in queries.split(";"):
        spark.sql(query).show(num_rows, truncate=False)

def sql(query):
    return spark.sql(query)

def ls(command):
    !ls -al {command}

def cat(filename):
    !cat {filename}

def grep(keyword, filename):
    !grep -i {keyword} {filename}

def grep_and_json(keyword, filename):
    !grep {keyword} {filename} | python -m json.tool

def grep_sed_json(keyword, lineno, filename):
    !grep {keyword} {filename} | sed -n {lineno}p | python -m json.tool

In [11]:
duplicated_rate_card = spark.read.csv("data/duplicatedRateCard.csv")
duplicated_rate_card.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)



In [26]:
dup = duplicated_rate_card.where(expr("_c0 < 7"))
not_dup = duplicated_rate_card.where(expr("_c0 >= 7"))

In [27]:
not_dup.write.mode("overwrite").partitionBy("_c0").saveAsTable("default.rate_card_v1")

In [28]:
spark.sql("select * from default.rate_card_v1").show()
spark.sql("show tables")

+-----+---+
|  _c1|_c0|
+-----+---+
|Seoul|  7|
|Pusan|  8|
+-----+---+



namespace,tableName,isTemporary
default,rate_card_v1,False
default,users,False
default,users_cluster_by,False


In [29]:
dup.distinct().write.mode("append").partitionBy("_c0").saveAsTable("default.rate_card_v1")

In [30]:
spark.sql("select * from default.rate_card_v1").show()
spark.sql("show tables")

+--------------------+---+
|                 _c1|_c0|
+--------------------+---+
|Nassau or Westche...|  4|
|     Negotiated fare|  5|
|       Standard Rate|  1|
|          Group ride|  6|
|              Newark|  3|
|               Seoul|  7|
|               Pusan|  8|
|                 JFK|  2|
+--------------------+---+



namespace,tableName,isTemporary
default,rate_card_v1,False
default,users,False
default,users_cluster_by,False


In [31]:
spark.sql("select * from default.rate_card_v1 order by _c0 asc")

_c1,_c0
Standard Rate,1
JFK,2
Newark,3
Nassau or Westchester,4
Negotiated fare,5
Group ride,6
Seoul,7
Pusan,8


In [5]:
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType

def dropAndRemoveTable(dbName, tableName):
    location="/home/jovyan/work/spark-warehouse/{}".format(tableName)
    !rm -rf {location}
    query("DROP TABLE IF EXISTS {}.{}".format(dbName, tableName))

    
def createFamilyMembers(dbName, tableName):
    # 예제 데이터 생성 및 히스토리 테스트
    tableSchema = StructType([
        StructField("id", IntegerType(), True),
        StructField("firstName", StringType(), True),
        StructField("middleName", StringType(), True),
        StructField("lastName", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("birthDate", StringType(), True),
        StructField("ssn", StringType(), True),
        StructField("salary", IntegerType(), True)
    ])

    tableRows = []
    tableRows.append(Row(1, "suhyuk", "psyoblade", "park", "male", "2000/10/30", "741030", 1000))
    tableRows.append(Row(2, "youngmi", "kiki", "kim", "female", "2004/08/08", "770808", 2000))
    tableRows.append(Row(3, "sowon", "eva", "park", "female", "2005/05/20", "040520", 3000))
    tableRows.append(Row(4, "sihun", "sean", "park", "male", "2006/01/14", "080114", 4000))

    df = spark.createDataFrame(tableRows, tableSchema)
    df.write.format("delta").mode("overwrite").partitionBy("salary").saveAsTable("{}.{}".format(dbName, tableName))


dropAndRemoveTable("default", "users")
createFamilyMembers("default", "users")

In [6]:
query("select * from default.users")

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
1,suhyuk,psyoblade,park,male,2000/10/30,741030,1000
2,youngmi,kiki,kim,female,2004/08/08,770808,2000
3,sowon,eva,park,female,2005/05/20,40520,3000
4,sihun,sean,park,male,2006/01/14,80114,4000


In [7]:
def createNewFamilyMembers(dbName, tableName):
    # 예제 데이터 생성 및 히스토리 테스트
    nameSchema = StructType([
        StructField("firstName", StringType(), True),
        StructField("middleName", StringType(), True),
        StructField("lastName", StringType(), True)
    ])
    tableSchema = StructType([
        StructField("id", IntegerType(), True),
        StructField("name", nameSchema, True),
        StructField("gender", StringType(), True),
        StructField("birthDate", StringType(), True),
        StructField("ssn", StringType(), True),
        StructField("salary", IntegerType(), True)
    ])

    tableRows = []
    tableRows.append(Row(1, ("suhyuk", "psyoblade", "park"), "male", "2000/10/30", "741030", 1000))
    tableRows.append(Row(2, ("youngmi", "kiki", "kim"), "female", "2004/08/08", "770808", 2000))
    tableRows.append(Row(3, ("sowon", "eva", "park"), "female", "2005/05/20", "040520", 3000))
    tableRows.append(Row(4, ("sihun", "sean", "park"), "male", "2006/01/14", "080114", 4000))

    df = spark.createDataFrame(tableRows, tableSchema)
    df.write.format("delta").mode("overwrite").partitionBy("salary").saveAsTable("{}.{}".format(dbName, tableName))


dropAndRemoveTable("default", "family")
createNewFamilyMembers("default", "family")

In [8]:
query("select name.firstName, name.lastName from default.family")

firstName,lastName
suhyuk,park
youngmi,kim
sowon,park
sihun,park


## Update table schema
> 구문 중에 표준 `Spark SQL` 구문이 아닌 `Delta lake` 구문의 경우는 동작하지 않기 때문에 `--` 와 같이 `SQL` 코멘트로 표현했습니


### Add columns
```sql
-- 컬럼 추가는 잘 동작함
ALTER TABLE table_name ADD COLUMNS (col_name data_type [COMMENT col_comment] [FIRST|AFTER colA_name], ...)
ALTER TABLE table_name ADD COLUMNS (col_name.nested_col_name data_type [COMMENT col_comment] [FIRST|AFTER colA_name], ...)
```

In [8]:
# ALTER TABLE table_name ADD COLUMNS (col_name data_type [COMMENT col_comment] [FIRST|AFTER colA_name], ...)
query("""
alter table users add columns (description string comment 'description_of_member')
""")

In [9]:
query("describe users")

col_name,data_type,comment
id,int,
firstName,string,
middleName,string,
lastName,string,
gender,string,
birthDate,string,
ssn,string,
salary,int,
description,string,description_of_member
,,


In [10]:
# ALTER TABLE table_name ADD COLUMNS (col_name.nested_col_name data_type [COMMENT col_comment] [FIRST|AFTER colA_name], ...)
query("""
alter table family add columns (name.description string comment 'description_of_name')
""")

In [11]:
query("describe family")

col_name,data_type,comment
id,int,
name,"struct<firstName:string,middleName:string,lastName:string,description:string>",
gender,string,
birthDate,string,
ssn,string,
salary,int,
,,
# Partitioning,,
Part 0,salary,


In [12]:
# https://spark.apache.org/docs/latest/sql-ref-syntax-aux-describe-table.html
query("describe extended family default.family.name")

info_name,info_value
col_name,name
data_type,"struct<firstName:string,middleName:string,lastName:string,description:string>"
comment,


In [24]:
query("show tables").show()
query("desc extended student").show(truncate=False)
query("desc extended student_delta").show(truncate=False)

+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
|  default|       family|      false|
|  default|      student|      false|
|  default|student_delta|      false|
|  default|        users|      false|
+---------+-------------+-----------+

+----------------------------+----------------------------------------------------------+-------+
|col_name                    |data_type                                                 |comment|
+----------------------------+----------------------------------------------------------+-------+
|id                          |int                                                       |null   |
|name                        |string                                                    |null   |
|age                         |int                                                       |null   |
|                            |                                                          |       |
|# Detail

### Change column comment or ordering
```sql
-- 컬럼 타입 변경구분 경우에도 여러 컬럼 드랍이 필요하여 수행이 안 되는 것으로 추정되며 동작하지 않음
-- ALTER TABLE table_name ALTER [COLUMN] col_name col_name data_type [COMMENT col_comment] [FIRST|AFTER colA_name]
-- ALTER TABLE table_name ALTER [COLUMN] col_name.nested_col_name nested_col_name data_type [COMMENT col_comment] [FIRST|AFTER colA_name]
````

### Replace columns
> [Support Spark’s column drop and rename commands #732](https://github.com/delta-io/delta/issues/732) 에 따르면 현재 스파크가 지원하는 컬럼 드랍을 델타 SQL 구문에서 지원하지 않음
```sql
-- 여러 컬럼을 한 번에 드랍하는 기능이 델타 SQL 구문에서 동작하지 않음
ALTER TABLE table_name REPLACE COLUMNS (col_name1 col_type1 [COMMENT col_comment1], ...)
```


In [26]:
# 일반 테이블의 경우 AnalysisException: REPLACE COLUMNS is only supported with v2 tables. 와 같은 오류 발생
query("alter table student replace columns (user_id int, user_name string, user_age long)")

AnalysisException: REPLACE COLUMNS is only supported with v2 tables.

In [27]:
# 델타 테이블의 경우 AnalysisException: DROP COLUMN is not supported for your Delta table. 와 같은 오류 발생
query("alter table student_delta replace columns (user_id int, user_name string, user_age long)")

"""
AnalysisException: DROP COLUMN is not supported for your Delta table. 
Please upgrade your Delta table to reader version 2 and writer version 5
 and change the column mapping mode to 'name' mapping. You can use the following command:

ALTER TABLE <table_name> SET TBLPROPERTIES (
   'delta.columnMapping.mode' = 'name',
   'delta.minReaderVersion' = '2',
   'delta.minWriterVersion' = '5')
"""

AnalysisException: DROP COLUMN is not supported for your Delta table. 
Please upgrade your Delta table to reader version 2 and writer version 5
 and change the column mapping mode to 'name' mapping. You can use the following command:

 ALTER TABLE <table_name> SET TBLPROPERTIES (
   'delta.columnMapping.mode' = 'name',
   'delta.minReaderVersion' = '2',
   'delta.minWriterVersion' = '5')

    

In [30]:
# 아래와 같이 버전을 변경하면 AnalysisException: Cannot drop column from a struct type with a single field: StructType(StructField(age,IntegerType,true)) 오류 발생
query("""
ALTER TABLE student_delta SET TBLPROPERTIES (
   'delta.columnMapping.mode' = 'name',
   'delta.minReaderVersion' = '2',
   'delta.minWriterVersion' = '5')
""")

query("alter table student_delta replace columns (user_id int, user_name string, user_age long)")

AnalysisException: Cannot drop column from a struct type with a single field: StructType(StructField(age,IntegerType,true))

### Rename columns
```sql
-- 일반 컬럼은 낮은 버전에서도 사용 가능 (minReaderVersion=1, minWriterVersion=2)
ALTER TABLE table_name RENAME COLUMN old_col_name TO new_col_name

-- 중첩된 컬럼은 델타레이크 버전을 올려야 사용가능
ALTER TABLE <table_name> SET TBLPROPERTIES ('delta.columnMapping.mode' = 'name', 'delta.minReaderVersion' = '2', 'delta.minWriterVersion' = '5')
ALTER TABLE table_name RENAME COLUMN col_name.old_nested_field TO new_nested_field
```

In [37]:
query("ALTER TABLE student_delta RENAME COLUMN name TO user_name")

In [39]:
# 기본 버전(minRead:1, minWrite:2) 버전에서 이름 변경 시에 AnalysisException: Column rename is not supported for your Delta table. 오류 발생
query("ALTER TABLE family RENAME COLUMN name.middleName TO midName")

"""Please upgrade your Delta table to reader version 2 and writer version 5
 and change the column mapping mode to 'name' mapping. You can use the following command:

 ALTER TABLE <table_name> SET TBLPROPERTIES (
   'delta.columnMapping.mode' = 'name',
   'delta.minReaderVersion' = '2',
   'delta.minWriterVersion' = '5')"""

AnalysisException: Column rename is not supported for your Delta table. 
Please upgrade your Delta table to reader version 2 and writer version 5
 and change the column mapping mode to 'name' mapping. You can use the following command:

 ALTER TABLE <table_name> SET TBLPROPERTIES (
   'delta.columnMapping.mode' = 'name',
   'delta.minReaderVersion' = '2',
   'delta.minWriterVersion' = '5')

    

In [41]:
query("""
ALTER TABLE family SET TBLPROPERTIES (
   'delta.columnMapping.mode' = 'name',
   'delta.minReaderVersion' = '2',
   'delta.minWriterVersion' = '5')
""")
query("ALTER TABLE family RENAME COLUMN name.middleName TO midName")

### Drop columns
```sql
ALTER TABLE table_name DROP COLUMN col_name
-- 하나의 컬럼은 드랍 되지만, 2개 이상 컬럼 드랍을 지원하지 않음
-- ALTER TABLE table_name DROP COLUMNS (col_name_1, col_name_2)
```

In [35]:
query("alter table student_delta drop column age")

In [36]:
query("alter table student_delta drop columns (id, name)")

AnalysisException: Cannot drop column from a struct type with a single field: StructType(StructField(name,StringType,true))

### Change column type or name
```python
# 컬럼 타입의 변경은 API 활용하여 withColumn 및 cast 함수를 활용해야만 합니다
spark.read.table(...) \
  .withColumn("birthDate", col("birthDate").cast("date")) \
  .write \
  .format("delta") \
  .mode("overwrite")
  .option("overwriteSchema", "true") \
  .saveAsTable(...)
```

In [42]:
query("describe extended users")

col_name,data_type,comment
id,int,
firstName,string,
middleName,string,
lastName,string,
gender,string,
birthDate,string,
ssn,string,
salary,int,
description,string,description_of_member
,,


In [54]:
from delta.tables import *

tableName = "users"
deltaUsers = DeltaTable.forName(spark, tableName).toDF()
deltaUsers.printSchema()
users = deltaUsers.withColumn("uid", expr("cast(id as long)")).drop("id").withColumnRenamed("uid", "id")
users.printSchema()

root
 |-- id: integer (nullable = true)
 |-- firstName: string (nullable = true)
 |-- middleName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- birthDate: string (nullable = true)
 |-- ssn: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- description: string (nullable = true)

root
 |-- firstName: string (nullable = true)
 |-- middleName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- birthDate: string (nullable = true)
 |-- ssn: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- description: string (nullable = true)
 |-- id: long (nullable = true)



### 스파크 테이블 관리
> 결국 스파크에서 어떻게 테이블을 관리할 지는 생성 시점의 `USING` 구문의 데이터소스의 종류에 의해 결정나며, 기본 데이터소스는 `Parquet` 입니다.


### 1. 데이터소스 지정을 통해 생성하는 방법
> `USING` 구문이 필수이며, 하이브가 아니라 스파크 독립적인 메타정보를 로컬 환경의 `derby` 엔진을 통해 `metastore` 정보를 관리하는 방식
```sql
CREATE TABLE [ IF NOT EXISTS ] table_identifier
    [ ( col_name1 col_type1 [ COMMENT col_comment1 ], ... ) ]
    USING data_source
    [ OPTIONS ( key1=val1, key2=val2, ... ) ]
    [ PARTITIONED BY ( col_name1, col_name2, ... ) ]
    [ CLUSTERED BY ( col_name3, col_name4, ... ) 
        [ SORTED BY ( col_name [ ASC | DESC ], ... ) ] 
        INTO num_buckets BUCKETS ]
    [ LOCATION path ]
    [ COMMENT table_comment ]
    [ TBLPROPERTIES ( key1=val1, key2=val2, ... ) ]
    [ AS select_statement ]
```

### 2. 하이브 DDL 구문을 통해 생성하는 방법
> 설치 및 기동 시에 hive 관련 정보를 로딩하거나 `SparkSession` 객체생성 시에 `config("spark.sql.catalogImplementation", "hive")` 설정을 하여 `Hive DDL/DML` 구문으로 인식하게 하는 방법
```sql
CREATE [ EXTERNAL ] TABLE [ IF NOT EXISTS ] table_identifier
    [ ( col_name1[:] col_type1 [ COMMENT col_comment1 ], ... ) ]
    [ COMMENT table_comment ]
    [ PARTITIONED BY ( col_name2[:] col_type2 [ COMMENT col_comment2 ], ... ) 
        | ( col_name1, col_name2, ... ) ]
    [ CLUSTERED BY ( col_name1, col_name2, ...) 
        [ SORTED BY ( col_name1 [ ASC | DESC ], col_name2 [ ASC | DESC ], ... ) ] 
        INTO num_buckets BUCKETS ]
    [ ROW FORMAT row_format ]
    [ STORED AS file_format ]
    [ LOCATION path ]
    [ TBLPROPERTIES ( key1=val1, key2=val2, ... ) ]
    [ AS select_statement ]
```

### 3. 이미 존재하는 테이블/뷰의 정의/메타를 활용하여 생성하는 방법
> `LIKE` 구문과 `USING` 구문을 통해 지정한 데이터소스의 메타데이터 및 정의를 통해 테이블을 생성하는 방법
```sql
CREATE TABLE [IF NOT EXISTS] table_identifier LIKE source_table_identifier
    USING data_source
    [ ROW FORMAT row_format ]
    [ STORED AS file_format ]
    [ TBLPROPERTIES ( key1=val1, key2=val2, ... ) ]
    [ LOCATION path ]
```

In [62]:
query("show columns in student")

col_name
id
name
age


In [69]:
query("describe extended users").show()
query("describe detail users").show()
query("describe history users").show()

+--------------------+--------------------+--------------------+
|            col_name|           data_type|             comment|
+--------------------+--------------------+--------------------+
|                  id|                 int|                    |
|           firstName|              string|                    |
|          middleName|              string|                    |
|            lastName|              string|                    |
|              gender|              string|                    |
|           birthDate|              string|                    |
|                 ssn|              string|                    |
|              salary|                 int|                    |
|         description|              string|description_of_me...|
|                    |                    |                    |
|      # Partitioning|                    |                    |
|              Part 0|              salary|                    |
|                    |   

In [17]:
data = spark.range(0,5)
data.write.format("delta").mode("overwrite").save("./foo")

In [3]:
data.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [4]:
createTable="""
CREATE OR REPLACE TABLE default.people_using_sql (
 id INT,
 firstName STRING,
 middleName STRING,
 lastName STRING,
 gender STRING,
 birthDate TIMESTAMP,
 ssn STRING,
 salary INT
)
USING DELTA
      PARTITIONED BY (gender)
"""
print(createTable)


CREATE OR REPLACE TABLE default.people_using_sql (
 id INT,
 firstName STRING,
 middleName STRING,
 lastName STRING,
 gender STRING,
 birthDate TIMESTAMP,
 ssn STRING,
 salary INT
)
USING DELTA
      PARTITIONED BY (gender)



In [5]:
query(createTable)

In [6]:
query("show tables")

namespace,tableName,isTemporary
default,people_using_sql,False


In [12]:
!rm -r /home/jovyan/work/spark-warehouse/foo

In [13]:
!pwd

/home/jovyan/work


In [14]:
createTable="""
CREATE TABLE IF NOT EXISTS delta.`/home/jovyan/work/spark-warehouse/foo` (
 id INT,
 firstName STRING,
 middleName STRING,
 lastName STRING,
 gender STRING,
 birthDate TIMESTAMP,
 ssn STRING,
 salary INT
)
USING DELTA
PARTITIONED BY (gender)
"""
query(createTable)

In [15]:
query("show tables")

namespace,tableName,isTemporary
default,people_using_sql,False


In [16]:
query("CREATE TABLE `default.bad_style` (id INT) USING DELTA")

AnalysisException: `default.bad_style` is not a valid name for tables/databases. Valid names only contain alphabet characters, numbers and _.

In [17]:
query("CREATE TABLE `default`.`good_style` (id INT) USING DELTA")

In [18]:
query("show tables")

namespace,tableName,isTemporary
default,good_style,False
default,people_using_sql,False


In [24]:
builder = (
    DeltaTable.create(spark)
  .tableName("default.events")
  .addColumn("eventId", "BIGINT")
  .addColumn("data", "STRING")
  .addColumn("eventType", "STRING")
  .addColumn("eventTime", "TIMESTAMP")
)

In [25]:
dateTableBuilder = (
    builder.addColumn("eventDate", "DATE", generatedAlwaysAs="CAST(eventTime AS DATE)")
    .partitionedBy("eventType", "eventDate")
)
dateTableBuilder.execute()

<delta.tables.DeltaTable at 0x7f98002af580>

In [27]:
query("desc extended events")

col_name,data_type,comment
eventId,bigint,
data,string,
eventType,string,
eventTime,timestamp,
eventDate,date,
,,
# Partitioning,,
Part 0,eventType,
Part 1,eventDate,
,,


In [32]:
query('SELECT * FROM default.events WHERE eventTime >= "2020-10-01 00:00:00" and eventTime <= "2020-10-01 12:00:00"').explain("extended")

== Parsed Logical Plan ==
'Project [*]
+- 'Filter (('eventTime >= 2020-10-01 00:00:00) AND ('eventTime <= 2020-10-01 12:00:00))
   +- 'UnresolvedRelation [default, events], [], false

== Analyzed Logical Plan ==
eventId: bigint, data: string, eventType: string, eventTime: timestamp, eventDate: date
Project [eventId#4064L, data#4065, eventType#4066, eventTime#4067, eventDate#4068]
+- Filter ((eventTime#4067 >= cast(2020-10-01 00:00:00 as timestamp)) AND (eventTime#4067 <= cast(2020-10-01 12:00:00 as timestamp)))
   +- SubqueryAlias spark_catalog.default.events
      +- Relation default.events[eventId#4064L,data#4065,eventType#4066,eventTime#4067,eventDate#4068] parquet

== Optimized Logical Plan ==
Filter ((((eventDate#4068 >= cast(2020-10-01 00:00:00 as date)) OR isnull((eventDate#4068 >= cast(2020-10-01 00:00:00 as date)))) AND ((eventDate#4068 <= cast(2020-10-01 12:00:00 as date)) OR isnull((eventDate#4068 <= cast(2020-10-01 12:00:00 as date))))) AND ((isnotnull(eventTime#4067) AND (

In [33]:
query("show tables")

namespace,tableName,isTemporary
default,events,False
default,good_style,False
default,people_using_sql,False


In [34]:
query("desc formatted events")

col_name,data_type,comment
eventId,bigint,
data,string,
eventType,string,
eventTime,timestamp,
eventDate,date,
,,
# Partitioning,,
Part 0,eventType,
Part 1,eventDate,
,,


In [37]:
query("show tblproperties events ('Type')")

key,value
Type,MANAGED


In [38]:
query("describe history events")

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,2022-12-29 12:18:56,,,CREATE TABLE,"{isManaged -> true, description -> null, partitionBy -> [""eventType"",""eventDate""], properties -> {}}",,,,,Serializable,True,{},,Apache-Spark/3.2.1 Delta-Lake/2.0.0


In [41]:
query("select * from events VERSION AS of 0")

eventId,data,eventType,eventTime,eventDate


In [6]:
print("{}.{}".format("a", "b"))

a.b


In [30]:
query("describe history users")

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,2023-01-09 08:58:34,,,CREATE OR REPLACE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [""salary""], properties -> {}}",,,,,Serializable,False,"{numFiles -> 4, numOutputRows -> 4, numOutputBytes -> 7980}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


In [31]:
query("select * from users")

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
1,suhyuk,psyoblade,park,male,2000/10/30,741030,1000
2,youngmi,kiki,kim,female,2004/08/08,770808,2000
3,sowon,eva,park,female,2005/05/20,40520,3000
4,sihun,sean,park,male,2006/01/14,80114,4000


In [66]:

appendRows = []
appendRows.append(Row(3, "sowon", "eva", "park", "female", "2005/05/20", "040520", 3000))
df1 = spark.createDataFrame(appendRows, tableSchema)
df1.write.format("delta").mode("append").saveAsTable("default.users")


In [67]:
query("select * from users")

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
1,suhyuk,psyoblade,park,male,2000/10/30,741030,1000
2,youngmi,kiki,kim,female,2004/08/08,770808,2000
3,sowon,eva,park,female,2005/05/20,40520,3000


In [68]:

appendRows = []
appendRows.append(Row(4, "sihun", "sean", "park", "male", "2006/01/14", "080114", 2000))
df2 = spark.createDataFrame(tableRows, tableSchema)
df2.write.format("delta").mode("append").saveAsTable("default.users")


In [70]:
# query("SELECT * FROM default.users TIMESTAMP AS OF '2018-10-18T22:15:12.013Z'")

In [71]:
history = query("DESCRIBE HISTORY users")
latest_version = history.selectExpr("max(version)").collect()
latest_version

[Row(max(version)=3)]

In [72]:
users = spark.read.format("delta").option("versionAsOf", latest_version[0][0]).load("./spark-warehouse/users")
users.show()

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
|  3|    sowon|       eva|    park|female|2005/05/20|040520|  3000|
+---+---------+----------+--------+------+----------+------+------+



In [73]:
query("SELECT CAST(date_sub(current_date(), 1) AS STRING)").collect()[0][0]

'2022-12-31'

In [74]:
users = spark.read.format("delta").option("versionAsOf", 1).load("./spark-warehouse/users")
users.show()

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
+---+---------+----------+--------+------+----------+------+------+



In [75]:
users = spark.read.format("delta").option("versionAsOf", 0).load("./spark-warehouse/users")
users.show()

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
+---+---------+----------+--------+------+----------+------+------+



In [77]:
query("DESCRIBE HISTORY users")

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
3,2023-01-01 11:59:22,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,2.0,Serializable,True,"{numFiles -> 3, numOutputRows -> 2, numOutputBytes -> 5392}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
2,2023-01-01 11:59:16,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,1.0,Serializable,True,"{numFiles -> 2, numOutputRows -> 1, numOutputBytes -> 3130}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
1,2023-01-01 11:59:00,,,CREATE OR REPLACE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [], properties -> {}}",,,,0.0,Serializable,False,"{numFiles -> 3, numOutputRows -> 2, numOutputBytes -> 5392}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
0,2022-12-30 11:42:08,,,CREATE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [], properties -> {}}",,,,,Serializable,True,"{numFiles -> 3, numOutputRows -> 2, numOutputBytes -> 5392}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


In [46]:
yesterday = query("SELECT CAST(date_add(current_date(), 0) AS STRING)").collect()[0][0]
yesterday

'2022-12-31'

In [50]:
# 원하는 데이터 버전과 버전 사이의 시간을 알고 있다면 조회가 가능하다
df = spark.read.format("delta").option("timestampAsOf", "2022-12-31 14:40:00").load("./spark-warehouse/users")
df.show()

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
+---+---------+----------+--------+------+----------+------+------+



In [52]:
# 원하는 데이터 버전과 버전 사이의 시간을 알고 있다면 조회가 가능하다
df2 = spark.read.format("delta").option("timestampAsOf", "2022-12-31 14:46:00").load("./spark-warehouse/users")
df2.show()

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
+---+---------+----------+--------+------+----------+------+------+



In [54]:
# 마지막 버전의 타임스템프보다 이후의 시간입력 시에는 오류가 발생, 이전 데이터는 사이의 시간을 입력해야만 한다
df3 = spark.read.format("delta").option("timestampAsOf", "2022-12-31 14:54:59").load("./spark-warehouse/users")
df3.show()

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
+---+---------+----------+--------+------+----------+------+------+



In [56]:
# 최신 데이터는 조건 없이 조회하면 된다
latest_data = spark.read.format("delta").load("./spark-warehouse/users")
latest_data.show()

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
|  3|    sowon|       eva|    park|female|2005/05/20|040520|  3000|
+---+---------+----------+--------+------+----------+------+------+



In [57]:
# 초반에 넣었던 오류 데이터를 제외하고 신규 데이터만 가져오는 작업을 하려면 어떻게 해야할까?
initial_data = spark.read.format("delta").option("timestampAsOf", "2022-12-31 14:40:00").load("./spark-warehouse/users")
initial_data.show()

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
+---+---------+----------+--------+------+----------+------+------+



In [58]:
latest_added = latest_data.where("id > 2")
latest_added.show()

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  3|    sowon|       eva|    park|female|2005/05/20|040520|  3000|
+---+---------+----------+--------+------+----------+------+------+



In [60]:
initial_data.union(latest_added).show()

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
|  3|    sowon|       eva|    park|female|2005/05/20|040520|  3000|
+---+---------+----------+--------+------+----------+------+------+



In [62]:
# 초반에 넣었던 오류 데이터를 제외하고 신규 데이터만 가져오는 작업을 하려면 어떻게 해야할까?

from delta.tables import *

# 실수로 2번 이하의 이용자를 모두 삭제하고
deltaTable = DeltaTable.forPath(spark, "./spark-warehouse/users")
deltaTable.delete("id < 3")

In [63]:
# 초기에 추가된 이용자 2명을 다시 읽어와서
initial_data = spark.read.format("delta").option("timestampAsOf", "2022-12-31 14:40:00").load("./spark-warehouse/users")
initial_data.show()

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
+---+---------+----------+--------+------+----------+------+------+



In [64]:
latest_data = spark.read.format("delta").load("./spark-warehouse/users")
latest_data.show()

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  3|    sowon|       eva|    park|female|2005/05/20|040520|  3000|
+---+---------+----------+--------+------+----------+------+------+



In [66]:
# 초기에 추가된 이용자 2명을 다시 추가하자
initial_data.write.format("delta").mode("append").saveAsTable("users")

NameError: name 'sparks' is not defined

In [85]:
query("select * from users")

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
1,suhyuk,psyoblade,park,male,2000/10/30,741030,1000
1,suhyuk,psyoblade,park,male,2000/10/30,741030,1000
2,youngmi,kiki,kim,female,2004/08/08,770808,2000
2,youngmi,kiki,kim,female,2004/08/08,770808,2000
3,sowon,eva,park,female,2005/05/20,40520,3000


In [68]:
query("show tables")

namespace,tableName,isTemporary
default,users,False


In [None]:
createTable="""
CREATE OR REPLACE TABLE default.people_using_sql (
 id INT,
 firstName STRING,
 middleName STRING,
 lastName STRING,
 gender STRING,
 birthDate TIMESTAMP,
 ssn STRING,
 salary INT
)
USING DELTA
      PARTITIONED BY (gender)
"""
print(createTable)

In [73]:
query("CREATE OR REPLACE TABLE default.users_clone USING DELTA AS SELECT * FROM users")

# 아래의 구문은 하이브 메타스토어를 통해 생성할 수 있는 구문
# query("create table if not exists users_clone as select * from users")

In [74]:
query("INSERT INTO users_clone SELECT * FROM users")

In [76]:
query("INSERT OVERWRITE TABLE users_clone SELECT * FROM users")

In [77]:
query("SELECT * FROM users_clone")

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
1,suhyuk,psyoblade,park,male,2000/10/30,741030,1000
2,youngmi,kiki,kim,female,2004/08/08,770808,2000
3,sowon,eva,park,female,2005/05/20,40520,3000


In [82]:
# 최대 vacuum 시간 미만으로 지정하는 경우 오류를 발생
query("VACUUM users RETAIN 24 HOURS DRY RUN")

IllegalArgumentException: requirement failed: Are you sure you would like to vacuum files with such a low retention period? If you have
writers that are currently writing to this table, there is a risk that you may corrupt the
state of your Delta table.

If you are certain that there are no operations being performed on this table, such as
insert/upsert/delete/optimize, then you may turn off this check by setting:
spark.databricks.delta.retentionDurationCheck.enabled = false

If you are not sure, please use a value not less than "168 hours".
       

In [83]:
query("SET spark.databricks.delta.retentionDurationCheck.enabled = false")

key,value
spark.databricks.delta.retentionDurationCheck.enabled,False


In [140]:
query("SET spark.databricks.delta.retentionDurationCheck.enabled = true")

key,value
spark.databricks.delta.retentionDurationCheck.enabled,True


In [84]:
query("VACUUM users RETAIN 24 HOURS DRY RUN")

path


In [86]:
query("describe history users")

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
3,2023-01-01 11:59:22,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,2.0,Serializable,True,"{numFiles -> 3, numOutputRows -> 2, numOutputBytes -> 5392}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
2,2023-01-01 11:59:16,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,1.0,Serializable,True,"{numFiles -> 2, numOutputRows -> 1, numOutputBytes -> 3130}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
1,2023-01-01 11:59:00,,,CREATE OR REPLACE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [], properties -> {}}",,,,0.0,Serializable,False,"{numFiles -> 3, numOutputRows -> 2, numOutputBytes -> 5392}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
0,2022-12-30 11:42:08,,,CREATE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [], properties -> {}}",,,,,Serializable,True,"{numFiles -> 3, numOutputRows -> 2, numOutputBytes -> 5392}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


In [87]:
query("select * from users")

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
1,suhyuk,psyoblade,park,male,2000/10/30,741030,1000
1,suhyuk,psyoblade,park,male,2000/10/30,741030,1000
2,youngmi,kiki,kim,female,2004/08/08,770808,2000
2,youngmi,kiki,kim,female,2004/08/08,770808,2000
3,sowon,eva,park,female,2005/05/20,40520,3000


In [89]:
query("VACUUM users RETAIN 1 HOURS DRY RUN").show(truncate=False)

+----------------------------------------------------------------------------------------------------------------+
|path                                                                                                            |
+----------------------------------------------------------------------------------------------------------------+
|file:/home/jovyan/work/spark-warehouse/users/part-00002-b5e2db62-800d-404f-9750-ff24b9fbc75c-c000.snappy.parquet|
|file:/home/jovyan/work/spark-warehouse/users/part-00000-8c43d864-9d91-4a46-8a5d-246e2899f237-c000.snappy.parquet|
|file:/home/jovyan/work/spark-warehouse/users/part-00005-9e6a865b-ef24-419b-bf85-3998f26b4883-c000.snappy.parquet|
+----------------------------------------------------------------------------------------------------------------+



In [90]:
query("VACUUM users RETAIN 1 HOURS")

path
file:/home/jovyan/work/spark-warehouse/users


In [91]:
query("select * from users")

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
1,suhyuk,psyoblade,park,male,2000/10/30,741030,1000
1,suhyuk,psyoblade,park,male,2000/10/30,741030,1000
2,youngmi,kiki,kim,female,2004/08/08,770808,2000
2,youngmi,kiki,kim,female,2004/08/08,770808,2000
3,sowon,eva,park,female,2005/05/20,40520,3000


In [92]:
query("VACUUM users RETAIN 0.1 HOURS")

path
file:/home/jovyan/work/spark-warehouse/users


In [99]:
query("select * from users")

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
1,suhyuk,psyoblade,park,male,2000/10/30,741030,1000
1,suhyuk,psyoblade,park,male,2000/10/30,741030,1000
2,youngmi,kiki,kim,female,2004/08/08,770808,2000
2,youngmi,kiki,kim,female,2004/08/08,770808,2000
3,sowon,eva,park,female,2005/05/20,40520,3000


In [100]:
# 임의로 파일 삭제 리텐션을 수정

query("""
SET delta.deletedFileRetentionDuration = "interval 1 hour"
""")

key,value
delta.deletedFileRetentionDuration,"""interval 1 hour"""


In [105]:
# query("vacuum delta.`./spark-warehouse/users` retain 0 hours dry run")
from delta.tables import *

pathToTable = "./spark-warehouse/users"
deltaTable = DeltaTable.forPath(spark, pathToTable)

In [106]:
deltaTable.vacuum()        # vacuum files not required by versions older than the default retention period

In [108]:
query("select * from users")

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
1,suhyuk,psyoblade,park,male,2000/10/30,741030,1000
1,suhyuk,psyoblade,park,male,2000/10/30,741030,1000
2,youngmi,kiki,kim,female,2004/08/08,770808,2000
2,youngmi,kiki,kim,female,2004/08/08,770808,2000
3,sowon,eva,park,female,2005/05/20,40520,3000


In [113]:
deltaTable.vacuum(0)

In [129]:
query("select * from users")

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
1,suhyuk,psyoblade,park,male,2000/10/30,741030,1000
1,suhyuk,psyoblade,park,male,2000/10/30,741030,1000
2,youngmi,kiki,kim,female,2004/08/08,770808,2000
2,youngmi,kiki,kim,female,2004/08/08,770808,2000
3,sowon,eva,park,female,2005/05/20,40520,3000


In [120]:
query("describe history users")

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
3,2023-01-01 11:59:22,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,2.0,Serializable,True,"{numFiles -> 3, numOutputRows -> 2, numOutputBytes -> 5392}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
2,2023-01-01 11:59:16,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,1.0,Serializable,True,"{numFiles -> 2, numOutputRows -> 1, numOutputBytes -> 3130}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
1,2023-01-01 11:59:00,,,CREATE OR REPLACE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [], properties -> {}}",,,,0.0,Serializable,False,"{numFiles -> 3, numOutputRows -> 2, numOutputBytes -> 5392}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
0,2022-12-30 11:42:08,,,CREATE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [], properties -> {}}",,,,,Serializable,True,"{numFiles -> 3, numOutputRows -> 2, numOutputBytes -> 5392}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


In [142]:
from delta.tables import *

pathToTable = "/home/jovyan/work/spark-warehouse/users"
deltaTable = DeltaTable.forPath(spark, pathToTable)
deltaTable.logRetentionDuration  = "interval 7 days"
deltaTable.deletedFileRetentionDuration  = "interval 1 days"

In [143]:
query("VACUUM users RETAIN 48 HOURS DRY RUN")

IllegalArgumentException: requirement failed: Are you sure you would like to vacuum files with such a low retention period? If you have
writers that are currently writing to this table, there is a risk that you may corrupt the
state of your Delta table.

If you are certain that there are no operations being performed on this table, such as
insert/upsert/delete/optimize, then you may turn off this check by setting:
spark.databricks.delta.retentionDurationCheck.enabled = false

If you are not sure, please use a value not less than "168 hours".
       

In [128]:
query("VACUUM users RETAIN 0 HOURS")

path
file:/home/jovyan/work/spark-warehouse/users


In [138]:
df = spark.read.format('delta').load(pathToTable)
df.count()

5

In [139]:
dx = spark.read.parquet(pathToTable)
dx.count()

5

In [132]:
query("VACUUM users DRY RUN")

path


In [135]:
!pwd

/home/jovyan/work


In [136]:
query("VACUUM delta.`/home/jovyan/work/spark-warehouse/users` RETAIN 100 HOURS")

path
file:/home/jovyan/work/spark-warehouse/users


In [141]:
query("VACUUM delta.`/home/jovyan/work/spark-warehouse/users` RETAIN 1 HOURS")

IllegalArgumentException: requirement failed: Are you sure you would like to vacuum files with such a low retention period? If you have
writers that are currently writing to this table, there is a risk that you may corrupt the
state of your Delta table.

If you are certain that there are no operations being performed on this table, such as
insert/upsert/delete/optimize, then you may turn off this check by setting:
spark.databricks.delta.retentionDurationCheck.enabled = false

If you are not sure, please use a value not less than "168 hours".
       

In [149]:
from delta.tables import *

pathToTable = "./spark-warehouse/users"
# users = DeltaTable.forName(spark, "default.users") # hive-metastore
users = DeltaTable.forPath(spark, pathToTable)

In [151]:
help(users)

Help on DeltaTable in module delta.tables object:

class DeltaTable(builtins.object)
 |  DeltaTable(spark: pyspark.sql.session.SparkSession, jdt: 'JavaObject')
 |  
 |  Main class for programmatically interacting with Delta tables.
 |  You can create DeltaTable instances using the path of the Delta table.::
 |  
 |      deltaTable = DeltaTable.forPath(spark, "/path/to/table")
 |  
 |  In addition, you can convert an existing Parquet table in place into a Delta table.::
 |  
 |      deltaTable = DeltaTable.convertToDelta(spark, "parquet.`/path/to/table`")
 |  
 |  .. versionadded:: 0.4
 |  
 |  Methods defined here:
 |  
 |  __init__(self, spark: pyspark.sql.session.SparkSession, jdt: 'JavaObject')
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  alias(self, aliasName: str) -> 'DeltaTable'
 |      Apply an alias to the Delta table.
 |      
 |      .. versionadded:: 0.4
 |  
 |  delete(self, condition: Union[str, pyspark.sql.column.Column, NoneType] = None

In [152]:
users.history()

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
3,2023-01-01 11:59:22,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,2.0,Serializable,True,"{numFiles -> 3, numOutputRows -> 2, numOutputBytes -> 5392}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
2,2023-01-01 11:59:16,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,1.0,Serializable,True,"{numFiles -> 2, numOutputRows -> 1, numOutputBytes -> 3130}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
1,2023-01-01 11:59:00,,,CREATE OR REPLACE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [], properties -> {}}",,,,0.0,Serializable,False,"{numFiles -> 3, numOutputRows -> 2, numOutputBytes -> 5392}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
0,2022-12-30 11:42:08,,,CREATE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [], properties -> {}}",,,,,Serializable,True,"{numFiles -> 3, numOutputRows -> 2, numOutputBytes -> 5392}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


In [153]:
users.vacuum()

In [156]:
pathToTable = "./spark-warehouse/users"
df1 = spark.read.format("delta").load(pathToTable)
df1.write.format("delta").mode("overwrite").save("./delta-warehouse/users")

In [157]:
tbl1 = DeltaTable.forPath(spark, pathToTable)
tbl1.history()

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
3,2023-01-01 11:59:22,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,2.0,Serializable,True,"{numFiles -> 3, numOutputRows -> 2, numOutputBytes -> 5392}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
2,2023-01-01 11:59:16,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,1.0,Serializable,True,"{numFiles -> 2, numOutputRows -> 1, numOutputBytes -> 3130}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
1,2023-01-01 11:59:00,,,CREATE OR REPLACE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [], properties -> {}}",,,,0.0,Serializable,False,"{numFiles -> 3, numOutputRows -> 2, numOutputBytes -> 5392}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
0,2022-12-30 11:42:08,,,CREATE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [], properties -> {}}",,,,,Serializable,True,"{numFiles -> 3, numOutputRows -> 2, numOutputBytes -> 5392}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


In [159]:
tbl1.vacuum()

In [163]:
tbl1.toDF().show()

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
|  3|    sowon|       eva|    park|female|2005/05/20|040520|  3000|
+---+---------+----------+--------+------+----------+------+------+



In [162]:
tbl1.vacuum(169)

In [164]:
query("SHOW TBLPROPERTIES default.users")

key,value
Type,MANAGED
delta.minReaderVersion,1
delta.minWriterVersion,2


In [165]:
query("describe detail users")

format,id,name,description,location,createdAt,lastModified,partitionColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion
delta,39c271df-b1eb-48d2-ad36-c039e6d04a17,default.users,,file:/home/jovyan/work/spark-warehouse/users,2022-12-30 11:42:07.508,2023-01-01 11:59:22,[],8,13914,{},1,2


In [169]:
query("ALTER TABLE default.users SET TBLPROPERTIES ('delta.logRetentionDuration' = 'interval 7 days', 'delta.deletedFileRetentionDuration' = 'interval 1 days')")

In [170]:
query("show tblproperties users")

key,value
Type,MANAGED
delta.deletedFileRetentionDuration,interval 1 days
delta.logRetentionDuration,interval 7 days
delta.minReaderVersion,1
delta.minWriterVersion,2


In [171]:
query("VACUUM users DRY RUN")

path


In [172]:
query("select * from users")

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
1,suhyuk,psyoblade,park,male,2000/10/30,741030,1000
1,suhyuk,psyoblade,park,male,2000/10/30,741030,1000
2,youngmi,kiki,kim,female,2004/08/08,770808,2000
2,youngmi,kiki,kim,female,2004/08/08,770808,2000
3,sowon,eva,park,female,2005/05/20,40520,3000


In [32]:
users = DeltaTable.forPath(spark, "./spark-warehouse/users")
users.history()

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,2023-01-09 08:58:34,,,CREATE OR REPLACE TABLE AS SELECT,"{isManaged -> true, description -> null, partitionBy -> [""salary""], properties -> {}}",,,,,Serializable,False,"{numFiles -> 4, numOutputRows -> 4, numOutputBytes -> 7980}",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


In [33]:
table = spark.read.format("delta").load("./spark-warehouse/users")
table.show()

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
|  3|    sowon|       eva|    park|female|2005/05/20|040520|  3000|
|  4|    sihun|      sean|    park|  male|2006/01/14|080114|  4000|
+---+---------+----------+--------+------+----------+------+------+



In [15]:
# 연봉을 기준으로 파티셔닝해서
table.write.format("delta").mode("overwrite").partitionBy("salary").save("./spark-warehouse/users")

In [34]:
# 특정 연봉의 이용자만 replaceWhere 구문으로 변경해보자
poor = spark.read.format("delta").load("./spark-warehouse/users")
pure = poor.distinct()
pure.show()

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
|  3|    sowon|       eva|    park|female|2005/05/20|040520|  3000|
|  4|    sihun|      sean|    park|  male|2006/01/14|080114|  4000|
+---+---------+----------+--------+------+----------+------+------+



In [35]:
# 실제 데이터프레임에 저장되어 있는 데이터와, 저장하려는 데이터의 범위가 맞지 않으면 예외를 던진다
# pure.write.format("delta").mode("overwrite").option("replaceWhere", "salary <= 2000").save("./delta-warehouse/users")
# 변경하고 싶은 범위와 파티션 범위를 동일하게 만들고 replaceWhere 적용을 해야 하는것으로 보인다
criteria = pure.where("salary <= 2000")
criteria.show()

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
+---+---------+----------+--------+------+----------+------+------+



In [36]:
criteria.write.format("delta").mode("overwrite").option("replaceWhere", "salary = 1000").save("./spark-warehouse/users")

AnalysisException: Data written out does not match replaceWhere 'salary = 1000'.
CHECK constraint EXPRESSION(('salary = 1000)) (salary = 1000) violated by row with values:
 - salary : 2000

In [38]:
criteria.write.format("delta").mode("append").option("replaceWhere", "salary = 1000").saveAsTable("users")

In [186]:
result = spark.read.format("delta").load("./delta-warehouse/users")
result.show()

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  3|    sowon|       eva|    park|female|2005/05/20|040520|  3000|
+---+---------+----------+--------+------+----------+------+------+



In [187]:
wrong = spark.read.parquet("./delta-warehouse/users")
wrong.show()

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
|  2|  youngmi|      kiki|     kim|female|2004/08/08|770808|  2000|
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
|  3|    sowon|       eva|    park|female|2005/05/20|040520|  3000|
|  3|    sowon|       eva|    park|female|2005/05/20|040520|  3000|
+---+---------+----------+--------+------+------

In [188]:
# replaceWhere 조건이 왜 필요한가? 어차피 범위를 확인해서 필터해야 하는거라면?
# 아니지 overwrite 이므로 조건을 지정하지 않으면 모든 데이터가 삭제되므로, replaceWhere 절은 반드시 필요하다
# 결국 replaceWhere 절은 overwrite 시에 필요한 만큼만 overwrite 한다는 의미이지, 데이터의 검증까지 하는 것은 아니다

+---+---------+----------+--------+------+----------+------+------+
| id|firstName|middleName|lastName|gender| birthDate|   ssn|salary|
+---+---------+----------+--------+------+----------+------+------+
|  1|   suhyuk| psyoblade|    park|  male|2000/10/30|741030|  1000|
+---+---------+----------+--------+------+----------+------+------+



In [8]:
sql("show databases; show tables ; use taxidb ; show tables ")

+---------+
|namespace|
+---------+
|default  |
|taxidb   |
+---------+

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|default  |family   |false      |
|default  |users    |false      |
+---------+---------+-----------+

++
||
++
++

+---------+------------------+-----------+
|namespace|tableName         |isTemporary|
+---------+------------------+-----------+
|taxidb   |greentaxis        |false      |
|taxidb   |yellowtaxis       |false      |
|taxidb   |yellowtaxis_append|false      |
+---------+------------------+-----------+



In [16]:
history = sql("describe history yellowtaxis")
history.printSchema()

root
 |-- version: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- userId: string (nullable = true)
 |-- userName: string (nullable = true)
 |-- operation: string (nullable = true)
 |-- operationParameters: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- job: struct (nullable = true)
 |    |-- jobId: string (nullable = true)
 |    |-- jobName: string (nullable = true)
 |    |-- runId: string (nullable = true)
 |    |-- jobOwnerId: string (nullable = true)
 |    |-- triggerType: string (nullable = true)
 |-- notebook: struct (nullable = true)
 |    |-- notebookId: string (nullable = true)
 |-- clusterId: string (nullable = true)
 |-- readVersion: long (nullable = true)
 |-- isolationLevel: string (nullable = true)
 |-- isBlindAppend: boolean (nullable = true)
 |-- operationMetrics: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- userMetadata: string (nullable =

In [19]:
show("describe extended yellowtaxis", 50)

+----------------------------+-------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                      |comment|
+----------------------------+-------------------------------------------------------------------------------+-------+
|RideId                      |int                                                                            |       |
|VendorId                    |int                                                                            |       |
|PickupTime                  |timestamp                                                                      |       |
|DropTime                    |timestamp                                                                      |       |
|PickupLocationId            |int                                                                            |       |
|DropLocationId              |int               

In [21]:
ls("/home/jovyan/work/data/yellowTaxis.delta/_delta_log")

total 24
drwxrwxrwx 1 jovyan 1000  512 Aug 28 12:37 .
drwxrwxrwx 1 jovyan 1000  512 Aug 28 12:37 ..
-rwxrwxrwx 1 jovyan 1000 2177 Aug 14 10:52 00000000000000000000.json
-rwxrwxrwx 1 jovyan 1000   28 Aug 14 10:52 .00000000000000000000.json.crc
-rwxrwxrwx 1 jovyan 1000 1861 Aug 14 10:52 00000000000000000001.json
-rwxrwxrwx 1 jovyan 1000   24 Aug 14 10:52 .00000000000000000001.json.crc
-rwxrwxrwx 1 jovyan 1000 1861 Aug 28 06:03 00000000000000000002.json
-rwxrwxrwx 1 jovyan 1000   24 Aug 28 06:03 .00000000000000000002.json.crc
-rwxrwxrwx 1 jovyan 1000 3354 Aug 28 06:08 00000000000000000003.json
-rwxrwxrwx 1 jovyan 1000   36 Aug 28 06:08 .00000000000000000003.json.crc
-rwxrwxrwx 1 jovyan 1000 4133 Aug 28 12:37 00000000000000000004.json
-rwxrwxrwx 1 jovyan 1000   44 Aug 28 12:37 .00000000000000000004.json.crc
drwxrwxrwx 1 jovyan 1000  512 Aug 28 04:41 .ipynb_checkpoints


In [27]:
grep_sed_json("add", 1, "/home/jovyan/work/data/yellowTaxis.delta/_delta_log/00000000000000000004.json")

{
    "add": {
        "path": "part-00000-a2bcac59-0d4f-40f8-bcdb-57e66bbb00f2-c000.snappy.parquet",
        "partitionValues": {},
        "size": 5252,
        "modificationTime": 1724848669406,
        "dataChange": true,
        "stats": "{\"numRecords\":1,\"minValues\":{\"RideId\":9999995,\"VendorId\":1,\"PickupTime\":\"2019-11-01T09:00:00.000+09:00\",\"DropTime\":\"2019-11-01T09:02:23.573+09:00\",\"PickupLocationId\":65,\"DropLocationId\":71,\"CabNumber\":\"TAC304\",\"DriverLicenseNumber\":\"453987\",\"PassengerCount\":5,\"TripDistance\":4.5,\"RatecodeId\":1,\"PaymentType\":1,\"TotalAmount\":20.34,\"FareAmount\":15.0,\"Extra\":0.5,\"MtaTax\":0.4,\"TipAmount\":2.0,\"TollsAmount\":2.0,\"ImprovementSurcharge\":1.1},\"maxValues\":{\"RideId\":9999995,\"VendorId\":1,\"PickupTime\":\"2019-11-01T09:00:00.000+09:00\",\"DropTime\":\"2019-11-01T09:02:23.573+09:00\",\"PickupLocationId\":65,\"DropLocationId\":71,\"CabNumber\":\"TAC304\",\"DriverLicenseNumber\":\"453987\",\"PassengerCount\":5

In [30]:
spark.table("taxidb.greentaxis").show(3, truncate=False)

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorId|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|2       |2019-12-05 07:43:41 |2019-12-05 07:50:30  |N                 |1         |97          |65          |1              |0.87         |6.0        |0.0  |0.5   