# 1. 델타 레이크 테이블 실습
## 1-1. 델타 레이크 의존성을 포함한 스파크 세션 생성

In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON

from delta import *

# Shared data locations
# NOTE(review): home_jovyan and work_data are not referenced by any cell visible here — confirm they are still needed
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
# IPython shell capture: the result of `!pwd` is indexed below, so its first element is the current directory
work_dir=!pwd
work_dir = work_dir[0]
warehouse_dir = f"{work_dir}/spark-warehouse"

# Prepare (but do not start) a session builder: Delta Lake SQL extension and catalog,
# Hive catalog implementation, Asia/Seoul session time zone, and a warehouse directory
# under the notebook's current working directory
builder = (
    SparkSession
    .builder
    .appName("pyspark-notebook")
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.sql.warehouse.dir", warehouse_dir)
    .enableHiveSupport()
)

In [2]:
# The session must be created through `configure_spark_with_delta_pip` so that the Delta Lake dependencies are loaded correctly
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
# Configure the notebook to render DataFrames as tables
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # display enabled
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100) # display output columns size

# Tuning for a local environment
spark.conf.set("spark.sql.shuffle.partitions", 5) # the number of partitions to use when shuffling data for joins or aggregations.
spark.conf.set("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")
# Final expression of the cell: the notebook displays the session object
spark

## 1-2. 자주 사용하는 함수 등록 및 예제 데이터 읽어오기

In [46]:
def sql(queries, num_rows = 20):
    """Run one or more ``;``-separated SQL statements and print each result.

    Args:
        queries: SQL text; multiple statements are separated by ``;``.
        num_rows: Maximum number of rows printed per statement.

    Blank fragments (e.g. after a trailing ``;`` or between ``;;``) are
    skipped, because handing an empty statement to ``spark.sql`` raises a
    parse error instead of doing nothing.

    NOTE: the text is split on every ``;``, so a statement that contains
    ``;`` inside a string literal is not supported.
    """
    for query in queries.split(";"):
        statement = query.strip()
        if not statement:
            # Nothing to execute for this fragment.
            continue
        spark.sql(statement).show(num_rows, truncate=False)

def ls(command):
    """Run the shell command `ls -al` with `command` appended as its argument text.

    Args:
        command: Text substituted for `{command}` in the shell line
            (presumably a path or glob — verify against callers).
    """
    !ls -al {command}

def cat(filename):
    """Print a file's contents by running the shell command `cat` on it.

    Args:
        filename: Text substituted for `{filename}` in the shell line.
    """
    !cat {filename}

def grep(keyword, filename):
    """Search a file with the shell command `grep -i` (case-insensitive match).

    Args:
        keyword: Pattern substituted for `{keyword}`; it is inserted unquoted,
            so a pattern containing spaces would be split by the shell.
        filename: Text substituted for `{filename}` in the shell line.
    """
    !grep -i {keyword} {filename}

def grep_and_json(keyword, filename):
    """Run `grep` on a file and pipe the matching lines into `python -m json.tool`.

    Unlike `grep` above, no `-i` flag is passed here, so the match is
    case-sensitive.

    Args:
        keyword: Pattern substituted for `{keyword}` (inserted unquoted).
        filename: Text substituted for `{filename}` in the shell line.
    """
    # NOTE(review): presumably the matched output is expected to be a single JSON document — confirm
    !grep {keyword} {filename} | python -m json.tool

def sql2(queries, num_rows = 20):
    """Run one or more ``;``-separated SQL statements and display each result.

    Each statement's result is limited to ``num_rows`` rows and passed to
    ``display`` so the notebook renders it as a table.

    Args:
        queries: SQL text; multiple statements are separated by ``;``.
        num_rows: Maximum number of rows displayed per statement.

    Blank fragments (e.g. after a trailing ``;`` or between ``;;``) are
    skipped, because handing an empty statement to ``spark.sql`` raises a
    parse error instead of doing nothing.

    NOTE: the text is split on every ``;``, so a statement that contains
    ``;`` inside a string literal is not supported.
    """
    for query in queries.split(";"):
        statement = query.strip()
        if not statement:
            # Nothing to execute for this fragment.
            continue
        display(spark.sql(statement).limit(num_rows))

In [14]:
# Load the tab-separated data at the "imdb" path: the first line is treated as
# the header and column types are inferred from the data.
tsv = spark.read.options(
    delimiter="\t",
    inferSchema="true",
    header="true",
).csv("imdb")
tsv.printSchema()

root
 |-- rank: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- description: string (nullable = true)
 |-- director: string (nullable = true)
 |-- actors: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- runtime: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- votes: integer (nullable = true)
 |-- revenue: double (nullable = true)
 |-- metascore: integer (nullable = true)



In [16]:
# List the databases and tables currently registered in the catalog
sql("show databases")
sql("show tables")

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+

+---------+
|namespace|
+---------+
|default  |
+---------+



In [17]:
# Save the DataFrame as table default.imdb_delta in Delta format, overwriting any existing data
tsv.write.format("delta").mode("overwrite").saveAsTable("default.imdb_delta")

In [19]:
# List databases and tables again to check what is registered now
sql("show databases")
sql("show tables")

+---------+
|namespace|
+---------+
|default  |
+---------+

+---------+----------+-----------+
|namespace|tableName |isTemporary|
+---------+----------+-----------+
|default  |imdb_delta|false      |
+---------+----------+-----------+



In [28]:
# Show rank, title and metascore of the first 10 rows (ascending rank) whose genre contains 'Sci'
sql("""
select rank, title, metascore from imdb_delta where genre like '%Sci%' order by rank asc limit 10
""")

+----+--------------------------------+---------+
|rank|title                           |metascore|
+----+--------------------------------+---------+
|1   |Guardians of the Galaxy         |76       |
|2   |Prometheus                      |65       |
|13  |Rogue One                       |65       |
|20  |Arrival                         |81       |
|25  |Independence Day: Resurgence    |32       |
|33  |X-Men: Apocalypse               |52       |
|35  |Resident Evil: The Final Chapter|49       |
|36  |Captain America: Civil War      |75       |
|37  |Interstellar                    |74       |
|49  |Star Trek Beyond                |68       |
+----+--------------------------------+---------+



## 1-4. 연도별 빈도수를 계산하고 특정 연도 하나를 모두 삭제합니다

In [32]:
# Row count per year, ordered by year (prints up to 100 rows)
sql("select year, count(1) as cnt from imdb_delta group by year order by year asc", 100)

+----+---+
|year|cnt|
+----+---+
|2006|44 |
|2007|53 |
|2008|52 |
|2009|51 |
|2010|60 |
|2011|63 |
|2012|64 |
|2013|91 |
|2014|98 |
|2015|127|
|2016|297|
+----+---+



In [33]:
# Number of rows with year = 2010 in the source DataFrame (not the Delta table)
tsv.where("year = 2010").count()

60

In [34]:
# Delete every row with year = 2010 from the Delta table
sql("delete from imdb_delta where year = 2010")

+-----------------+
|num_affected_rows|
+-----------------+
|60               |
+-----------------+



In [35]:
# Row count per year again, to check the current contents of the table
sql("select year, count(1) as cnt from imdb_delta group by year order by year asc", 100)

+----+---+
|year|cnt|
+----+---+
|2006|44 |
|2007|53 |
|2008|52 |
|2009|51 |
|2011|63 |
|2012|64 |
|2013|91 |
|2014|98 |
|2015|127|
|2016|297|
+----+---+



In [47]:
# Display the table's history (one row per table version / operation)
sql2("describe history imdb_delta")

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
1,2025-08-30 17:29:40,,,DELETE,"{predicate -> [""(year#3842 = 2010)""]}",,,,0.0,Serializable,False,"{numRemovedFiles -> 1, numRemovedBytes -> 195473, numCopiedRows -> 940, numDeletionVectorsAdded -...",,Apache-Spark/3.5.3 Delta-Lake/3.2.1
0,2025-08-30 17:13:06,,,CREATE OR REPLACE TABLE AS SELECT,"{partitionBy -> [], clusterBy -> [], description -> NULL, isManaged -> true, properties -> {}}",,,,,Serializable,False,"{numFiles -> 1, numOutputRows -> 1000, numOutputBytes -> 195473}",,Apache-Spark/3.5.3 Delta-Lake/3.2.1


In [52]:
# Time travel: read the table as of version 0 and count rows per year
spark.sql("select * from imdb_delta version as of 0").groupBy("year").count().orderBy(asc("year"))

year,count
2006,44
2007,53
2008,52
2009,51
2010,60
2011,63
2012,64
2013,91
2014,98
2015,127


In [53]:
# Time travel: read the table as of version 1 and count rows per year
spark.sql("select * from imdb_delta version as of 1").groupBy("year").count().orderBy(asc("year"))

year,count
2006,44
2007,53
2008,52
2009,51
2011,63
2012,64
2013,91
2014,98
2015,127
2016,297


In [56]:
# Select the year = 2010 rows from the source DataFrame and count them
year_2010 = tsv.where("year = 2010")
year_2010.count()

60

In [57]:
# Append the year = 2010 rows to the Delta table
# NOTE(review): running this cell more than once appends the same rows again
year_2010.write.format("delta").mode("append").saveAsTable("default.imdb_delta")

In [59]:
# Row count per year, printed without truncation (up to 100 rows)
spark.sql("select year, count(1) as cnt from imdb_delta group by year order by year asc").show(100, False)

+----+---+
|year|cnt|
+----+---+
|2006|44 |
|2007|53 |
|2008|52 |
|2009|51 |
|2010|60 |
|2011|63 |
|2012|64 |
|2013|91 |
|2014|98 |
|2015|127|
|2016|297|
+----+---+



In [61]:
# Display the table's history ordered from the oldest version to the newest
display(spark.sql("describe history imdb_delta").orderBy(asc("version")))

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,2025-08-30 17:13:06,,,CREATE OR REPLACE TABLE AS SELECT,"{partitionBy -> [], clusterBy -> [], description -> NULL, isManaged -> true, properties -> {}}",,,,,Serializable,False,"{numFiles -> 1, numOutputRows -> 1000, numOutputBytes -> 195473}",,Apache-Spark/3.5.3 Delta-Lake/3.2.1
1,2025-08-30 17:29:40,,,DELETE,"{predicate -> [""(year#3842 = 2010)""]}",,,,0.0,Serializable,False,"{numRemovedFiles -> 1, numRemovedBytes -> 195473, numCopiedRows -> 940, numDeletionVectorsAdded -...",,Apache-Spark/3.5.3 Delta-Lake/3.2.1
2,2025-08-30 17:47:57,,,WRITE,"{mode -> Append, partitionBy -> []}",,,,1.0,Serializable,True,"{numFiles -> 1, numOutputRows -> 60, numOutputBytes -> 18002}",,Apache-Spark/3.5.3 Delta-Lake/3.2.1
