# Reading a Delta Table
### Reading a Delta Table with SQL

In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON

from delta import *

# Common data locations
# NOTE(review): the home directory is hard-coded; presumably this targets a
# Jupyter container running as user `jovyan` - confirm before running elsewhere.
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
# IPython shell capture: `!pwd` returns the command's output lines, so the
# following line keeps only the first one (the current working directory).
work_dir=!pwd
work_dir = work_dir[0]
# Passed to `spark.sql.warehouse.dir` in the builder below.
warehouse_dir = f"{work_dir}/spark-warehouse"

# Create spark session with hive enabled
# The `spark.sql.extensions` and `spark.sql.catalog.spark_catalog` options point
# the session at Delta Lake's SQL extension and catalog classes.
# Only the builder is prepared here; no session is created in this cell.
builder = (
    SparkSession
    .builder
    .appName("pyspark-notebook")
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.sql.warehouse.dir", warehouse_dir)
    .enableHiveSupport()
)

In [2]:
# The session must be created through `configure_spark_with_delta_pip` so that
# the Delta Lake dependencies are loaded correctly.
delta_builder = configure_spark_with_delta_pip(builder)
spark = delta_builder.getOrCreate()

In [3]:

# Runtime options for this notebook session, applied in the order listed.
notebook_conf = {
    # Render DataFrames as tables in the notebook output
    "spark.sql.repl.eagerEval.enabled": True,  # display enabled
    "spark.sql.repl.eagerEval.truncate": 100,  # display output columns size
    # Tuning for a local environment
    "spark.sql.shuffle.partitions": 5,  # the number of partitions to use when shuffling data for joins or aggregations.
    "spark.sql.streaming.forceDeleteTempCheckpointLocation": "true",
}
for conf_key, conf_value in notebook_conf.items():
    spark.conf.set(conf_key, conf_value)

# Keep the session as the cell's last expression so the notebook displays it.
spark

In [22]:
def sql(queries, num_rows = 20):
    """Run one or more ``;``-separated SQL statements and print each result.

    Args:
        queries: SQL text. Statements are separated by ``;``.
        num_rows: Maximum number of rows printed per statement.

    Each statement is executed with ``spark.sql`` and printed with
    ``show(num_rows, truncate=False)``. Fragments that are empty or
    whitespace-only (for example after a trailing ``;``) are skipped rather
    than being sent to Spark as an empty statement.

    Note: the split is purely textual, so a ``;`` inside a SQL string literal
    also ends the statement.
    """
    for query in queries.split(";"):
        statement = query.strip()
        if not statement:
            # Nothing to execute between two separators / after the last one.
            continue
        spark.sql(statement).show(num_rows, truncate=False)

# Run `ls -al` on `command` through IPython's `!` shell escape.
# `command` is interpolated into the shell line as-is (unquoted).
def ls(command):
    !ls -al {command}

# Print the contents of `filename` with `cat` through IPython's `!` shell escape.
# `filename` is interpolated into the shell line as-is (unquoted).
def cat(filename):
    !cat {filename}

# Case-insensitive (`-i`) grep for `keyword` in `filename` through IPython's
# `!` shell escape. Both arguments are interpolated as-is (unquoted).
def grep(keyword, filename):
    !grep -i {keyword} {filename}

# Grep `filename` for `keyword` and pipe the matching lines to
# `python -m json.tool`. Unlike `grep` above, no `-i` flag is passed here, so
# the match is case-sensitive.
# Both arguments are interpolated into the shell line as-is (unquoted).
def grep_and_json(keyword, filename):
    !grep {keyword} {filename} | python -m json.tool

In [5]:
# List the databases, make `taxidb` the current database, then list its tables.
sql("show databases ; use taxidb ; show tables")

+---------+
|namespace|
+---------+
|default  |
|taxidb   |
+---------+

++
||
++
++

+---------+-----------+-----------+
|namespace|tableName  |isTemporary|
+---------+-----------+-----------+
|taxidb   |greentaxis |false      |
|taxidb   |yellowtaxis|false      |
+---------+-----------+-----------+



In [13]:
# Load the yellow-taxi parquet file and print its schema.
yellow_parquet_path = f"{work_data}/yellowtaxis.parquet"
yellow_parquet = spark.read.parquet(yellow_parquet_path)
yellow_parquet.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [18]:
# The DeltaTable API does not provide a way to drop a table - by default, names are case-insensitive (greentaxis)
# Register taxidb.YellowTaxis as a Delta table at the given location; the
# statement is a no-op when the table already exists (IF NOT EXISTS).
sql(f"""
CREATE TABLE IF NOT EXISTS taxidb.YellowTaxis
USING DELTA
LOCATION '{work_data}/yellowTaxis.delta'
""")

++
||
++
++



In [23]:
# Show up to 50 rows of the table, then its formatted description.
sql("select * from taxidb.yellowtaxis ; describe table formatted taxidb.yellowtaxis", 50)

+-------+--------+-------------------+-----------------------+----------------+--------------+---------+-------------------+--------------+------------+----------+-----------+-----------+----------+-----+------+---------+-----------+--------------------+
|RideId |VendorId|PickupTime         |DropTime               |PickupLocationId|DropLocationId|CabNumber|DriverLicenseNumber|PassengerCount|TripDistance|RatecodeId|PaymentType|TotalAmount|FareAmount|Extra|MtaTax|TipAmount|TollsAmount|ImprovementSurcharge|
+-------+--------+-------------------+-----------------------+----------------+--------------+---------+-------------------+--------------+------------+----------+-----------+-----------+----------+-----+------+---------+-----------+--------------------+
|9999995|1       |2019-11-01 09:00:00|2019-11-01 09:02:23.573|65              |71            |TAC304   |453987             |2             |4.5         |1         |1          |20.34      |15.0      |0.5  |0.4   |2.0      |2.0        |1.

In [9]:
# (column name, SQL type, optional column comment) for the green-taxi table,
# in the order the columns are added.
green_taxi_columns = [
    ("VendorId", "INT", "Ride Vendor"),
    ("lpep_pickup_datetime", "STRING", None),
    ("lpep_dropoff_datetime", "STRING", None),
    ("store_and_fwd_flag", "STRING", None),
    ("RatecodeID", "INT", "Ref to RateCard"),
    ("PULocationID", "INT", None),
    ("DOLocationID", "INT", None),
    ("passenger_count", "INT", None),
    ("trip_distance", "DOUBLE", None),
    ("fare_amount", "DOUBLE", None),
    ("extra", "DOUBLE", None),
    ("mta_tax", "DOUBLE", None),
    ("tip_amount", "DOUBLE", None),
    ("tolls_amount", "DOUBLE", None),
    ("ehail_fee", "STRING", None),
    ("improvement_surcharge", "DOUBLE", None),
    ("total_amount", "DOUBLE", None),
    ("payment_type", "INT", None),
    ("trip_type", "INT", None),
    ("congestion_surcharge", "DOUBLE", None),
]

# `deltaTable` holds the table *builder*; each addColumn result is chained
# onto it, exactly as in a fluent call chain.
deltaTable = DeltaTable.createIfNotExists(spark).tableName("taxidb.greenTaxis")
for column_name, column_type, column_comment in green_taxi_columns:
    deltaTable = deltaTable.addColumn(column_name, column_type, comment = column_comment)

# Create the table if it does not exist yet; the returned object is displayed
# as the cell output.
deltaTable.execute()

<delta.tables.DeltaTable at 0x7f0cd1795130>

In [12]:
# List the `_delta_log` directory of the greentaxis table inside the warehouse.
# Uses the `warehouse_dir` constant (the value given to `spark.sql.warehouse.dir`)
# instead of rebuilding the same path by hand.
ls(f"{warehouse_dir}/taxidb.db/greentaxis/_delta_log")

total 4
drwxrwxrwx 1 jovyan 1000  512 Aug 28 05:07 .
drwxrwxrwx 1 jovyan 1000  512 Aug 28 05:07 ..
-rwxrwxrwx 1 jovyan 1000 2346 Aug 28 05:07 00000000000000000000.json
-rwxrwxrwx 1 jovyan 1000   28 Aug 28 05:07 .00000000000000000000.json.crc


In [14]:
# Read the green-taxi CSV with the header row as column names and with schema
# inference enabled, then print the schema and the first row.
green_csv_path = f"{work_dir}/data/greenTaxis.csv"
csvGreenTaxi = (
    spark.read
    .format("csv")
    .options(inferSchema=True, header=True)
    .load(green_csv_path)
)
csvGreenTaxi.printSchema()
csvGreenTaxi.show(1, truncate=False)

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: string (nullable = true)
 |-- lpep_dropoff_datetime: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: string (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- trip_type: integer (nullable = true)
 |-- congestion_surcharge: double (nullable = true)

+--------+--------------------+---------------------+------------------+----------+-------

In [25]:
# List the tables of the current database and describe `yellowtaxis`.
# NOTE(review): both statements are unqualified, so this presumably relies on
# the earlier `use taxidb` having been run in this session - confirm on a
# fresh kernel.
sql("show tables ; desc yellowtaxis")

+---------+-----------+-----------+
|namespace|tableName  |isTemporary|
+---------+-----------+-----------+
|taxidb   |greentaxis |false      |
|taxidb   |yellowtaxis|false      |
+---------+-----------+-----------+

+--------------------+---------+-------+
|col_name            |data_type|comment|
+--------------------+---------+-------+
|RideId              |int      |       |
|VendorId            |int      |       |
|PickupTime          |timestamp|       |
|DropTime            |timestamp|       |
|PickupLocationId    |int      |       |
|DropLocationId      |int      |       |
|CabNumber           |string   |       |
|DriverLicenseNumber |string   |       |
|PassengerCount      |int      |       |
|TripDistance        |double   |       |
|RatecodeId          |int      |       |
|PaymentType         |int      |       |
|TotalAmount         |double   |       |
|FareAmount          |double   |       |
|Extra               |double   |       |
|MtaTax              |double   |       |
|Tip

In [30]:
# Average fare per cab, restricted to cabs whose average exceeds 10, ordered by
# the second select column (the average) descending, at most 5 rows.
top_average_fares = """
SELECT
    CabNumber,
    AVG(FareAmount) AS AverageFare
FROM
    taxidb.yellowtaxis
GROUP BY
    CabNumber
HAVING
    AVG(FareAmount) > 10
ORDER BY
    2 DESC
LIMIT 5
"""
sql(top_average_fares)

+---------+-----------+
|CabNumber|AverageFare|
+---------+-----------+
|TAC304   |15.0       |
+---------+-----------+



### Reading a Table with PySpark

In [14]:
# Read the Delta table directly from its storage path instead of by table name.
# The directory name uses the same casing as the LOCATION given in the
# CREATE TABLE statement for taxidb.YellowTaxis (`yellowTaxis.delta`); the
# all-lower-case spelling only resolves on a case-insensitive filesystem.
spark.read.format("delta").load(f"{work_data}/yellowTaxis.delta").show(truncate=False)

+-------+--------+-------------------+-----------------------+----------------+--------------+---------+-------------------+--------------+------------+----------+-----------+-----------+----------+-----+------+---------+-----------+--------------------+
|RideId |VendorId|PickupTime         |DropTime               |PickupLocationId|DropLocationId|CabNumber|DriverLicenseNumber|PassengerCount|TripDistance|RatecodeId|PaymentType|TotalAmount|FareAmount|Extra|MtaTax|TipAmount|TollsAmount|ImprovementSurcharge|
+-------+--------+-------------------+-----------------------+----------------+--------------+---------+-------------------+--------------+------------+----------+-----------+-----------+----------+-----+------+---------+-----------+--------------------+
|9999995|1       |2019-11-01 09:00:00|2019-11-01 09:02:23.573|65              |71            |TAC304   |453987             |2             |4.5         |1         |1          |20.34      |15.0      |0.5  |0.4   |2.0      |2.0        |1.

In [31]:
# Resolve the table by name rather than by storage path.
delta_reader = spark.read.format("delta")
yellow = delta_reader.table("taxidb.yellowtaxis")
yellow.show(truncate=False)

+-------+--------+-------------------+-----------------------+----------------+--------------+---------+-------------------+--------------+------------+----------+-----------+-----------+----------+-----+------+---------+-----------+--------------------+
|RideId |VendorId|PickupTime         |DropTime               |PickupLocationId|DropLocationId|CabNumber|DriverLicenseNumber|PassengerCount|TripDistance|RatecodeId|PaymentType|TotalAmount|FareAmount|Extra|MtaTax|TipAmount|TollsAmount|ImprovementSurcharge|
+-------+--------+-------------------+-----------------------+----------------+--------------+---------+-------------------+--------------+------------+----------+-----------+-----------+----------+-----+------+---------+-----------+--------------------+
|9999995|1       |2019-11-01 09:00:00|2019-11-01 09:02:23.573|65              |71            |TAC304   |453987             |2             |4.5         |1         |1          |20.34      |15.0      |0.5  |0.4   |2.0      |2.0        |1.

In [39]:
# Average fare per cab, keeping cabs whose average exceeds 10, highest average
# first, at most 5 rows collected to the driver.
average_fare = avg("FareAmount").alias("AverageFare")
fare_by_cab = yellow.groupBy("CabNumber").agg(average_fare)
fare_above_ten = fare_by_cab.filter(col("AverageFare") > 10)
fare_ranked = fare_above_ten.sort(col("AverageFare").desc())
results = fare_ranked.take(5)

In [42]:
# Display the collected rows as a list.
list(results)

[Row(CabNumber='TAC304', AverageFare=15.0)]