# PySpark & Delta Lake
Basic examples of manipulating Delta tables.

References:
- [Delta Lake Docs](https://docs.delta.io/latest/index.html)
- [Delta Lake API](https://docs.delta.io/latest/api/python/index.html)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr
from delta.tables import DeltaTable

In [None]:
%%bash
# List the contents of the application home directory.
# The expansion is quoted so a path containing spaces or glob characters is
# passed to ls as a single argument, and ${VAR:?msg} aborts with a clear
# message instead of silently listing "/" when APP_HOME is unset or empty.
ls -al "${APP_HOME:?APP_HOME is not set}/"

## Create Spark session with Delta Lake bindings

In [None]:
# Spark settings that enable Delta Lake: the Delta package coordinates,
# the Delta SQL extension, and the Delta-aware session catalog.
# Dict insertion order is preserved, so the settings are applied in the
# order they are written here.
delta_settings = {
    "spark.jars.packages": "io.delta:delta-core_2.12:1.0.0",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

# Start from a named builder that targets the "local[*]" master.
session_builder = SparkSession.builder.appName("example-1-delta").master("local[*]")

for setting_name, setting_value in delta_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuse an already-running session if there is one, otherwise create it.
spark = session_builder.getOrCreate()

## Read Delta table

In [None]:
# Load the vitals Delta table as a DataFrame and print a 7-row preview
# of a handful of columns.
vitals_path = "/opt/spark/jupyter-lib/output_data/public/vitals/delta"
vitals_df = spark.read.format('delta').load(vitals_path)  # As DataFrame

# Columns to display; "source_ale_prac_id" is shown under the name "practice".
preview_columns = [
    col("source_ale_prac_id").alias("practice"),
    "patient_id",
    "name",
    "value",
    "observation_date",
]

vitals_df.select(*preview_columns).show(n=7)

## Inspect Delta version history

In [None]:
    # Print the version history of the vitals Delta table.
    vitals_table = DeltaTable.forPath(
        spark, '/opt/spark/jupyter-lib/output_data/public/vitals/delta'
    )

    # history(21) is called with a limit of 21 entries.
    history_df = vitals_table.history(21)

    # Surface the numOutputRows entry of operationMetrics as its own column.
    history_with_counts = history_df.withColumn(
        "row_count", expr("operationMetrics.numOutputRows")
    )

    history_columns = [
        "version",
        "timestamp",
        "userName",
        "operation",
        "job",
        # "operationParameters",
        # "operationMetrics",
        "row_count",
    ]

    # truncate=False prints the full cell contents instead of cutting them off.
    history_with_counts.select(*history_columns).show(truncate=False)
