# PySpark & Parquet
Basic examples of reading and manipulating Parquet tables with PySpark.

Reference:
- [Spark SQL Reference](http://spark.apache.org/docs/latest/sql-ref-syntax.html)
- [PySpark API](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/index.html)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr

In [None]:
%%bash
# List everything (including dotfiles) in the directory named by $APP_HOME.
# The expansion is quoted so a path containing spaces or glob characters
# reaches `ls` as a single, literal argument.
ls -al "${APP_HOME}/"

## Create Spark session

In [None]:
# Obtain the notebook's Spark session: named "example-2-parquet", with the
# master set to "local[*]"; getOrCreate() reuses an existing session if any.
spark = SparkSession.builder.appName("example-2-parquet").master("local[*]").getOrCreate()

## Read Parquet

### Display Schema

In [None]:
# Load the 2021/10/24 vitals partition as a DataFrame and evaluate its
# `dtypes` attribute; being the cell's final expression, the notebook
# renders the result.
spark.read.load(
    "/opt/spark/jupyter-lib/output_data/stage/vitals/parquet/2021/10/24",
    format='parquet',
).dtypes

### Select specific attributes

In [None]:
# Read the 2021/10/24 vitals partition, keep a handful of columns
# (renaming `source_ale_prac_id` to `practice`), drop rows that lack a
# service date/time or either blood-pressure reading, and print 7 rows.
spark.read.load(
    "/opt/spark/jupyter-lib/output_data/stage/vitals/parquet/2021/10/24",
    format='parquet',
).select(
    col("source_ale_prac_id").alias("practice"),
    "patient_id", "service_date", "service_time",
    "bp_systolic", "bp_diastolic", "source",
).where(
    # SQL-expression form of the predicate.
    "service_date IS NOT NULL AND service_time IS NOT NULL"
).where(
    # Column-expression form of the predicate.
    col("bp_systolic").isNotNull() & col("bp_diastolic").isNotNull()
).show(n=7)