In [2]:
pip install delta-spark


Note: you may need to restart the kernel to use updated packages.


In [3]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/Qualidade_NYC/gold'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [13]:
spark.sql(
    """
    CREATE DATABASE IF NOT EXISTS gold LOCATION 'hdfs://hdfs-nn:9000/Qualidade_NYC/gold/Projeto.db/'
    """
)

DataFrame[]

In [4]:
# create AirQuality table in gold

spark.sql(
    """
    DROP TABLE IF EXISTS Trabalho.gold_AirQuality
    """
)

DataFrame[]

In [14]:
spark.sql(
    """
    CREATE EXTERNAL TABLE Trabalho.gold_AirQuality (
        Name STRING,
        Geo_Place_Name STRING,
        Time_Period STRING,
        Data_Value FLOAT,
        Year INT
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/Qualidade_NYC/gold/Projeto.db/gold_AirQuality/'
    """
)

DataFrame[]

In [15]:
from pyspark.sql.functions import substring, avg, sum

#read AirQuality from the silver tables
hdfs_path = "hdfs://hdfs-nn:9000/Qualidade_NYC/silver/Projeto.db/Qualidade_Ar"

gold_AirQuality = spark\
             .read\
             .load(hdfs_path)

gold_AirQuality.show()
gold_AirQuality.printSchema()

+---------+------------+--------------------+--------------+------------+-------------+-----------+--------------------+-----------+----------+----------+----+
|Unique_ID|Indicator_ID|                Name|       Measure|Measure_Info|Geo_Type_Name|Geo_Join_ID|      Geo_Place_Name|Time_Period|Start_Date|Data_Value|Year|
+---------+------------+--------------------+--------------+------------+-------------+-----------+--------------------+-----------+----------+----------+----+
|   179718|         642|Boiler Emissions-...|Number per km2|      number|        UHF42|        504|South Beach - Tot...|       2015|01/01/2015|       2.0|2015|
|   179719|         642|Boiler Emissions-...|Number per km2|      number|        UHF42|        503|         Willowbrook|       2015|01/01/2015|       2.1|2015|
|   179720|         642|Boiler Emissions-...|Number per km2|      number|        UHF42|        501|       Port Richmond|       2015|01/01/2015|       2.8|2015|
|   179721|         642|Boiler Emissions

In [16]:
#group by name, geo place name
from pyspark.sql.functions import sum,avg,max,count
gold = gold_AirQuality.groupBy("Name","Geo_Place_Name").count()\
    .show()

+--------------------+--------------------+-----+
|                Name|      Geo_Place_Name|count|
+--------------------+--------------------+-----+
|Boiler Emissions-...|             Jamaica|    2|
|Boiler Emissions-...|    Southwest Queens|    2|
|Boiler Emissions-...|Pelham - Throgs Neck|    2|
|Fine Particulate ...|       Port Richmond|   36|
|          Ozone (O3)|Financial Distric...|   12|
|PM2.5-Attributabl...|Bensonhurst - Bay...|    4|
|PM2.5-Attributabl...|       Staten Island|    4|
|PM2.5-Attributabl...|            Brooklyn|    4|
|O3-Attributable A...|          Greenpoint|    8|
|O3-Attributable C...|Downtown - Height...|    4|
|Air Toxics Concen...|Central Harlem (C...|    1|
|Nitrogen Dioxide ...|  Washington Heights|   72|
|          Ozone (O3)|       Fresh Meadows|   12|
|          Ozone (O3)|Throgs Neck and C...|   12|
|Sulfur Dioxide (SO2)|           Rockaways|   16|
|O3-Attributable A...|   Chelsea - Clinton|    8|
|Air Toxics Concen...|Upper East Side (...|    1|


In [17]:
# write to delta table
gold_AirQuality \
    .select("Name","Geo_Place_Name","Time_Period","Data_Value","Year") \
    .write \
    .format("delta") \
    .mode("overwrite") \
    .save("hdfs://hdfs-nn:9000/Qualidade_NYC/gold/Projeto.db/gold_AirQuality/")

In [18]:
# check the results in the table
spark.table("Trabalho.gold_AirQuality ").show()
spark.table("Trabalho.gold_AirQuality ").count()

+--------------------+--------------------+-----------+----------+----+
|                Name|      Geo_Place_Name|Time_Period|Data_Value|Year|
+--------------------+--------------------+-----------+----------+----+
|Boiler Emissions-...|South Beach - Tot...|       2015|       2.0|2015|
|Boiler Emissions-...|         Willowbrook|       2015|       2.1|2015|
|Boiler Emissions-...|       Port Richmond|       2015|       2.8|2015|
|Boiler Emissions-...|Stapleton - St. G...|       2015|       4.6|2015|
|Boiler Emissions-...|           Rockaways|       2015|       6.1|2015|
|Boiler Emissions-...|Canarsie - Flatlands|       2015|       6.6|2015|
|Boiler Emissions-...|    Southeast Queens|       2015|       8.1|2015|
|Boiler Emissions-...|Bayside - Little ...|       2015|       9.8|2015|
|Boiler Emissions-...|             Jamaica|       2015|      13.1|2015|
|Boiler Emissions-...|         Sunset Park|       2015|      13.9|2015|
|Boiler Emissions-...|    Southwest Queens|       2015|      14.

16122

In [19]:
spark.sql("""
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/Qualidade_NYC/gold/Projeto.db/gold_AirQuality/`
""").show()

++
||
++
++



In [20]:
spark.sql("""
DROP TABLE IF EXISTS Trabalho.gold_AirQuality_presto 
""").show()


++
||
++
++



In [4]:

spark.sql("""
    CREATE EXTERNAL TABLE Trabalho.gold_AirQuality_presto (
        Name STRING,
        Geo_Place_Name STRING,
        Time_Period STRING,
        Data_Value FLOAT,
        Year INT

    )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/Qualidade_NYC/gold/Projeto.db/gold_AirQuality/_symlink_format_manifest'
""").show()

AnalysisException: Table or view 'gold_airquality_presto' already exists in database 'trabalho'