In [1]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Base directory Spark uses for managed databases and tables.
# NOTE(review): this points at the silver area although the notebook creates
# gold tables — confirm it is intended.
warehouse_location = 'hdfs://hdfs-nn:9000/Qualidade_NYC/silver'

# Local two-thread session wired to the Hive metastore, with the Delta Lake
# SQL extension and catalog enabled.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Pass the builder through the Delta helper, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [5]:
# Make sure the gold_projeto database exists at its HDFS location.
create_gold_db_sql = """
    CREATE DATABASE IF NOT EXISTS gold_projeto LOCATION 'hdfs://hdfs-nn:9000/Qualidade_NYC/gold/gold_projeto.db/'
    """
spark.sql(create_gold_db_sql)

# Drop any existing gold_projeto.waterDF table.
# NOTE(review): the remaining cells work with Trabalho.gold_WaterQuality rather
# than the gold_projeto database — confirm which one is intended.
drop_water_table_sql = """
    DROP TABLE IF EXISTS gold_projeto.waterDF
    """
spark.sql(drop_water_table_sql)



DataFrame[]

In [2]:
# Register the gold water-quality Delta table over its HDFS location.
# IF NOT EXISTS keeps the cell re-runnable: without it, a second run fails
# because the table is already registered in the metastore.
spark.sql(
    """
    CREATE EXTERNAL TABLE IF NOT EXISTS Trabalho.gold_WaterQuality (
        Site STRING,
        Date STRING,
        Analyte STRING,
        Status STRING,
        Final_Result STRING,
        Units STRING,
        WTP_Group STRING,
        Year INT
    )
    USING DELTA
    
    LOCATION 'hdfs://hdfs-nn:9000/Qualidade_NYC/gold/Projeto.db/gold_WaterQuality/'
    """
)

DataFrame[]

In [5]:
from pyspark.sql.functions import substring, avg, sum

# Load the water-quality data from the silver layer.
hdfs_path = "hdfs://hdfs-nn:9000/Qualidade_NYC/silver/Projeto.db/Qualidade_Agua/"

# No format is passed to the reader, so Spark's default data source is used.
gold_WaterQuality = spark.read.load(hdfs_path)

# Preview the rows, then print the column names and types.
gold_WaterQuality.show()
gold_WaterQuality.printSchema()

+---------+--------------------+----------+-----+--------------------+------+------------+---------+---------------------+--------------------+------------+----+
|Sample_Id|                Site|      Date| Time|             Analyte|Status|Final_Result|    Units|Water_Treatment_Plant|           WTP_Group|SPDES_Number|Year|
+---------+--------------------+----------+-----+--------------------+------+------------+---------+---------------------+--------------------+------------+----+
|  D-96685|                PRHE|04/07/1998| 8:58|                 BOD|  null|         8.2|     mg/L|             Pepacton|      Non-City Owned|        null|1998|
|  D-96744|                PRHE|05/05/1998| 9:05|                  pH|  null|         6.1|     null|             Pepacton|      Non-City Owned|        null|1998|
|  B-21216|BEDFORD MIDDLE SC...|01/02/1998| 9:28|Solids, Total Sus...|  null|          <2|     mg/L|                 null|      Non-City Owned|   NY0105741|1998|
|  B-21222|     MAHOPAC STP 

In [6]:
# Count the number of rows recorded for each sampling site.
from pyspark.sql.functions import sum,avg,max,count

# DataFrame.show() only prints and returns None, so the aggregate is bound to
# `gold` first and displayed in a separate statement; chaining .show() onto
# the assignment left `gold` set to None.
gold = gold_WaterQuality.groupBy("Site").count()
gold.show()

+--------------------+-----+
|                Site|count|
+--------------------+-----+
|ERSSG PRE SUBSURF...|    6|
|     EDTO Hydro Sump|   18|
|            SNOWTIME| 1737|
|     PRATTSVILLE WTP|  639|
|   THOMPSON HOUSE CC|   11|
|                 STE| 4606|
|          CARMEL STP| 6630|
|             SEK-T10|  185|
|        BREWSTER STP| 2332|
|      Boiceville WTP| 2040|
|                 EPE| 4654|
|HUNTER HIGHLANDS RAW|  122|
|                  TB|   98|
|                SKTP| 5213|
|MAHOPAC ELEMENTAR...| 4346|
|BEDFORD PARK APAR...| 5774|
|    BATAVIA KILL REC|  187|
|          LAKE SECOR| 5927|
|                STEG| 2338|
|                STLG|  101|
+--------------------+-----+
only showing top 20 rows



In [7]:
# Write the gold columns to the Delta location that backs
# Trabalho.gold_WaterQuality, replacing whatever is already stored there.
# The source is gold_WaterQuality (the DataFrame loaded from the silver path);
# the previously used name `waterDF` is never defined in this notebook and
# raises NameError on a fresh kernel.
(
    gold_WaterQuality
    .select("Site", "Date", "Analyte", "Status", "Final_Result", "Units", "WTP_Group", "Year")
    .write
    .format("delta")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/Qualidade_NYC/gold/Projeto.db/gold_WaterQuality/")
)

In [8]:
# Inspect the gold table: preview its rows, then report the total row count.
gold_table = spark.table("Trabalho.gold_WaterQuality")
gold_table.show()
gold_table.count()

+--------------------+----------+--------------------+------+------------+---------+--------------------+----+
|                Site|      Date|             Analyte|Status|Final_Result|    Units|           WTP_Group|Year|
+--------------------+----------+--------------------+------+------------+---------+--------------------+----+
|                PRHE|04/07/1998|                 BOD|  null|         8.2|     mg/L|      Non-City Owned|1998|
|                PRHE|05/05/1998|                  pH|  null|         6.1|     null|      Non-City Owned|1998|
|BEDFORD MIDDLE SC...|01/02/1998|Solids, Total Sus...|  null|          <2|     mg/L|      Non-City Owned|1998|
|     MAHOPAC STP EFF|01/02/1998|Solids, Total Sus...|  null|         1.4|     mg/L|City Owned - Effl...|1998|
|BEDFORD MIDDLE SC...|01/02/1998|                  pH|  null|        6.43|     null|      Non-City Owned|1998|
|    BREWSTER STP EFF|01/02/1998|                  pH|  null|        6.92|     null|      Non-City Owned|1998|
|

619602

In [9]:
# Generate the symlink-format manifest for the Delta table stored at the gold path.
manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/Qualidade_NYC/gold/Projeto.db/gold_WaterQuality/`
"""
spark.sql(manifest_sql).show()

++
||
++
++



In [14]:
# Drop the manifest-backed `_presto` table if it is already registered.
drop_presto_sql = """
DROP TABLE IF EXISTS Trabalho.gold_WaterQuality_presto 
"""
spark.sql(drop_presto_sql).show()

++
||
++
++



In [15]:
# Register an external table that reads the Delta data through its
# symlink-format manifest directory.
# The column list mirrors the eight columns actually written to the Delta
# location (same names, order and types as Trabalho.gold_WaterQuality). The
# previous definition also declared Sample_Id, Time, Water_Treatment_Plant and
# SPDES_Number, which are not written to the gold Delta location, so the
# declared schema did not match the data.
spark.sql("""
    CREATE EXTERNAL TABLE Trabalho.gold_WaterQuality_presto (
        Site STRING,
        Date STRING,
        Analyte STRING,
        Status STRING,
        Final_Result STRING,
        Units STRING,
        WTP_Group STRING,
        Year INT
    )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/Qualidade_NYC/gold/Projeto.db/gold_WaterQuality/_symlink_format_manifest'
""").show()

++
||
++
++

