In [68]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Root HDFS directory handed to Spark as `spark.sql.warehouse.dir`, i.e. the
# default location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/Qualidade_NYC/gold'

# The chain is wrapped in parentheses rather than joined with backslashes:
# the previous version ended with a dangling `\` after `.enableHiveSupport()`
# that only parsed because the following line happened to be blank.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Delta Lake SQL extension + catalog so `USING DELTA` and delta.`path` resolve.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip() below presumably sets
    # spark.jars.packages itself from the installed delta package version,
    # which would override this value - confirm and drop one of the two.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Build (or reuse) the session with the Delta jars attached to the builder.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [69]:
# Remove any previous metastore entry for the gold table so that the
# CREATE statement in the next cell can be re-run without failing.
drop_gold_diseases_sql = """
    DROP TABLE IF EXISTS Trabalho.gold_Diseases
    """
spark.sql(drop_gold_diseases_sql)

DataFrame[]

In [70]:
# Register the gold table in the metastore as a Delta table stored at an
# explicit HDFS location; only the six columns listed here are declared.
# NOTE(review): the table lives in database `Trabalho` while the path is
# under `Projeto.db` - confirm the naming mismatch is intentional.
create_gold_diseases_sql = """
    CREATE EXTERNAL TABLE Trabalho.gold_Diseases (
        County_Name STRING,
        Event_Count INT,
        Average_Number_of_Denominator INT,
        Percent_Rate FLOAT,
        Quartile STRING,
        Data_Years STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/Qualidade_NYC/gold/Projeto.db/gold_Diseases/'
    """
spark.sql(create_gold_diseases_sql)

DataFrame[]

In [72]:
# NOTE: none of these three names is used in this cell, and importing `sum`
# here rebinds the built-in `sum` to the Spark column function.
from pyspark.sql.functions import substring, avg, sum

# Silver-layer location of the disease indicators.
hdfs_path = "hdfs://hdfs-nn:9000/Qualidade_NYC/silver/Projeto.db/Doencas"

# NOTE(review): no .format(...) is given, so the reader falls back to the
# session's default data source. If the silver table is stored as Delta,
# an explicit .format("delta") would be safer - confirm against the
# notebook that writes this path.
gold_Diseases = spark.read.load(hdfs_path)

# Preview the first rows, then print the column names and types.
gold_Diseases.show()
gold_Diseases.printSchema()

+-----------+-------------------+-----------------+----------------+--------------------+-----------+-----------------------------+------------+------------+--------------------+----------+--------------------+--------------------+--------------------+
|County_Name|Health_Topic_Number|     Health_Topic|Indicator_Number|           Indicator|Event_Count|Average_Number_of_Denominator|Measure_Unit|Percent_Rate|            Quartile|Data_Years|         Data_Source|Mapping_Distribution|            Location|
+-----------+-------------------+-----------------+----------------+--------------------+-----------+-----------------------------+------------+------------+--------------------+----------+--------------------+--------------------+--------------------+
|     Cayuga|                  1|Cancer Indicators|              g7|Lung and bronchus...|        266|                        78805|        Rate|       112.5|      106.4 +   : Q4| 2013-2015|Cancer Registry D...|                   3|(42.940095

In [73]:
# Keep only the six gold-layer columns and overwrite the Delta data at the
# same HDFS location that the Trabalho.gold_Diseases table was created on.
(
    gold_Diseases
    .select(
        "County_Name",
        "Event_Count",
        "Average_Number_of_Denominator",
        "Percent_Rate",
        "Quartile",
        "Data_Years",
    )
    .write
    .format("delta")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/Qualidade_NYC/gold/Projeto.db/gold_Diseases/")
)

In [74]:
# Read the table back through the metastore to check the write: preview the
# rows, then leave the row count as the cell's displayed value.
# The table is resolved once, and the identifier no longer carries the stray
# trailing space that the original string literal had.
gold_table = spark.table("Trabalho.gold_Diseases")
gold_table.show()
gold_table.count()

+-----------+-----------+-----------------------------+------------+--------------------+----------+
|County_Name|Event_Count|Average_Number_of_Denominator|Percent_Rate|            Quartile|Data_Years|
+-----------+-----------+-----------------------------+------------+--------------------+----------+
|     Cayuga|        266|                        78805|       112.5|      106.4 +   : Q4| 2013-2015|
|  Jefferson|        305|                       118365|        85.9|0 - <   94.0  : Q...| 2013-2015|
| Chautauqua|        362|                       131932|        91.5|0 - <   94.0  : Q...| 2013-2015|
|      Wayne|        277|                        91854|       100.5|94.0 - <  106.4  ...| 2013-2015|
|     Monroe|       1691|                       749835|        75.2|0 - <   94.0  : Q...| 2013-2015|
|      Bronx|       1900|                      1437811|        44.0|0 - <   94.0  : Q...| 2013-2015|
|  Schoharie|        101|                        31645|       106.4|94.0 - <  106.4  ...| 2

62

In [75]:
# Ask Delta to generate a symlink-format manifest for the table at this
# path. The `_symlink_format_manifest` directory under the same path is what
# the Trabalho.gold_Diseases_presto table's LOCATION points at.
generate_manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/Qualidade_NYC/gold/Projeto.db/gold_Diseases/`
"""
manifest_result = spark.sql(generate_manifest_sql)
manifest_result.show()

++
||
++
++



In [76]:
# Remove any previous metastore entry for the manifest-backed table so the
# CREATE statement in the next cell can be re-run without failing.
drop_presto_table_sql = """
DROP TABLE IF EXISTS Trabalho.gold_Diseases_presto 
"""
drop_presto_result = spark.sql(drop_presto_table_sql)
drop_presto_result.show()

++
||
++
++



In [77]:
# Register a second external table over the `_symlink_format_manifest`
# directory, using the Parquet SerDe with SymlinkTextInputFormat. The column
# list repeats the one declared for Trabalho.gold_Diseases.
# NOTE(review): the `_presto` suffix suggests this table is meant for Presto
# to query - confirm the intended consumer.
create_presto_table_sql = """
    CREATE EXTERNAL TABLE Trabalho.gold_Diseases_presto (
        County_Name STRING,
        Event_Count INT,
        Average_Number_of_Denominator INT,
        Percent_Rate FLOAT,
        Quartile STRING,
        Data_Years STRING
     )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/Qualidade_NYC/gold/Projeto.db/gold_Diseases/_symlink_format_manifest'
"""
create_presto_result = spark.sql(create_presto_table_sql)
create_presto_result.show()

++
||
++
++

