In [1]:
pip install delta-spark

Note: you may need to restart the kernel to use updated packages.


In [24]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/Qualidade_NYC/gold'

# Session configuration: local master with 2 threads, Hive metastore access and
# the Delta Lake SQL extension/catalog. The chain is wrapped in parentheses
# instead of backslash continuations so that no trailing "\" can accidentally
# glue a following line onto the builder expression.
# NOTE(review): configure_spark_with_delta_pip() also sets spark.jars.packages
# from the installed delta-spark version — confirm it agrees with the value here.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Create (or reuse) the session with the Delta Lake packages wired in.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [25]:
# Gold Recycling table: remove any earlier definition first so that it can be
# created again from scratch.
drop_gold_stmt = """
    DROP TABLE IF EXISTS Trabalho.gold_Recycling
    """
spark.sql(drop_gold_stmt)

DataFrame[]

In [26]:
# Register Trabalho.gold_Recycling as an external Delta table stored at a fixed
# HDFS location.
# NOTE(review): the table lives in database "Trabalho" while its location is
# under ".../Projeto.db/" — confirm this is intentional.
create_gold_stmt = """
    CREATE EXTERNAL TABLE Trabalho.gold_Recycling (
        Zone STRING,
        Fiscal_Year INT,
        Month_Name STRING,
        Diversion_Rate_Total FLOAT,
        Capture_Rate_Paper FLOAT,
        Capture_Rate_MGP FLOAT,
        Capture_Rate_Total FLOAT
        
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/Qualidade_NYC/gold/Projeto.db/gold_Recycling/'
    """
spark.sql(create_gold_stmt)

DataFrame[]

In [27]:
from pyspark.sql.functions import substring, avg, sum

# Source: the Reciclagem data written by the silver layer.
hdfs_path = "hdfs://hdfs-nn:9000/Qualidade_NYC/silver/Projeto.db/Reciclagem"

# NOTE(review): no .format(...) is given, so Spark's default data source is
# used — confirm that matches how the silver table is stored.
gold_Recycling = spark.read.load(hdfs_path)

# Preview the first rows, then the column names and types.
gold_Recycling.show()
gold_Recycling.printSchema()

+--------------+--------+-------------------+----------+--------------------+------------------+----------------+------------------+-----------+
|          Zone|District|Fiscal_Month_Number|Month_Name|Diversion_Rate_Total|Capture_Rate_Paper|Capture_Rate_MGP|Capture_Rate_Total|Fiscal_Year|
+--------------+--------+-------------------+----------+--------------------+------------------+----------------+------------------+-----------+
|Brooklyn North|   BKN01|                 10|     April|                13.7|              41.0|            44.3|              42.3|       2016|
|Brooklyn North|   BKN02|                 10|     April|                17.6|              33.2|            55.4|              39.8|       2016|
|Brooklyn North|   BKN03|                 10|     April|                10.9|              30.5|            39.3|              34.1|       2016|
|Brooklyn North|   BKN04|                 10|     April|                13.4|              30.3|            59.3|              42.

In [28]:
# Count the rows for each (Zone, Fiscal_Year, Diversion_Rate_Total) combination.
from pyspark.sql.functions import sum,avg,max,count

# Keep the aggregated DataFrame in `gold`. Chaining .show() onto the assignment
# would store None instead, because show() only prints and returns nothing.
gold = gold_Recycling.groupBy("Zone", "Fiscal_Year", "Diversion_Rate_Total").count()
gold.show()

+--------------+-----------+--------------------+-----+
|          Zone|Fiscal_Year|Diversion_Rate_Total|count|
+--------------+-----------+--------------------+-----+
|   Queens East|       2016|                17.8|    1|
|         Bronx|       2016|                16.9|    1|
|Brooklyn South|       2016|                20.1|    2|
|Brooklyn North|       2016|                17.5|    1|
|   Queens West|       2017|                21.2|    2|
|Brooklyn North|       2017|                11.0|    1|
|   Queens West|       2017|                24.0|    1|
|Brooklyn North|       2017|                14.5|    1|
|         Bronx|       2017|                15.8|    1|
|         Bronx|       2016|                11.9|    1|
|     Manhattan|       2016|                17.1|    2|
|     Manhattan|       2017|                11.5|    1|
|     Manhattan|       2016|                24.7|    1|
| Staten Island|       2016|                19.2|    1|
|Brooklyn North|       2016|                10.9

In [29]:
# Persist the selected columns as Delta data at the gold table location,
# replacing whatever was stored there before (mode "overwrite").
(
    gold_Recycling
    .select(
        "Zone",
        "Month_Name",
        "Diversion_Rate_Total",
        "Capture_Rate_Paper",
        "Capture_Rate_MGP",
        "Capture_Rate_Total",
        "Fiscal_Year",
    )
    .write
    .format("delta")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/Qualidade_NYC/gold/Projeto.db/gold_Recycling/")
)

In [30]:
# Sanity check: preview the table through the metastore, then display its row
# count as the cell result.
table_name = "Trabalho.gold_Recycling "
spark.table(table_name).show()
spark.table(table_name).count()

+--------------+-----------+----------+--------------------+------------------+----------------+------------------+
|          Zone|Fiscal_Year|Month_Name|Diversion_Rate_Total|Capture_Rate_Paper|Capture_Rate_MGP|Capture_Rate_Total|
+--------------+-----------+----------+--------------------+------------------+----------------+------------------+
|Brooklyn North|       2016|     April|                13.7|              41.0|            44.3|              42.3|
|Brooklyn North|       2016|     April|                17.6|              33.2|            55.4|              39.8|
|Brooklyn North|       2016|     April|                10.9|              30.5|            39.3|              34.1|
|Brooklyn North|       2016|     April|                13.4|              30.3|            59.3|              42.1|
|Brooklyn North|       2016|     April|                 9.6|              22.4|            41.6|              30.2|
|Brooklyn North|       2016|     April|                15.2|            

2832

In [31]:
# Generate the symlink-format manifest for the Delta table stored at this path.
manifest_stmt = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/Qualidade_NYC/gold/Projeto.db/gold_Recycling/`
"""
spark.sql(manifest_stmt).show()

++
||
++
++



In [32]:
# Remove any earlier definition of the manifest-backed table.
drop_presto_stmt = """
DROP TABLE IF EXISTS Trabalho.gold_Recycling_presto 
"""
spark.sql(drop_presto_stmt).show()


++
||
++
++



In [36]:
# External Hive-format table over the _symlink_format_manifest directory of the
# gold Recycling Delta table: Parquet SerDe with SymlinkTextInputFormat.
# NOTE(review): judging by the "_presto" suffix this is presumably the table
# that Presto queries — confirm.
create_presto_stmt = """
    CREATE EXTERNAL TABLE Trabalho.gold_Recycling_presto (
        Zone STRING,
        Fiscal_Year INT,
        Month_Name STRING,
        Diversion_Rate_Total FLOAT,
        Capture_Rate_Paper FLOAT,
        Capture_Rate_MGP FLOAT,
        Capture_Rate_Total FLOAT
        
    )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/Qualidade_NYC/gold/Projeto.db/gold_Recycling/_symlink_format_manifest'
"""
spark.sql(create_presto_stmt).show()

++
||
++
++

