In [2]:
pip install delta-spark


Note: you may need to restart the kernel to use updated packages.


In [13]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# warehouse_location is the default HDFS location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/Qualidade_NYC/silver'

# Session builder: local master with 2 threads, Hive metastore support, and the
# Delta Lake SQL extension + catalog.
#
# "spark.jars.packages" is deliberately NOT set here. The original set it twice with
# conflicting delta-core versions (1.0.0, then 0.8.0), and configure_spark_with_delta_pip()
# below sets that same key itself to the delta-core artifact matching the installed
# delta-spark package, so any manual value was overwritten anyway.
#
# The chain is wrapped in parentheses instead of using backslash continuations, so a
# trailing backslash before a blank line cannot silently alter the statement.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .enableHiveSupport()
)

# Adds the Delta Lake JAR coordinates for the pip-installed delta-spark version,
# then creates (or reuses) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [16]:
# Create the Trabalho database at its HDFS location.
# IF NOT EXISTS makes the cell safe to re-run: without it a second execution fails
# because the database is already registered in the metastore.
spark.sql(
    """
    CREATE DATABASE IF NOT EXISTS Trabalho
    LOCATION 'hdfs://hdfs-nn:9000/Qualidade_NYC/silver/Projeto.db'
    """
)

DataFrame[]

In [15]:
# Reset cell: removes the Reciclagem table and the Trabalho database from the metastore.
# The table is dropped first, while its database still exists; the original dropped the
# database (CASCADE) first and then referenced a table inside the now-missing database.
# NOTE(review): the recorded execution counts show this cell was run BEFORE the cell that
# creates the database; running the notebook top to bottom drops the database right after
# creating it, so keep this cell for manual clean-up only or move it above the create cell.
spark.sql(
    """
    DROP TABLE IF EXISTS Trabalho.Reciclagem
    """
)

spark.sql(
    """
    DROP DATABASE IF EXISTS Trabalho CASCADE
    """
)

DataFrame[]

In [17]:
# Load the bronze-layer recycling CSV from HDFS into a DataFrame.
hdfs_path = "hdfs://hdfs-nn:9000/Qualidade_NYC/bronze/Recycling_Diversion_and_Capture_Rates.csv"

# Explicit schema for the file: one (column name, Spark type) pair per CSV column,
# in file order. Every field is declared nullable.
customSchema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("Zone", StringType()),
        ("District", StringType()),
        ("Fiscal Month Number", IntegerType()),
        ("Fiscal Year", IntegerType()),
        ("Month Name", StringType()),
        ("Diversion Rate-Total (Total Recycling / Total Waste)", FloatType()),
        ("Capture Rate-Paper (Total Paper / Max Paper)", FloatType()),
        ("Capture Rate-MGP (Total MGP / Max MGP)", FloatType()),
        ("Capture Rate-Total ((Total Recycling - Leaves (Recycling)) / (Max Paper + Max MGP))x100", FloatType()),
    ]
])

# Comma-delimited file whose first line is a header; the schema above is applied
# instead of letting Spark infer one.
sales_df = (
    spark.read
    .option("delimiter", ",")
    .option("header", "true")
    .schema(customSchema)
    .csv(hdfs_path)
)

# Preview the first rows and print the resulting column types.
sales_df.show()
sales_df.printSchema()

+--------------+--------+-------------------+-----------+----------+----------------------------------------------------+--------------------------------------------+--------------------------------------+---------------------------------------------------------------------------------------+
|          Zone|District|Fiscal Month Number|Fiscal Year|Month Name|Diversion Rate-Total (Total Recycling / Total Waste)|Capture Rate-Paper (Total Paper / Max Paper)|Capture Rate-MGP (Total MGP / Max MGP)|Capture Rate-Total ((Total Recycling - Leaves (Recycling)) / (Max Paper + Max MGP))x100|
+--------------+--------+-------------------+-----------+----------+----------------------------------------------------+--------------------------------------------+--------------------------------------+---------------------------------------------------------------------------------------+
|Brooklyn North|   BKN01|                 10|       2019|     April|                                                14

In [18]:
# Map each verbose CSV header to the short underscore-separated name used by the
# Trabalho.Reciclagem table. Columns not listed here (Zone, District) keep their names.
column_renames = {
    "Diversion Rate-Total (Total Recycling / Total Waste)": "Diversion_Rate_Total",
    "Capture Rate-Paper (Total Paper / Max Paper)": "Capture_Rate_Paper",
    "Capture Rate-MGP (Total MGP / Max MGP)": "Capture_Rate_MGP",
    "Fiscal Month Number": "Fiscal_Month_Number",
    "Fiscal Year": "Fiscal_Year",
    "Month Name": "Month_Name",
    "Capture Rate-Total ((Total Recycling - Leaves (Recycling)) / (Max Paper + Max MGP))x100": "Capture_Rate_Total",
}

# Apply the renames one by one; sales_df itself is left untouched.
sales_df2 = sales_df
for old_name, new_name in column_renames.items():
    sales_df2 = sales_df2.withColumnRenamed(old_name, new_name)

# Collect to pandas for a rich tabular display of the renamed frame.
sales_df2.toPandas()

Unnamed: 0,Zone,District,Fiscal_Month_Number,Fiscal_Year,Month_Name,Diversion_Rate_Total,Capture_Rate_Paper,Capture_Rate_MGP,Capture_Rate_Total
0,Brooklyn North,BKN01,10,2019,April,14.700000,44.900002,43.000000,44.099998
1,Brooklyn North,BKN02,10,2019,April,20.000000,34.200001,57.900002,41.200001
2,Brooklyn North,BKN03,10,2019,April,12.200000,33.500000,44.900002,38.200001
3,Brooklyn North,BKN04,10,2019,April,15.500000,35.200001,68.500000,48.799999
4,Brooklyn North,BKN05,10,2019,April,10.100000,22.299999,45.099998,31.500000
...,...,...,...,...,...,...,...,...,...
2827,Queens West,QW06,3,2016,September,20.100000,30.400000,68.000000,39.000000
2828,Queens West,QW09,3,2016,September,17.400000,41.099998,79.699997,54.299999
2829,Staten Island,SI01,3,2016,September,18.700001,39.500000,71.699997,49.700001
2830,Staten Island,SI02,3,2016,September,19.000000,44.500000,75.000000,54.099998


In [19]:
# Register the table Trabalho.Reciclagem in the metastore.
# - Declared EXTERNAL with an explicit LOCATION on HDFS and stored USING DELTA.
# - Fiscal_Year is declared only inside PARTITIONED BY, making it the partition column;
#   the remaining columns are listed in the main column list.
# - The statement has no IF NOT EXISTS clause.
spark.sql(
    """
    CREATE EXTERNAL TABLE Trabalho.Reciclagem (
        Zone VARCHAR(50),
        District VARCHAR(50),
        Fiscal_Month_Number INT,
        Month_Name VARCHAR(50),
        Diversion_Rate_Total FLOAT,
        Capture_Rate_Paper FLOAT,
        Capture_Rate_MGP FLOAT,
        Capture_Rate_Total FLOAT
       

    )
       USING DELTA
   
   PARTITIONED BY (
        Fiscal_Year INT

    )
    LOCATION 'hdfs://hdfs-nn:9000/Qualidade_NYC/silver/Projeto.db/Reciclagem'
    """
)

DataFrame[]

In [20]:
# Write the renamed DataFrame as Delta files, partitioned by Fiscal_Year, replacing any
# data already present.
#
# The target path is exactly the LOCATION registered for Trabalho.Reciclagem. The original
# wrote to a nested ".../Reciclagem/deltalake_table/" directory, which created a second,
# unregistered Delta table inside the first one and left Trabalho.Reciclagem without rows.
(
    sales_df2
    .select(
        "Zone", "District", "Fiscal_Month_Number", "Fiscal_Year", "Month_Name",
        "Diversion_Rate_Total", "Capture_Rate_Paper", "Capture_Rate_MGP", "Capture_Rate_Total",
    )
    .write
    .mode("overwrite")
    .partitionBy("Fiscal_Year")
    .format("delta")
    .save("hdfs://hdfs-nn:9000/Qualidade_NYC/silver/Projeto.db/Reciclagem")
)
from pyspark.sql.types import *


In [21]:
# Switch the session to the Trabalho database, then print the tables registered in it.
spark.sql("USE Trabalho")
registered_tables = spark.sql("SHOW tables")
registered_tables.show()

+---------+----------+-----------+
|namespace| tableName|isTemporary|
+---------+----------+-----------+
| trabalho|reciclagem|      false|
+---------+----------+-----------+

