In [1]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Default location for managed databases and tables. This has to be a
# directory, not a data file: it is set to the silver-layer root, which is the
# parent of the `projeto.db` location used when the `projeto` database is created.
warehouse_location = 'hdfs://hdfs-nn:9000/Qualidade_NYC/silver'

# Session builder: two local cores, Hive metastore reached over thrift, and the
# Delta Lake SQL extension + catalog registered so `USING DELTA` tables work.
# Parentheses are used instead of backslash continuations so that no line
# depends on what follows a trailing `\`.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# `configure_spark_with_delta_pip` comes from the `delta` package imported above;
# presumably it wires the pip-installed Delta jars into the builder — TODO confirm
# how it interacts with the explicit `spark.jars.packages` setting.
spark = configure_spark_with_delta_pip(builder).getOrCreate()




In [None]:

# Ensure the `projeto` database exists, stored under the silver layer on HDFS.
# IF NOT EXISTS makes this cell safe to re-run.
create_projeto_db_sql = """
    CREATE DATABASE IF NOT EXISTS projeto location 'hdfs://hdfs-nn:9000/Qualidade_NYC/silver/projeto.db'
    """
spark.sql(create_projeto_db_sql)


# Manual reset, intentionally left disabled: uncomment to drop the database
# together with every table it contains.
#spark.sql(
#    """
#    DROP DATABASE IF EXISTS projeto CASCADE
#    """
#)



In [None]:
# Recreate the `projeto.children` Delta table definition from scratch.
# The table is dropped first so the CREATE below never collides with an
# existing definition when the cell is re-run.
drop_children_table_sql = """
    DROP TABLE IF EXISTS projeto.children
    """

# Column names mirror the sanitised CSV headers produced later in the notebook
# (">=" spelled out, "(BLL)", the unit and commas removed, spaces -> "_").
# NOTE(review): LOCATION uses `Projeto.db` (capital P) while the database itself
# is created under `projeto.db`; the later write targets the same capitalised
# path, so both must be changed together if this is unintended — confirm.
create_children_table_sql = """
    CREATE EXTERNAL TABLE projeto.children (
        geo_type STRING,
        geo_area_id INT,
        geo_area_name STRING,
        borough_id INT,
        time_period INT,
        Children_under_6_years_with_elevated_blood_lead_levels__Number_BLL__Bigger_or_equal_5_ INT,
        Children_under_6_years_with_elevated_blood_lead_levels__Number_BLL_Bigger_or_equal_10_ INT,
        Children_under_6_years_with_elevated_blood_lead_levels__Number_BLL_Bigger_or_equal_15_ INT,
        Children_under_6_years_with_elevated_blood_lead_levels__Number_Tested FLOAT,
        Children_under_6_years_with_elevated_blood_lead_levels__Rate__BLL_Bigger_or_equal_5__per_1000_tested FLOAT,
        Children_under_6_years_with_elevated_blood_lead_levels__Rate_BLL_Bigger_or_equal_10__per_1000_tested FLOAT,
        Children_under_6_years_with_elevated_blood_lead_levels__Rate_BLL_Bigger_or_equal_15__per_1000_tested FLOAT
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/Qualidade_NYC/silver/Projeto.db/Children/'
    """

# Run the two DDL statements in order: drop, then create.
for ddl_statement in (drop_children_table_sql, create_children_table_sql):
    spark.sql(ddl_statement)

# Bronze-layer CSV holding the raw blood-lead-level dataset.
hdfs_path = "hdfs://hdfs-nn:9000/Qualidade_NYC/bronze/Children_Under_6_yrs_with_Elevated_Blood_Lead_Levels__BLL_.csv"

# Leading identifier columns, in file order: (column name, Spark type).
identifier_columns = [
    ('geo_type', StringType()),
    ('geo_area_id', IntegerType()),
    ('geo_area_name', StringType()),
    ('borough_id', IntegerType()),
    ('time_period', IntegerType()),
]

# Measure columns, in file order. Each measure is immediately followed in the
# file by a free-text notes column whose name is the measure name plus the
# given suffix (the suffix spelling differs between count and rate columns).
measure_columns = [
    ('Children under 6 years with elevated blood lead levels (BLL) Number BLL >=5 Âµg/dL', IntegerType(), ' _NOTES'),
    ('Children under 6 years with elevated blood lead levels (BLL) Number BLL>=10 Âµg/dL', IntegerType(), ' _NOTES'),
    ('Children under 6 years with elevated blood lead levels (BLL) Number BLL>=15 Âµg/dL', IntegerType(), ' _NOTES'),
    ('Children under 6 years with elevated blood lead levels (BLL) Number Tested', FloatType(), ' _NOTES'),
    ('Children under 6 years with elevated blood lead levels (BLL) Rate  BLL>=5 Âµg/dL per 1,000 tested', FloatType(), '_NOTES'),
    ('Children under 6 years with elevated blood lead levels (BLL) Rate BLL>=10 Âµg/dL per 1,000 tested', FloatType(), '_NOTES'),
    ('Children under 6 years with elevated blood lead levels (BLL) Rate BLL>=15 Âµg/dL per 1,000 tested', FloatType(), '_NOTES'),
]

# Assemble the explicit schema; every field is nullable.
schema_fields = [StructField(name, dtype, True) for name, dtype in identifier_columns]
for measure_name, measure_type, notes_suffix in measure_columns:
    schema_fields.append(StructField(measure_name, measure_type, True))
    schema_fields.append(StructField(measure_name + notes_suffix, StringType(), True))
customSchema = StructType(schema_fields)

# Read the comma-separated file, treating its first line as a header and
# applying the explicit schema above.
children = (
    spark.read
    .option("delimiter", ",")
    .option("header", "true")
    .schema(customSchema)
    .csv(hdfs_path)
)

# Quick sanity check of the loaded rows and the resulting column types.
children.show()
children.printSchema()

In [2]:
# Starting point for the column-renaming steps that follow. This is a plain
# name binding to the frame loaded from the CSV; no copy is made.
replaced_children = children
# Convert to a pandas DataFrame; as the cell's last expression it becomes the
# cell output.
replaced_children.toPandas()

Unnamed: 0,geo_type,geo_area_id,geo_area_name,borough_id,time_period,Children under 6 years with elevated blood lead levels (BLL) Number BLL >=5 Âµg/dL,Children under 6 years with elevated blood lead levels (BLL) Number BLL >=5 Âµg/dL _NOTES,Children under 6 years with elevated blood lead levels (BLL) Number BLL>=10 Âµg/dL,Children under 6 years with elevated blood lead levels (BLL) Number BLL>=10 Âµg/dL _NOTES,Children under 6 years with elevated blood lead levels (BLL) Number BLL>=15 Âµg/dL,Children under 6 years with elevated blood lead levels (BLL) Number BLL>=15 Âµg/dL _NOTES,Children under 6 years with elevated blood lead levels (BLL) Number Tested,Children under 6 years with elevated blood lead levels (BLL) Number Tested _NOTES,"Children under 6 years with elevated blood lead levels (BLL) Rate BLL>=5 Âµg/dL per 1,000 tested","Children under 6 years with elevated blood lead levels (BLL) Rate BLL>=5 Âµg/dL per 1,000 tested_NOTES","Children under 6 years with elevated blood lead levels (BLL) Rate BLL>=10 Âµg/dL per 1,000 tested","Children under 6 years with elevated blood lead levels (BLL) Rate BLL>=10 Âµg/dL per 1,000 tested_NOTES","Children under 6 years with elevated blood lead levels (BLL) Rate BLL>=15 Âµg/dL per 1,000 tested","Children under 6 years with elevated blood lead levels (BLL) Rate BLL>=15 Âµg/dL per 1,000 tested_NOTES"
0,Borough,1,Bronx,1.0,2005,8245,,595,,167,,64500.0,,127.699997,,9.2,,2.6,
1,Borough,1,Bronx,1.0,2006,7272,,474,,144,,67200.0,,108.199997,,7.1,,2.1,
2,Borough,1,Bronx,1.0,2007,6174,,438,,135,,68300.0,,90.400002,,6.4,,2.0,
3,Borough,1,Bronx,1.0,2008,4254,,292,,105,,69800.0,,60.900002,,4.2,,1.5,
4,Borough,1,Bronx,1.0,2009,2742,,278,,103,,70000.0,,39.200001,,4.0,,1.5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
571,Neighborhood (UHF 42),104,Pelham - Throgs Neck,1.0,2005,1314,,86,,24,,11600.0,,113.199997,,7.4,,2.1,
572,Neighborhood (UHF 42),301,Washington Heights,3.0,2006,1115,,104,,36,,11200.0,,99.800003,,9.3,,3.2,
573,Neighborhood (UHF 42),407,Southwest Queens,4.0,2013,239,,39,,15,,11400.0,,20.900000,,3.4,,1.3,
574,Neighborhood (UHF 42),106,High Bridge - Morrisania,1.0,2013,281,,31,,9,*Estimate is based on small numbers so should ...,11800.0,,23.799999,,2.6,,0.8,*Estimate is based on small numbers so should ...


In [3]:
# Ordered (old, new) substitutions applied to every column name. Order matters:
# spaces are turned into underscores last, after the other fragments (which
# themselves contain or leave behind spaces) have been handled.
COLUMN_NAME_REPLACEMENTS = (
    ('>=', ' Bigger or equal '),  # spell out the comparison operator
    ('(BLL)', ''),                # drop the parenthesised abbreviation
    ('Âµg/dL', ''),               # drop the unit, spelled as it appears in the loaded names
    (',', ''),                    # "1,000" -> "1000"
    (' ', '_'),                   # no spaces in the final names
)


def sanitize_column_name(name):
    """Return `name` with every COLUMN_NAME_REPLACEMENTS substitution applied in order.

    Parameters:
        name: original column name (str).

    Returns:
        The rewritten column name (str).
    """
    for old, new in COLUMN_NAME_REPLACEMENTS:
        name = name.replace(old, new)
    return name


# Rename all columns in a single pass instead of one full toDF() per substitution.
replaced_children2 = replaced_children.toDF(
    *[sanitize_column_name(column) for column in replaced_children.columns]
)

replaced_children2.toPandas()

Unnamed: 0,geo_type,geo_area_id,geo_area_name,borough_id,time_period,Children_under_6_years_with_elevated_blood_lead_levels__Number_BLL__Bigger_or_equal_5_,Children_under_6_years_with_elevated_blood_lead_levels__Number_BLL__Bigger_or_equal_5___NOTES,Children_under_6_years_with_elevated_blood_lead_levels__Number_BLL_Bigger_or_equal_10_,Children_under_6_years_with_elevated_blood_lead_levels__Number_BLL_Bigger_or_equal_10___NOTES,Children_under_6_years_with_elevated_blood_lead_levels__Number_BLL_Bigger_or_equal_15_,Children_under_6_years_with_elevated_blood_lead_levels__Number_BLL_Bigger_or_equal_15___NOTES,Children_under_6_years_with_elevated_blood_lead_levels__Number_Tested,Children_under_6_years_with_elevated_blood_lead_levels__Number_Tested__NOTES,Children_under_6_years_with_elevated_blood_lead_levels__Rate__BLL_Bigger_or_equal_5__per_1000_tested,Children_under_6_years_with_elevated_blood_lead_levels__Rate__BLL_Bigger_or_equal_5__per_1000_tested_NOTES,Children_under_6_years_with_elevated_blood_lead_levels__Rate_BLL_Bigger_or_equal_10__per_1000_tested,Children_under_6_years_with_elevated_blood_lead_levels__Rate_BLL_Bigger_or_equal_10__per_1000_tested_NOTES,Children_under_6_years_with_elevated_blood_lead_levels__Rate_BLL_Bigger_or_equal_15__per_1000_tested,Children_under_6_years_with_elevated_blood_lead_levels__Rate_BLL_Bigger_or_equal_15__per_1000_tested_NOTES
0,Borough,1,Bronx,1.0,2005,8245,,595,,167,,64500.0,,127.699997,,9.2,,2.6,
1,Borough,1,Bronx,1.0,2006,7272,,474,,144,,67200.0,,108.199997,,7.1,,2.1,
2,Borough,1,Bronx,1.0,2007,6174,,438,,135,,68300.0,,90.400002,,6.4,,2.0,
3,Borough,1,Bronx,1.0,2008,4254,,292,,105,,69800.0,,60.900002,,4.2,,1.5,
4,Borough,1,Bronx,1.0,2009,2742,,278,,103,,70000.0,,39.200001,,4.0,,1.5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
571,Neighborhood (UHF 42),104,Pelham - Throgs Neck,1.0,2005,1314,,86,,24,,11600.0,,113.199997,,7.4,,2.1,
572,Neighborhood (UHF 42),301,Washington Heights,3.0,2006,1115,,104,,36,,11200.0,,99.800003,,9.3,,3.2,
573,Neighborhood (UHF 42),407,Southwest Queens,4.0,2013,239,,39,,15,,11400.0,,20.900000,,3.4,,1.3,
574,Neighborhood (UHF 42),106,High Bridge - Morrisania,1.0,2013,281,,31,,9,*Estimate is based on small numbers so should ...,11800.0,,23.799999,,2.6,,0.8,*Estimate is based on small numbers so should ...


In [4]:
# Free-text notes columns that accompany each measure; they are not part of the
# `projeto.children` table definition, so they are removed here.
notes_columns = (
    "Children_under_6_years_with_elevated_blood_lead_levels__Number_BLL__Bigger_or_equal_5___NOTES",
    "Children_under_6_years_with_elevated_blood_lead_levels__Number_BLL_Bigger_or_equal_10___NOTES",
    "Children_under_6_years_with_elevated_blood_lead_levels__Number_BLL_Bigger_or_equal_15___NOTES",
    "Children_under_6_years_with_elevated_blood_lead_levels__Number_Tested__NOTES",
    "Children_under_6_years_with_elevated_blood_lead_levels__Rate__BLL_Bigger_or_equal_5__per_1000_tested_NOTES",
    "Children_under_6_years_with_elevated_blood_lead_levels__Rate_BLL_Bigger_or_equal_10__per_1000_tested_NOTES",
    "Children_under_6_years_with_elevated_blood_lead_levels__Rate_BLL_Bigger_or_equal_15__per_1000_tested_NOTES",
)

# Drop all notes columns in one call, leaving identifiers and measures only.
replaced_children3 = replaced_children2.drop(*notes_columns)
replaced_children3.toPandas()

Unnamed: 0,geo_type,geo_area_id,geo_area_name,borough_id,time_period,Children_under_6_years_with_elevated_blood_lead_levels__Number_BLL__Bigger_or_equal_5_,Children_under_6_years_with_elevated_blood_lead_levels__Number_BLL_Bigger_or_equal_10_,Children_under_6_years_with_elevated_blood_lead_levels__Number_BLL_Bigger_or_equal_15_,Children_under_6_years_with_elevated_blood_lead_levels__Number_Tested,Children_under_6_years_with_elevated_blood_lead_levels__Rate__BLL_Bigger_or_equal_5__per_1000_tested,Children_under_6_years_with_elevated_blood_lead_levels__Rate_BLL_Bigger_or_equal_10__per_1000_tested,Children_under_6_years_with_elevated_blood_lead_levels__Rate_BLL_Bigger_or_equal_15__per_1000_tested
0,Borough,1,Bronx,1.0,2005,8245,595,167,64500.0,127.699997,9.2,2.6
1,Borough,1,Bronx,1.0,2006,7272,474,144,67200.0,108.199997,7.1,2.1
2,Borough,1,Bronx,1.0,2007,6174,438,135,68300.0,90.400002,6.4,2.0
3,Borough,1,Bronx,1.0,2008,4254,292,105,69800.0,60.900002,4.2,1.5
4,Borough,1,Bronx,1.0,2009,2742,278,103,70000.0,39.200001,4.0,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...
571,Neighborhood (UHF 42),104,Pelham - Throgs Neck,1.0,2005,1314,86,24,11600.0,113.199997,7.4,2.1
572,Neighborhood (UHF 42),301,Washington Heights,3.0,2006,1115,104,36,11200.0,99.800003,9.3,3.2
573,Neighborhood (UHF 42),407,Southwest Queens,4.0,2013,239,39,15,11400.0,20.900000,3.4,1.3
574,Neighborhood (UHF 42),106,High Bridge - Morrisania,1.0,2013,281,31,9,11800.0,23.799999,2.6,0.8


In [18]:
# Persist the cleaned frame as Delta files at the location the
# `projeto.children` table was declared with.

# Columns to store, in the same order as the table definition.
children_table_columns = [
    "geo_type",
    "geo_area_id",
    "geo_area_name",
    "borough_id",
    "time_period",
    "Children_under_6_years_with_elevated_blood_lead_levels__Number_BLL__Bigger_or_equal_5_",
    "Children_under_6_years_with_elevated_blood_lead_levels__Number_BLL_Bigger_or_equal_10_",
    "Children_under_6_years_with_elevated_blood_lead_levels__Number_BLL_Bigger_or_equal_15_",
    "Children_under_6_years_with_elevated_blood_lead_levels__Number_Tested",
    "Children_under_6_years_with_elevated_blood_lead_levels__Rate__BLL_Bigger_or_equal_5__per_1000_tested",
    "Children_under_6_years_with_elevated_blood_lead_levels__Rate_BLL_Bigger_or_equal_10__per_1000_tested",
    "Children_under_6_years_with_elevated_blood_lead_levels__Rate_BLL_Bigger_or_equal_15__per_1000_tested",
]

# Same path as the LOCATION clause of the table's CREATE statement.
children_table_path = "hdfs://hdfs-nn:9000/Qualidade_NYC/silver/Projeto.db/Children/"

# "overwrite" replaces whatever was previously stored at the path.
(
    replaced_children3
    .select(*children_table_columns)
    .write
    .format("delta")
    .mode("overwrite")
    .save(children_table_path)
)
from pyspark.sql.types import *

In [5]:
# Switch the session's current database to `trabalho` and print its tables.
# NOTE(review): the earlier cells create and populate `projeto.children`, yet
# this check inspects `trabalho` — confirm whether `USE projeto` was intended.
spark.sql("USE trabalho")
spark.sql("SHOW tables").show()

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
| trabalho|     gold_airquality|      false|
| trabalho|gold_airquality_p...|      false|
| trabalho|      gold_recycling|      false|
| trabalho|gold_recycling_pr...|      false|
| trabalho|   gold_waterquality|      false|
| trabalho|gold_waterquality...|      false|
| trabalho|      qualidade_agua|      false|
| trabalho|        qualidade_ar|      false|
| trabalho|          reciclagem|      false|
+---------+--------------------+-----------+

