In [1]:
 pip install delta-spark

Note: you may need to restart the kernel to use updated packages.


In [2]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/Qualidade_NYC/silver'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
spark.sql(
    """
    create database Projeto location 'hdfs://hdfs-nn:9000/Qualidade_NYC/silver/Projeto.db'
    """
)


AnalysisException: Namespace 'Projeto' already exists

In [None]:
#spark.sql(
#   """
#   DROP DATABASE IF EXISTS Trabalho CASCADE
#   """
#)

spark.sql(
    """
    DROP TABLE IF EXISTS Trabalho.Doencas
    """
)

In [None]:
hdfs_path = "hdfs://hdfs-nn:9000/Qualidade_NYC/bronze/Community_Health_Lung_and_Bronchus_Cancer_Incidence_Rate_per_100_000_by_County_Map_Latest_Data.csv"

customSchema = StructType([
    StructField("County Name", StringType(), True),
    StructField("Health Topic Number", IntegerType(), True),
    StructField("Health Topic", StringType(), True),
    StructField("Indicator Number", StringType(), True),
    StructField("Indicator", StringType(), True),
    StructField("Event Count", IntegerType(), True),
    StructField("Average Number of Denominator", IntegerType(), True),
    StructField("Measure Unit", StringType(), True),
    StructField("Percent/Rate", FloatType(), True),
    StructField("Lower Limit of 95% CI", FloatType(), True), 
    StructField("Upper Limit of 95% CI", FloatType(), True),
    StructField("Data Comments", StringType(), True),
    StructField("Quartile", StringType(), True),
    StructField("Data Years", StringType(), True),
    StructField("Data Source", StringType(), True),
    StructField("Mapping Distribution", IntegerType(), True),
    StructField("Location", StringType(), True)
])
doencas = spark \
            .read\
            .option("delimiter",",")\
            .option("header","true")\
            .schema(customSchema) \
            .csv(hdfs_path)
doencas.show()
doencas.printSchema()

In [None]:
replaced_doencas = doencas.drop("Lower Limit of 95% CI", "Upper Limit of 95% CI", "Data Comments" )
replaced_doencas.toPandas()

In [None]:
NewColumns=(column.replace(' ','_') for column in replaced_doencas.columns)
replaced_doencas2 = replaced_doencas.toDF(*NewColumns)

replaced_doencas3 = replaced_doencas2.withColumnRenamed("Percent/Rate","Percent_Rate") 
replaced_doencas3.toPandas()

In [None]:
spark.sql(
    """
    CREATE EXTERNAL TABLE Trabalho.Doencas (
        County_Name STRING,
        Health_Topic_Number INT,
        Health_Topic STRING,
        Indicator_Number STRING,
        Indicator STRING,
        Event_Count INT,
        Average_Number_of_Denominator INT,
        Measure_Unit STRING,
        Percent_Rate FLOAT,
        Quartile STRING,
        Data_Years STRING,
        Data_Source STRING,
        Mapping_Distribution INT,
        Location STRING    
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/Qualidade_NYC/silver/Projeto.db/Doencas'
    """
)

In [None]:
#write df to hive deltalake_table
replaced_doencas3\
    .select("County_Name","Health_Topic_Number","Health_Topic","Indicator_Number","Indicator","Event_Count",
            "Average_Number_of_Denominator","Measure_Unit","Percent_Rate","Quartile","Data_Years","Data_Source",
            "Mapping_Distribution","Location") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/Qualidade_NYC/silver/Projeto.db/Doencas/")
from pyspark.sql.types import *

In [None]:
spark.sql("USE Trabalho")
spark.sql("SHOW tables").show()