In [13]:
 pip install delta-spark

Note: you may need to restart the kernel to use updated packages.


In [3]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/Qualidade_NYC/silver'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [4]:
spark.sql(
    """
    create database Trabalho location 'hdfs://hdfs-nn:9000/Qualidade_NYC/silver/Projeto.db'
    """
)


AnalysisException: Namespace 'Trabalho' already exists

In [3]:

spark.sql(
   """
    DROP DATABASE IF EXISTS Trabalho CASCADE
   """
)

spark.sql(
    """
   DROP TABLE IF EXISTS Trabalho.Air_Quality
   """
)


DataFrame[]

In [6]:
hdfs_path = "hdfs://hdfs-nn:9000/Qualidade_NYC/bronze/Air_Quality.csv"

customSchema = StructType([
    StructField("Unique_ID", StringType(), True),
    StructField("Indicator ID", IntegerType(), True),
    StructField("Name", StringType(), True),
    StructField("Measure", StringType(), True),
    StructField("Measure Info", StringType(), True),
    StructField("Geo Type Name", StringType(), True),
    StructField("Geo Join ID", StringType(), True),
    StructField("Geo Place Name", StringType(), True), 
    StructField("Time Period", StringType(), True),
    StructField("Start_Date", StringType(), True),
    StructField("Data Value", FloatType(), True), 
    StructField("Message", StringType(), True)
])
air_quality = spark \
            .read\
            .option("delimiter",";")\
            .option("header","true")\
            .schema(customSchema) \
            .csv(hdfs_path)
air_quality.show()
air_quality.printSchema()

+---------+------------+--------------------+-------+------------+-------------+-----------+--------------------+--------------+----------+----------+-------+
|Unique_ID|Indicator ID|                Name|Measure|Measure Info|Geo Type Name|Geo Join ID|      Geo Place Name|   Time Period|Start_Date|Data Value|Message|
+---------+------------+--------------------+-------+------------+-------------+-----------+--------------------+--------------+----------+----------+-------+
|   216498|         386|          Ozone (O3)|   Mean|         ppb|           CD|        313| Coney Island (CD13)|   Summer 2013|06/01/2013|     34.64|   null|
|   216499|         386|          Ozone (O3)|   Mean|         ppb|           CD|        313| Coney Island (CD13)|   Summer 2014|06/01/2014|     33.22|   null|
|   219969|         386|          Ozone (O3)|   Mean|         ppb|      Borough|          1|               Bronx|   Summer 2013|06/01/2013|     31.25|   null|
|   219970|         386|          Ozone (O3)| 

In [7]:
replaced_air_quality = air_quality.drop("Message")
replaced_air_quality.toPandas()

Unnamed: 0,Unique_ID,Indicator ID,Name,Measure,Measure Info,Geo Type Name,Geo Join ID,Geo Place Name,Time Period,Start_Date,Data Value
0,216498,386,Ozone (O3),Mean,ppb,CD,313,Coney Island (CD13),Summer 2013,06/01/2013,34.639999
1,216499,386,Ozone (O3),Mean,ppb,CD,313,Coney Island (CD13),Summer 2014,06/01/2014,33.220001
2,219969,386,Ozone (O3),Mean,ppb,Borough,1,Bronx,Summer 2013,06/01/2013,31.250000
3,219970,386,Ozone (O3),Mean,ppb,Borough,1,Bronx,Summer 2014,06/01/2014,31.150000
4,164876,383,Sulfur Dioxide (SO2),Mean,ppb,CD,211,Morris Park and Bronxdale (CD11),Winter 2008-09,12/01/2008,5.890000
...,...,...,...,...,...,...,...,...,...,...,...
16117,671118,386,Ozone (O3),Mean,ppb,CD,306,Park Slope and Carroll Gardens (CD6),Summer 2020,06/01/2020,28.700001
16118,671119,386,Ozone (O3),Mean,ppb,CD,305,East New York and Starrett City (CD5),Summer 2020,06/01/2020,29.559999
16119,671120,386,Ozone (O3),Mean,ppb,CD,304,Bushwick (CD4),Summer 2020,06/01/2020,29.650000
16120,671121,386,Ozone (O3),Mean,ppb,CD,303,Bedford Stuyvesant (CD3),Summer 2020,06/01/2020,29.280001


In [8]:
replaced_air_quality2 = replaced_air_quality.withColumn('Year', split(replaced_air_quality['Start_Date'],'/').getItem(2))
replaced_air_quality2.toPandas()


Unnamed: 0,Unique_ID,Indicator ID,Name,Measure,Measure Info,Geo Type Name,Geo Join ID,Geo Place Name,Time Period,Start_Date,Data Value,Year
0,216498,386,Ozone (O3),Mean,ppb,CD,313,Coney Island (CD13),Summer 2013,06/01/2013,34.639999,2013
1,216499,386,Ozone (O3),Mean,ppb,CD,313,Coney Island (CD13),Summer 2014,06/01/2014,33.220001,2014
2,219969,386,Ozone (O3),Mean,ppb,Borough,1,Bronx,Summer 2013,06/01/2013,31.250000,2013
3,219970,386,Ozone (O3),Mean,ppb,Borough,1,Bronx,Summer 2014,06/01/2014,31.150000,2014
4,164876,383,Sulfur Dioxide (SO2),Mean,ppb,CD,211,Morris Park and Bronxdale (CD11),Winter 2008-09,12/01/2008,5.890000,2008
...,...,...,...,...,...,...,...,...,...,...,...,...
16117,671118,386,Ozone (O3),Mean,ppb,CD,306,Park Slope and Carroll Gardens (CD6),Summer 2020,06/01/2020,28.700001,2020
16118,671119,386,Ozone (O3),Mean,ppb,CD,305,East New York and Starrett City (CD5),Summer 2020,06/01/2020,29.559999,2020
16119,671120,386,Ozone (O3),Mean,ppb,CD,304,Bushwick (CD4),Summer 2020,06/01/2020,29.650000,2020
16120,671121,386,Ozone (O3),Mean,ppb,CD,303,Bedford Stuyvesant (CD3),Summer 2020,06/01/2020,29.280001,2020


In [9]:
NewColumns=(column.replace(' ','_') for column in replaced_air_quality2.columns)
replaced_air_quality3 = replaced_air_quality2.toDF(*NewColumns)
replaced_air_quality3.toPandas()

Unnamed: 0,Unique_ID,Indicator_ID,Name,Measure,Measure_Info,Geo_Type_Name,Geo_Join_ID,Geo_Place_Name,Time_Period,Start_Date,Data_Value,Year
0,216498,386,Ozone (O3),Mean,ppb,CD,313,Coney Island (CD13),Summer 2013,06/01/2013,34.639999,2013
1,216499,386,Ozone (O3),Mean,ppb,CD,313,Coney Island (CD13),Summer 2014,06/01/2014,33.220001,2014
2,219969,386,Ozone (O3),Mean,ppb,Borough,1,Bronx,Summer 2013,06/01/2013,31.250000,2013
3,219970,386,Ozone (O3),Mean,ppb,Borough,1,Bronx,Summer 2014,06/01/2014,31.150000,2014
4,164876,383,Sulfur Dioxide (SO2),Mean,ppb,CD,211,Morris Park and Bronxdale (CD11),Winter 2008-09,12/01/2008,5.890000,2008
...,...,...,...,...,...,...,...,...,...,...,...,...
16117,671118,386,Ozone (O3),Mean,ppb,CD,306,Park Slope and Carroll Gardens (CD6),Summer 2020,06/01/2020,28.700001,2020
16118,671119,386,Ozone (O3),Mean,ppb,CD,305,East New York and Starrett City (CD5),Summer 2020,06/01/2020,29.559999,2020
16119,671120,386,Ozone (O3),Mean,ppb,CD,304,Bushwick (CD4),Summer 2020,06/01/2020,29.650000,2020
16120,671121,386,Ozone (O3),Mean,ppb,CD,303,Bedford Stuyvesant (CD3),Summer 2020,06/01/2020,29.280001,2020


In [10]:

spark.sql(
    """
    CREATE EXTERNAL TABLE Trabalho.Qualidade_Ar (
        Unique_ID VARCHAR(50),
        Indicator_ID INT,
        Name VARCHAR(50),
        Measure VARCHAR(50),
        Measure_Info VARCHAR(50),
        Geo_Type_Name VARCHAR(50),
        Geo_Join_ID VARCHAR(50),
        Geo_Place_Name VARCHAR(50),
        Time_Period VARCHAR(50),
        Start_Date VARCHAR(50),
        Data_Value FLOAT
    )
    USING DELTA
   
   PARTITIONED BY (
        Year INT

    )
    LOCATION 'hdfs://hdfs-nn:9000/Qualidade_NYC/silver/Projeto.db/Qualidade_Ar'
    """
)

DataFrame[]

In [11]:
#write df to hive deltalake_table
replaced_air_quality3\
    .select("Unique_ID","Indicator_ID","Name","Measure","Measure_Info","Geo_Type_Name",
            "Geo_Join_ID","Geo_Place_Name","Time_Period","Start_Date","Data_Value","Year") \
    .write \
    .mode("overwrite") \
    .partitionBy("Year") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/Qualidade_NYC/silver/Projeto.db/Qualidade_Ar/deltalake_table/")
from pyspark.sql.types import *

In [12]:
spark.sql("USE Trabalho")
spark.sql("SHOW tables").show()

+---------+--------------+-----------+
|namespace|     tableName|isTemporary|
+---------+--------------+-----------+
| trabalho|gold_recycling|      false|
| trabalho|  qualidade_ar|      false|
| trabalho|    reciclagem|      false|
+---------+--------------+-----------+

