In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# HDFS directory passed to Spark as spark.sql.warehouse.dir.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Session builder: Hive metastore access plus the Delta Lake SQL extension and catalog.
# The chain is wrapped in parentheses instead of backslash continuations, so there is
# no dangling "\" after the last call that could swallow the following statement.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Single assignment (the original repeated the target: `spark = spark = ...`).
# configure_spark_with_delta_pip is brought in by the `from delta import *` above.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
#===========================CRIAÇÃO DA DATABASE===========================

In [3]:
# Start from a clean slate: drop the Projeto database (CASCADE) if it exists.
drop_database_sql = """
    DROP DATABASE IF EXISTS Projeto CASCADE
    """
spark.sql(drop_database_sql)

DataFrame[]

In [4]:
# Create the Projeto database at an explicit HDFS location.
create_database_sql = """
    CREATE DATABASE Projeto LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/'
    """
spark.sql(create_database_sql)

DataFrame[]

In [5]:
#===========================================CRIAÇÃO DAS TABELAS============================================

In [6]:
#----------------------------------------------HOMICIDE_DATASET----------------------------------------

In [7]:
# Drop the homicide victims table (if present) before it is recreated in the next cell.
drop_homicide_victim_sql = """
    DROP TABLE IF EXISTS Projeto.Homicide_Victim_table
"""
spark.sql(drop_homicide_victim_sql)

DataFrame[]

In [8]:
# Delta table for the homicide victims dataset, partitioned by Borough.
homicide_victim_ddl = """
    CREATE TABLE Projeto.Homicide_Victim_table (
        Homicide_Victim_Id STRING,
        Count_of_Victims INT,
        Age_Group STRING,
        Victim_Gender STRING,
        Method_of_Killing STRING, 
        Domestic_Abuse STRING,
        Recorded_Date STRING, 
        Homicide_Offence_Type STRING,
        Solved_Status STRING, 
        Borough STRING, 
        Officer_Observed_Ethnicity STRING, 
        Recorded_Month INT, 
        Recorded_Year INT,
        Location STRING,
        Uniform_Age_Group STRING
    )
    USING DELTA
    PARTITIONED BY (Borough) 
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/Homicide_Victim_table'
"""
spark.sql(homicide_victim_ddl)


DataFrame[]

In [9]:
# Drop the homicide proceedings table (if present) before it is recreated in the next cell.
drop_homicide_proceeded_sql = """
    DROP TABLE IF EXISTS Projeto.Homicide_Proceeded_table
"""
spark.sql(drop_homicide_proceeded_sql)

DataFrame[]

In [10]:
# Delta table for the homicide proceedings dataset, partitioned by Borough.
homicide_proceeded_ddl = """
    CREATE TABLE Projeto.Homicide_Proceeded_table (
        Homicide_Proceeded_ID STRING,
        People_Accused_Count INT,
        Borough STRING,
        Age_Group STRING,
        Agressor_Gender STRING,
        Self_Classified_Ethnicity STRING,
        Offence_Type STRING,
        Charged_Summonsed STRING,
        Proceedings_Date STRING,
        Proceedings_Month INT,
        Proceedings_Year INT, 
        Location STRING, 
        Uniform_Age_Group STRING
    )
    USING DELTA
    PARTITIONED BY (Borough) 
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/Homicide_Proceeded_table'
"""
spark.sql(homicide_proceeded_ddl)

DataFrame[]

In [11]:
#----------------------------------------------ETAPA GOLD----------------------------------------

In [12]:
# Drop the person dimension (if present) before it is recreated in the next cell.
drop_person_sql = """
    DROP TABLE IF EXISTS Projeto.Person_table
"""
spark.sql(drop_person_sql)

DataFrame[]

In [13]:
# Person dimension for the homicide gold layer.
# NOTE(review): the column is spelled "Ethinicity" — confirm whether loaders rely on this spelling.
person_ddl = """
    CREATE TABLE Projeto.Person_table (
        Id_Person INT,
        Gender STRING,
        Age_Group STRING,
        Ethinicity STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/Person_table'
"""
spark.sql(person_ddl)

DataFrame[]

In [14]:
# Drop the date dimension (if present) before it is recreated in the next cell.
drop_date_sql = """
    DROP TABLE IF EXISTS Projeto.Date_table
"""
spark.sql(drop_date_sql)

DataFrame[]

In [15]:
# Date dimension for the homicide gold layer (day / month / year).
date_ddl = """
    CREATE TABLE Projeto.Date_table (
        Id_Date INT,
        Day INT,
        Month INT,
        Year INT
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/Date_table'
"""
spark.sql(date_ddl)

DataFrame[]

In [16]:
# Drop the location dimension (if present) before it is recreated in the next cell.
# The table name is qualified with the Projeto database, matching the CREATE that
# follows; the unqualified name would resolve against the session's current database.
spark.sql("""
    DROP TABLE IF EXISTS Projeto.Location_table
""")

DataFrame[]

In [17]:
# Location dimension for the homicide gold layer.
location_ddl = """
    CREATE TABLE Projeto.Location_table (
        Id_Location INT,
        Location STRING,
        Borough STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/Location_table'
"""
spark.sql(location_ddl)

DataFrame[]

In [18]:
# Drop the homicides dimension (if present) before it is recreated in the next cell.
drop_homicides_sql = """
    DROP TABLE IF EXISTS Projeto.Homicides_table
"""
spark.sql(drop_homicides_sql)

DataFrame[]

In [19]:
# Homicides dimension: method of killing, offense type and solved status.
homicides_ddl = """
    CREATE TABLE Projeto.Homicides_table (
        Id_Homicide STRING,
        Method_Of_Killing STRING,
        Offense_Type STRING,
        Solved_Status STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/Homicides_table'
"""
spark.sql(homicides_ddl)

DataFrame[]

In [20]:
# Drop the homicide fact table (if present) before it is recreated in the next cell.
drop_homicide_facts_sql = """
    DROP TABLE IF EXISTS Projeto.Homicide_Facts_table
"""
spark.sql(drop_homicide_facts_sql)

DataFrame[]

In [21]:
# Homicide fact table: dimension keys followed by the two measures.
# NOTE(review): keys here are Id_Pessoa / Id_Data while the dimension tables use
# Id_Person / Id_Date — confirm the intended naming with the loading code.
homicide_facts_ddl = """
    CREATE TABLE Projeto.Homicide_Facts_table (
        Id_Homicide_Fact STRING,
        Id_Pessoa INT,
        Id_Data INT,
        Id_Location INT,
        Id_Crime_Type INT,
            
        Count_of_Victims INT, 
        Percentage_of_Total_Victims DOUBLE
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/Homicide_Facts_table'
"""
spark.sql(homicide_facts_ddl)

DataFrame[]

In [22]:
# Drop the proceedings fact table (if present) before it is recreated in the next cell.
drop_proceeded_facts_sql = """
    DROP TABLE IF EXISTS Projeto.Proceeded_Facts_table
"""
spark.sql(drop_proceeded_facts_sql)

DataFrame[]

In [23]:
# Proceedings fact table: dimension keys, two descriptive columns, then the measures.
proceeded_facts_ddl = """
    CREATE TABLE Projeto.Proceeded_Facts_table (
        Id_Proceeded_Fact STRING,
        Id_Pessoa INT,
        Id_Data INT,
        Id_Location INT,
        Id_Crime_Type INT,
        Offense_Type STRING,
        Charged_Summonsed STRING,
        
        People_Accused_Count INT,
        Percentage_of_Total_Proceeds DOUBLE
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/Proceeded_Facts_table'
"""
spark.sql(proceeded_facts_ddl)

DataFrame[]

In [24]:
#----------------------------------------------CRIME_DATASET----------------------------------------

In [25]:
# Drop the outcomes table (if present) before it is recreated in the next cell.
drop_outcomes_sql = """
    DROP TABLE IF EXISTS Projeto.Outcomes_table
"""
spark.sql(drop_outcomes_sql)

DataFrame[]

In [26]:
# Delta table for the crime outcomes dataset.
# NOTE(review): Month is STRING here but INT in Street_table and Stop_and_search_table — confirm.
outcomes_ddl = """
    CREATE TABLE Projeto.Outcomes_table (
        CrimeID STRING,
        Reported_by STRING,
        Falls_within STRING,
        Longitude STRING,
        Latitude STRING,
        Near_of STRING,
        LSOA_code STRING,
        LSOA_name STRING,
        Outcome_type STRING,
        Year INT,
        Month STRING,
        Location STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/Outcomes_table'
"""
spark.sql(outcomes_ddl)




DataFrame[]

In [27]:
# Drop the stop-and-search table (if present) before it is recreated in the next cell.
drop_stop_and_search_sql = """
    DROP TABLE IF EXISTS Projeto.Stop_and_search_table
"""
spark.sql(drop_stop_and_search_sql)

DataFrame[]

In [28]:
# Delta table for the stop-and-search dataset.
stop_and_search_ddl = """
    CREATE TABLE Projeto.Stop_and_search_table (
        Type STRING,
        Date STRING,
        STOP_Operation STRING,
        Policing_operation STRING,
        Latitude STRING,
        Longitude STRING,
        Gender STRING,
        Age_range STRING,
        Self_defined_ethnicity STRING,
        Officer_defined_ethnicity STRING,
        Legislation STRING,
        Object_of_search STRING,
        Outcome STRING,
        Outcome_of_Searched_object STRING,
        Removal_of_more_than_just_outer_clothing STRING,
        Year INT,
        Month INT,
        Age_Group_agrouped STRING,
        Location STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/Stop_and_search_table'
"""
spark.sql(stop_and_search_ddl)


DataFrame[]

In [29]:
# Drop the street-level crime table (if present) before it is recreated in the next cell.
drop_street_sql = """
    DROP TABLE IF EXISTS Projeto.Street_table
"""
spark.sql(drop_street_sql)

DataFrame[]

In [30]:
# Delta table for the street-level crime dataset.
street_ddl = """
    CREATE TABLE Projeto.Street_table (
        CrimeID STRING,
        Time STRING,
        Reported_by STRING,
        Falls_within STRING,
        Longitude STRING,
        Latitude STRING,
        Near_of STRING,
        LSOA_code STRING,
        LSOA_name STRING,
        Crime_type STRING,
        Last_outcome_category STRING,
        Context STRING,
        Year INT,
        Month INT,
        Location STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/Street_table'
"""
spark.sql(street_ddl)


DataFrame[]

In [31]:
# Drop the gender statistics data table if present.
# Its matching CREATE lives further down, in the GENDER STATS section of this notebook.
drop_gender_stats_data_sql = """
    DROP TABLE IF EXISTS Projeto.Gender_Stats_Data_table
"""
spark.sql(drop_gender_stats_data_sql)

DataFrame[]

In [32]:
#-----------------------------------------------------ETAPA GOLD-------------------------------------------------

In [33]:
# Drop the table that the next cell creates (Projeto.dim_data).
# The original statement targeted Projeto.dim_pessoa_table, a name no cell in this
# notebook creates, so re-running the CREATE on its own would hit an existing table.
spark.sql("""
    DROP TABLE IF EXISTS Projeto.dim_data
""")

DataFrame[]

In [34]:
# Date dimension for the crime gold layer.
dim_data_ddl = """
    CREATE TABLE Projeto.dim_data (
        id_data INT,
        Data DATE,
        Year INT,
        Month INT
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/dim_data'
"""
spark.sql(dim_data_ddl)

DataFrame[]

In [35]:
# Drop the table that the next cell creates (Projeto.dim_local).
# The original statement targeted Projeto.local_table, which no cell in this notebook creates.
spark.sql("""
    DROP TABLE IF EXISTS Projeto.dim_local
""")

DataFrame[]

In [36]:
# Location dimension for the crime gold layer.
dim_local_ddl = """
    CREATE TABLE Projeto.dim_local (
        id_local INT,
        Latitude STRING,
        Longitude STRING,
        Location STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/dim_local'
"""
spark.sql(dim_local_ddl)

DataFrame[]

In [37]:
# Drop the table that the next cell creates (Projeto.dim_pessoa).
# The original statement targeted Projeto.pessoa_table, which no cell in this notebook creates.
spark.sql("""
    DROP TABLE IF EXISTS Projeto.dim_pessoa
""")

DataFrame[]

In [38]:
# Person dimension for the crime gold layer.
dim_pessoa_ddl = """
    CREATE TABLE Projeto.dim_pessoa (
        id_pessoa INT,
        Gender STRING,
        Age_category STRING,
        Self_defined_ethnicity STRING,
        Officer_defined_ethnicity STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/dim_pessoa'
"""
spark.sql(dim_pessoa_ddl)


DataFrame[]

In [39]:
# Drop the table that the next cell creates (Projeto.dim_operation).
# The original statement targeted Projeto.operation_table, which no cell in this notebook creates.
spark.sql("""
    DROP TABLE IF EXISTS Projeto.dim_operation
""")

DataFrame[]

In [40]:
# Operation dimension for the crime gold layer.
dim_operation_ddl = """
    CREATE TABLE Projeto.dim_operation (
        id_operation INT,
        Type STRING,
        STOP_Operation STRING,
        Policing_operation STRING,
        Legislation STRING,
        Object_of_search STRING,
        Outcome STRING,
        Outcome_of_Searched_object STRING,
        Removal_of_more_than_just_outer_clothing STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/dim_operation'
"""
spark.sql(dim_operation_ddl)

DataFrame[]

In [41]:
# Drop the table that the next cell creates (Projeto.factos).
# The original statement targeted Projeto.factos_table, which no cell in this notebook creates.
spark.sql("""
    DROP TABLE IF EXISTS Projeto.factos
""")

DataFrame[]

In [42]:
# Fact table for the crime gold layer: one key per dimension (date, location, person, operation).
factos_ddl = """
    CREATE TABLE Projeto.factos (
        id_data INT,
        id_local INT,
        id_pessoa INT,
        id_operation INT
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/factos'
"""
spark.sql(factos_ddl)

DataFrame[]

In [43]:
#=================================================== GENDER STATS ===================================================

In [44]:
# One FLOAT column per year from 1960 through 2022, each named with the back-quoted year.
# Generated rather than typed out; the resulting SQL text is the same as the hand-written list.
year_columns_sql = ",\n".join(f"        `{year}` FLOAT" for year in range(1960, 2023))

# Delta table for the gender statistics data: identifying columns, then the yearly values.
spark.sql(f"""
    CREATE TABLE Projeto.Gender_Stats_Data_table (
        Gender_Stats_Data_ID STRING,
        Country_Region_Situation_Name STRING,
        Country_Region_Situation_Code STRING,
        Indicator_Name STRING,
        Indicator_Code STRING,
        Gender STRING,
        Uniform_Age_Group STRING,
{year_columns_sql}
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/Gender_Stats_Data_table'
""")


DataFrame[]

In [45]:
# Drop the gender statistics series table (if present) before it is recreated in the next cell.
drop_gender_stats_series_sql = """
    DROP TABLE IF EXISTS Projeto.Gender_Stats_Series_table
"""
spark.sql(drop_gender_stats_series_sql)

DataFrame[]

In [46]:
# Delta table for the gender statistics series metadata (all columns are STRING).
gender_stats_series_ddl = """
    CREATE TABLE Projeto.Gender_Stats_Series_table (
        Gender_Stats_Series_ID STRING,
        Series_Code STRING,
        Topic STRING,
        Indicator_Name STRING,
        Short_definition STRING,
        Long_definition STRING,
        Unit_of_measure STRING,
        Periodicity STRING,
        Base_Period STRING,
        Other_notes STRING,
        Aggregation_method STRING,
        Limitations_and_exceptions STRING,
        Notes_from_original_source STRING,
        General_comments STRING,
        Source STRING,
        Statistical_concept_and_methodology STRING,
        Development_relevance STRING,
        Related_source_links STRING,
        Other_web_links STRING,
        Related_indicators STRING,
        License_Type STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/Gender_Stats_Series_table'
"""
spark.sql(gender_stats_series_ddl)


DataFrame[]

In [47]:
#-----------------------------------------------------ETAPA GOLD-------------------------------------------------

In [48]:
# Drop the country dimension (if present) before it is recreated in the next cell.
drop_dim_country_sql = """
    DROP TABLE IF EXISTS Projeto.dim_country
    """
spark.sql(drop_dim_country_sql)

DataFrame[]

In [49]:
# Country dimension for the gender statistics gold layer.
dim_country_ddl = """
    CREATE TABLE Projeto.dim_country (
        Country_Dimension_ID STRING, 
        Country_Region_Situation_Name STRING,
        Country_Region_Situation_Code STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/dim_country'
    """
spark.sql(dim_country_ddl)

DataFrame[]

In [50]:
# Drop the temporal measurement dimension (if present) before it is recreated in the next cell.
drop_dim_temporal_measurement_sql = """
    DROP TABLE IF EXISTS Projeto.dim_temporal_measurement
    """
spark.sql(drop_dim_temporal_measurement_sql)

DataFrame[]

In [51]:
# Temporal measurement dimension: periodicity and base period.
dim_temporal_measurement_ddl = """
    CREATE TABLE Projeto.dim_temporal_measurement (
        Temporal_Measurement_ID STRING,
        Periodicity STRING,
        Base_Period STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/dim_temporal_measurement'
    """
spark.sql(dim_temporal_measurement_ddl)

DataFrame[]

In [52]:
# Drop the sources dimension (if present) before it is recreated in the next cell.
drop_dim_sources_sql = """
    DROP TABLE IF EXISTS Projeto.dim_sources
    """
spark.sql(drop_dim_sources_sql)

DataFrame[]

In [53]:
# Sources dimension: source, original-source notes and related links.
dim_sources_ddl = """
    CREATE TABLE Projeto.dim_sources (
        Sources_Dimension_ID STRING,
        Notes_from_original_source STRING,
        Source STRING,
        Related_source_links STRING,
        Other_web_links STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/dim_sources'
    """
spark.sql(dim_sources_ddl)

DataFrame[]

In [54]:
# Drop the measurement methods dimension (if present) before it is recreated in the next cell.
drop_dim_measurement_methods_sql = """
    DROP TABLE IF EXISTS Projeto.dim_measurement_methods
    """
spark.sql(drop_dim_measurement_methods_sql)

DataFrame[]

In [55]:
# Measurement methods dimension: unit of measure and aggregation method.
dim_measurement_methods_ddl = """
    CREATE TABLE Projeto.dim_measurement_methods (
        Measurement_Methods_ID STRING,
        Unit_of_measure STRING,
        Aggregation_method STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/dim_measurement_methods'
    """
spark.sql(dim_measurement_methods_ddl)

DataFrame[]

In [56]:
# Drop the gender series dimension (if present) before it is recreated in the next cell.
drop_dim_gender_series_sql = """
    DROP TABLE IF EXISTS Projeto.dim_gender_series
    """
spark.sql(drop_dim_gender_series_sql)

DataFrame[]

In [57]:
# Gender series dimension: indicator identity, descriptive metadata and the year.
dim_gender_series_ddl = """
    CREATE TABLE Projeto.dim_gender_series (
        Gender_Series_ID STRING,
        Indicator_Name STRING,
        Indicator_Code STRING,
        Gender STRING,
        Uniform_Age_Group STRING,
        Topic STRING,
        Short_definition STRING,
        Long_definition STRING,
        Other_notes STRING,
        Limitations_and_exceptions STRING,
        General_comments STRING,
        Statistical_concept_and_methodology STRING,
        Development_relevance STRING,
        Related_indicators STRING,
        License_Type STRING,
        Year INT
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/dim_gender_series'
    """
spark.sql(dim_gender_series_ddl)

DataFrame[]

In [58]:
# Drop the gender statistics fact table (if present) before it is recreated in the next cell.
drop_gender_data_sql = """
    DROP TABLE IF EXISTS Projeto.gender_data
    """
spark.sql(drop_gender_data_sql)

DataFrame[]

In [59]:
# Gender statistics fact table: one key per dimension plus the indicator value.
gender_data_ddl = """
    CREATE TABLE Projeto.gender_data (
        Country_Dimension_ID STRING,
        Temporal_Measurement_ID STRING,
        Sources_Dimension_ID STRING,
        Measurement_Methods_ID STRING,
        Gender_Series_ID STRING,
        Indicator_Number FLOAT
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/gender_data'
    """
spark.sql(gender_data_ddl)

DataFrame[]

In [60]:
#=================================================== CHILD_PROTECTION ===================================================

In [61]:
# Drop the child protection table (if present) before it is recreated in the next cell.
drop_child_protection_sql = """
    DROP TABLE IF EXISTS Projeto.Child_Protection_table
    """
spark.sql(drop_child_protection_sql)

DataFrame[]

In [62]:
# Delta table for the child protection care-plan dataset, including the
# month/year INT columns that accompany each of the four date fields.
child_protection_ddl = """
    CREATE TABLE Projeto.Child_Protection_table (
    Children_Careplan_Id INT,
    Date_Careplan_Started STRING,
    Date_Careplan_Ended STRING,
    Category_of_Abuse STRING,
    Careplan_End_Reason STRING,
    Current_Careplan_Protection STRING,
    Gender STRING,
    Disabled STRING,
    Registration_Disabled STRING,
    Unique STRING,
    Current_Age_Group STRING,
    Start_Month_and_Year STRING,
    End_Month_and_Year STRING,
    Current_Age_Bracket STRING,
    Age_Group_on_Registration STRING, 
    Age_Group_on_deRegistration STRING,
    Uniform_Age_Group STRING,
    Borough STRING,
    Data_Careplan_Started_month INT,
    Data_Careplan_Started_year INT,
    Data_Careplan_Ended_month INT,
    Data_Careplan_Ended_year INT,
    Start_Month_and_Year_month INT,
    Start_Month_and_Year_year INT,
    End_Month_and_Year_month INT,
    End_Month_and_Year_year INT
)
USING DELTA
LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/Child_Protection_table/'
"""
spark.sql(child_protection_ddl)

DataFrame[]

In [63]:
#-----------------------------------------------------ETAPA GOLD-------------------------------------------------

In [64]:
# Create the Date dimension table: drop any existing copy, then define it.
drop_dim_date_sql = """
    DROP TABLE IF EXISTS Projeto.dim_date
    """
dim_date_ddl = """
    CREATE TABLE Projeto.dim_date (
        Date_Dimension_ID STRING,
        Data_Careplan_Started_month INT,
        Data_Careplan_Started_year INT,
        Data_Careplan_Ended_month INT,
        Data_Careplan_Ended_year INT,
        Start_Month_and_Year_month INT,
        Start_Month_and_Year_year INT,
        End_Month_and_Year_month INT,
        End_Month_and_Year_year INT
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/dim_date'
    """

spark.sql(drop_dim_date_sql)
spark.sql(dim_date_ddl)

DataFrame[]

In [65]:
# Create the Child dimension table: drop any existing copy, then define it.
drop_dim_child_sql = """
    DROP TABLE IF EXISTS Projeto.dim_child
    """
dim_child_ddl = """
    CREATE TABLE Projeto.dim_child (
        Child_Dimension_ID STRING,
        Gender STRING,
        Disabled STRING,
        Registration_Disabled STRING,
        Unique STRING,
        Current_Age_Group STRING,
        Current_Age_Bracket STRING,
        Age_Group_on_Registration STRING, 
        Age_Group_on_deRegistration STRING,
        Uniform_Age_Group STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/dim_child'
    """

spark.sql(drop_dim_child_sql)
spark.sql(dim_child_ddl)

DataFrame[]

In [66]:
# Create the Care Plan dimension table: drop any existing copy, then define it.
drop_dim_care_plan_sql = """
    DROP TABLE IF EXISTS Projeto.dim_care_plan
    """
dim_care_plan_ddl = """
    CREATE TABLE Projeto.dim_care_plan (
        Care_Plan_Dimension_ID STRING,
        Category_of_Abuse STRING,
        Careplan_End_Reason STRING,
        Current_Careplan_Protection STRING,
        Local STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/dim_care_plan'
    """

spark.sql(drop_dim_care_plan_sql)
spark.sql(dim_care_plan_ddl)

DataFrame[]

In [67]:
# Create the Care Plan per Child fact table: drop any existing copy, then define it.
# NOTE(review): dim_date names its key Date_Dimension_ID, while this table uses
# Data_Dimension_ID — confirm which spelling the loading code expects.
drop_cp_per_child_sql = """
    DROP TABLE IF EXISTS Projeto.cp_per_child
    """
cp_per_child_ddl = """
    CREATE TABLE Projeto.cp_per_child (
        Data_Dimension_ID STRING,
        Child_Dimension_ID STRING,
        Care_Plan_Dimension_ID STRING,
        Child_with_CP INT
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/Projeto.db/cp_per_child'
    """

spark.sql(drop_cp_per_child_sql)
spark.sql(cp_per_child_ddl)

DataFrame[]

In [None]:
# Stop the Spark session created in the first cell now that all DDL has been issued.
spark.stop()