In [1]:
from pyspark.sql import SparkSession, Row, DataFrame, functions as F
from pyspark.sql import Window
from pyspark.sql.functions import split, col, when, monotonically_increasing_id, concat, lit
from pyspark.sql.types import StringType, StructField, StructType, IntegerType, FloatType
from pyspark.sql.functions import col, trim, lit, monotonically_increasing_id
from delta import *
from pyspark.sql import functions as F
from pyspark.sql.functions import col, monotonically_increasing_id

# HDFS directory handed to Spark as its SQL warehouse location.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Session builder: local master with two threads, Hive metastore support and
# the Delta Lake SQL extension / catalog switched on.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# configure_spark_with_delta_pip is presumably provided by `from delta import *`.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
#==============================HOMICIDE_DATASET===============================

In [3]:
# HDFS paths of the two homicide CSV files read by the next cell.
# Path of the homicide victims CSV.
hdfs_path_Homicide_Victim = "hdfs://hdfs-nn:9000/Datasets/Bronze/Homicide_Dataset/Homicide_Victim_2003_March_2023.csv"
# Path of the homicide proceedings CSV.
hdfs_path_Homicide_Proceeded = "hdfs://hdfs-nn:9000/Datasets/Bronze/Homicide_Dataset/Homicide_Proceeded_Victim_2003_March_2023.csv"



In [4]:
# Load both CSV files into DataFrames (comma-delimited, first row holds the header).

Homicide_Victim_df = (
    spark.read
    .options(delimiter=",", header="true")
    .csv(hdfs_path_Homicide_Victim)
)
Homicide_Proceeded_df = (
    spark.read
    .options(delimiter=",", header="true")
    .csv(hdfs_path_Homicide_Proceeded)
)

In [5]:
# Replace spaces with "_" in the column names of both CSV-backed DataFrames.

def rename_columns(dataframe: DataFrame) -> DataFrame:
    """Return `dataframe` with every space in its column names replaced by "_".

    Args:
        dataframe: DataFrame whose column names may contain spaces.

    Returns:
        A new DataFrame with the same columns, in the same order, under the
        sanitized names.
    """
    # toDF renames every column in one projection instead of stacking one
    # withColumnRenamed step per column.
    return dataframe.toDF(*(name.replace(" ", "_") for name in dataframe.columns))


# The helper is defined once and applied to both DataFrames.
Homicide_Victim_df = rename_columns(Homicide_Victim_df)
Homicide_Proceeded_df = rename_columns(Homicide_Proceeded_df)

In [6]:
# Column renames and data type fixes.

# Homicide_Victim: rename the gender column and store the victim count as an integer.
Homicide_Victim_df = (
    Homicide_Victim_df
    .withColumnRenamed("Sex", "Victim_Gender")
    .withColumn("Count_of_Victims", col("Count_of_Victims").cast("int"))
)

# Homicide_Proceeded: rename the gender, charge and count columns, then store
# the accused count as an integer.
Homicide_Proceeded_df = (
    Homicide_Proceeded_df
    .withColumnRenamed("Gender", "Agressor_Gender")
    .withColumnRenamed("Charged/Summonsed", "Charged_Summonsed")
    .withColumnRenamed("Count_of_PPA", "People_Accused_Count")
    .withColumn("People_Accused_Count", col("People_Accused_Count").cast("int"))
)


In [7]:
# Drop rows that are of no use to the analysis (rows without gender information).
# A `!=` comparison evaluates to null for null genders, so those rows are dropped too.

Homicide_Victim_df = Homicide_Victim_df.where(F.col("Victim_Gender") != "Unrecorded")
Homicide_Proceeded_df = Homicide_Proceeded_df.where(F.col("Agressor_Gender") != "Unrecorded")

In [8]:
# Split the date strings into a numeric month column and a four-digit year column.

# English month abbreviations in calendar order; position + 1 is the month number.
MONTH_ABBREVIATIONS = (
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
)


def add_month_and_year(df, date_column, month_column, year_column):
    """Derive integer month and year columns from a "<month abbreviation>-<year>" column.

    The value is split on "-"; the first piece is matched against the English
    three-letter month abbreviations and the second piece is cast to int and
    offset by 2000.

    Args:
        df: DataFrame that contains `date_column`.
        date_column: Name of the string column to split.
        month_column: Name of the integer month column to add (1-12, null when
            the abbreviation is not recognised).
        year_column: Name of the integer year column to add.

    Returns:
        `df` with `month_column` and `year_column` added.
    """
    date_parts = split(col(date_column), "-")

    # One WHEN branch per abbreviation, with no OTHERWISE, so an unknown month
    # yields null.
    month_number = None
    for number, abbreviation in enumerate(MONTH_ABBREVIATIONS, start=1):
        is_month = date_parts[0] == abbreviation
        if month_number is None:
            month_number = when(is_month, number)
        else:
            month_number = month_number.when(is_month, number)

    return (
        df
        .withColumn(month_column, month_number.cast("int"))
        # Two-digit years become four-digit ones (e.g. 03 -> 2003).
        # NOTE(review): assumes every year belongs to the 2000s - confirm against the data.
        .withColumn(year_column, date_parts[1].cast("int") + 2000)
    )


# "Recorded_Date" of Homicide_Victim
Homicide_Victim_df = add_month_and_year(
    Homicide_Victim_df, "Recorded_Date", "Recorded_Month", "Recorded_Year"
)

# "Proceedings_Date" of Homicide_Proceeded
Homicide_Proceeded_df = add_month_and_year(
    Homicide_Proceeded_df, "Proceedings_Date", "Proceedings_Month", "Proceedings_Year"
)


In [9]:
# Add an artificial key column with sequential values to each table
# (having a surrogate key is good practice).

def add_sequential_key(df, key_column, prefix):
    """Return `df` with a leading `key_column` holding `prefix` + 1, 2, 3, ...

    monotonically_increasing_id() on its own is unique but not consecutive and
    starts at 0, so it cannot produce the HV1, HV2, ... sequence this cell is
    meant to create. It is used here only as the ordering for row_number().

    Args:
        df: DataFrame to extend.
        key_column: Name of the key column to create (replaced if already present).
        prefix: String placed in front of the sequence number.

    Returns:
        A DataFrame whose first column is the new key, followed by the other
        columns of `df` in their original order.
    """
    # An unpartitioned window pulls all rows into one partition.
    # NOTE(review): fine for small inputs - confirm the data volume allows it.
    sequence_number = F.row_number().over(Window.orderBy(monotonically_increasing_id()))
    other_columns = [name for name in df.columns if name != key_column]
    return df.select(
        concat(lit(prefix), sequence_number.cast("string")).alias(key_column),
        *other_columns,
    )


# Homicide_Victim (e.g. HV1, HV2)
Homicide_Victim_df = add_sequential_key(Homicide_Victim_df, "Homicide_Victim_Id", "HV")

# Homicide_Proceeded (e.g. HP1, HP2)
Homicide_Proceeded_df = add_sequential_key(Homicide_Proceeded_df, "Homicide_Proceeded_ID", "HP")

In [10]:
# Constant "Location" column, kept for consistency with the Crime_Dataset.

Homicide_Victim_df, Homicide_Proceeded_df = (
    frame.withColumn("Location", lit("London"))
    for frame in (Homicide_Victim_df, Homicide_Proceeded_df)
)

In [11]:
# Map the age groups onto the bands used by the Crime_Dataset.
# In both mappings any label that is not listed (including null) falls into
# "Adult 35+ years" through the otherwise() branch.

# Homicide_Victim
# NOTE(review): the original list went from "13 to 14" straight to "20 to 24",
# so a 15-19 label would have been classified as "Adult 35+ years". "13 to 19"
# is added on the assumption that it is the label present in the CSV - confirm
# against the data. "13 to 14" is kept so existing matches are unaffected.
Homicide_Victim_df = Homicide_Victim_df.withColumn(
    "Uniform_Age_Group",
    when(
        col("Age_Group").isin("0 to 12", "13 to 14", "13 to 19", "20 to 24"),
        "Youth 0-24 years",
    )
    .when(col("Age_Group") == "25 to 34", "Adult 25-34 years")
    .otherwise("Adult 35+ years"),
)


# Homicide_Proceeded (uses different intervals)
Homicide_Proceeded_df = Homicide_Proceeded_df.withColumn(
    "Uniform_Age_Group",
    when(
        col("Age_Group").isin("10 to 14", "15 to 19", "20 to 24"),
        "Youth 0-24 years",
    )
    .when(col("Age_Group").isin("25 to 29", "30 to 34"), "Adult 25-34 years")
    .otherwise("Adult 35+ years"),
)

In [12]:
# Fix the stray trailing space in the "Female " gender value.

def replace_values(df, column_name):
    """Return `df` with the exact value "Female " in `column_name` rewritten to "Female".

    Every other value, including null, is passed through unchanged.
    """
    current_value = F.col(column_name)
    cleaned_value = F.when(current_value == "Female ", "Female").otherwise(current_value)
    return df.withColumn(column_name, cleaned_value)


Homicide_Victim_df, Homicide_Proceeded_df = (
    replace_values(Homicide_Victim_df, "Victim_Gender"),
    replace_values(Homicide_Proceeded_df, "Agressor_Gender"),
)

In [15]:
# Write the cleaned victim data as a Delta table partitioned by Borough,
# overwriting whatever the table held before.
(
    Homicide_Victim_df.write
    .mode("overwrite")
    .partitionBy("Borough")
    .option("mergeSchema", "true")
    .format("delta")
    .saveAsTable("Projeto.Homicide_Victim_table")
)

In [16]:
# Write the cleaned proceedings data as a Delta table partitioned by Borough,
# overwriting whatever the table held before.
(
    Homicide_Proceeded_df.write
    .mode("overwrite")
    .partitionBy("Borough")
    .format("delta")
    .option("mergeSchema", "true")
    .saveAsTable("Projeto.Homicide_Proceeded_table")
)

In [17]:
#======================================================Transformações GOLD======================================================

In [18]:
# Alternative borough spellings mapped to the single spelling used from here on.
BOROUGH_NAME_FIXES = {
    "Kingston Upon Thames": "Kingston upon Thames",
    "Hammersmith & Fulham": "Hammersmith and Fulham",
    "Richmond Upon Thames": "Richmond upon Thames",
    "Kensington & Chelsea": "Kensington and Chelsea",
    "Barking & Dagenham": "Barking and Dagenham",
}


def normalize_borough_names(df):
    """Return `df` with the "Borough" column trimmed and spelled consistently.

    This helper has its own name instead of redefining `replace_values`, which
    an earlier cell already defines with a different signature; reusing that
    name made the earlier cell fail when re-run after this one.

    Surrounding whitespace is removed from every borough (this covers the
    former "Ealing " special case), then the spellings listed in
    BOROUGH_NAME_FIXES are rewritten. All other values are left as they are.

    Args:
        df: DataFrame with a string "Borough" column.

    Returns:
        `df` with the normalized "Borough" column.
    """
    borough = trim(col("Borough"))
    normalized = None
    for variant, canonical in BOROUGH_NAME_FIXES.items():
        is_variant = borough == variant
        if normalized is None:
            normalized = when(is_variant, canonical)
        else:
            normalized = normalized.when(is_variant, canonical)
    return df.withColumn("Borough", normalized.otherwise(borough))


# Apply the normalization to both DataFrames.
Homicide_Victim_df = normalize_borough_names(Homicide_Victim_df)
Homicide_Proceeded_df = normalize_borough_names(Homicide_Proceeded_df)

In [19]:
# -----------Criar Dataframes de dimensão necessários e colocar lá dados

In [20]:
# ----- Person

# Distinct (gender, age group, ethnicity) combinations found among the victims.
Person_df_victim = (Homicide_Victim_df
                    .select(
                        col("Victim_Gender").alias("Gender"),
                        col("Uniform_Age_Group").alias("Age_Group"),
                        col("Officer_Observed_Ethnicity").alias("Ethnicity"))
                    .distinct())

# Distinct combinations found among the people proceeded against.
Person_df_proceeded = (Homicide_Proceeded_df
                       .select(
                           col("Agressor_Gender").alias("Gender"),
                           col("Uniform_Age_Group").alias("Age_Group"),
                           col("Self_Classified_Ethnicity").alias("Ethnicity"))
                       .distinct())

combined_persons = Person_df_victim.union(Person_df_proceeded).distinct()

# Give every entry a unique, sequential integer ID (1, 2, 3, ...).
# monotonically_increasing_id() is not used here: it yields 64-bit values that
# encode the partition number, so casting them to int can wrap and collide,
# and after the shuffle caused by distinct() the assignment is not guaranteed
# to repeat when the plan is recomputed for each write. row_number() over the
# natural key is already an integer and depends only on the data.
person_order = Window.orderBy("Gender", "Age_Group", "Ethnicity")
Person_df = combined_persons.withColumn("Id_Person", F.row_number().over(person_order))


In [21]:
# ----- Date

# Distinct (year, month) pairs found in the victim records.
Date_df_victim = (Homicide_Victim_df
                  .select(
                      col("Recorded_Year").alias("Year"),
                      col("Recorded_Month").alias("Month"))
                  .distinct())

# Distinct (year, month) pairs found in the proceedings records.
Date_df_proceeded = (Homicide_Proceeded_df
                     .select(
                         col("Proceedings_Year").alias("Year"),
                         col("Proceedings_Month").alias("Month"))
                     .distinct())

# union + distinct already removes pairs present on both sides, so no
# anti-join is needed beforehand.
Date_df = Date_df_victim.union(Date_df_proceeded).distinct()

# Sequential integer key (1, 2, 3, ...) in chronological order.
# monotonically_increasing_id() cast to int can wrap and collide across
# partitions and is not guaranteed to repeat after a shuffle; row_number()
# over the natural key depends only on the data.
Date_df = Date_df.withColumn("Id_Date", F.row_number().over(Window.orderBy("Year", "Month")))

In [22]:
# ----- Location

# Distinct (location, borough) pairs found in the victim records.
Location_df_victim = (Homicide_Victim_df
                      .select(
                          trim(col("Location")).alias("Location"),
                          trim(col("Borough")).alias("Borough"))
                      .distinct())

# Distinct (location, borough) pairs found in the proceedings records.
Location_df_proceeded = (Homicide_Proceeded_df
                         .select(
                             trim(col("Location")).alias("Location"),
                             trim(col("Borough")).alias("Borough"))
                         .distinct())

# union + distinct already removes pairs present on both sides, so no
# anti-join is needed beforehand.
Location_df = Location_df_victim.union(Location_df_proceeded).distinct()

# Sequential integer key (1, 2, 3, ...).
# monotonically_increasing_id() cast to int can wrap and collide across
# partitions and is not guaranteed to repeat after a shuffle; row_number()
# over the natural key depends only on the data.
Location_df = Location_df.withColumn(
    "Id_Location", F.row_number().over(Window.orderBy("Location", "Borough"))
)

In [23]:
# ----- Homicides
# Descriptive attributes of each homicide, one row per distinct combination.
Homicides_df = Homicide_Victim_df.select(
    F.col("Homicide_Victim_Id").alias("Id_Homicide"),
    F.col("Method_of_Killing").alias("Method_Of_Killing"),
    "Domestic_Abuse",
    F.col("Homicide_Offence_Type").alias("Offense_Type"),
    "Solved_Status",
).dropDuplicates()

In [24]:
# -----------Associar os dados das dimensões com os da tabela de factos

In [25]:
# ----- Person

# Join Homicide_Victim_df with Person_df to obtain Id_Person.
# eqNullSafe is used for every key so that a null value (e.g. a missing
# ethnicity) still matches the Person row built from that same null; with a
# plain == such rows would end up with a null Id_Person.
Homicide_Facts_df = Homicide_Victim_df.join(
    Person_df,
    Homicide_Victim_df.Victim_Gender.eqNullSafe(Person_df.Gender) &
    Homicide_Victim_df.Uniform_Age_Group.eqNullSafe(Person_df.Age_Group) &
    Homicide_Victim_df.Officer_Observed_Ethnicity.eqNullSafe(Person_df.Ethnicity),
    "left"
).select(
    col("Homicide_Victim_Id").alias("ID_Homicide"),
    col("Id_Person"),
    col("Solved_Status"),
    col("Count_of_Victims"),
    col("Recorded_Year"),
    col("Recorded_Month"),
    col("Location"),
    col("Borough"),
    col("Method_of_Killing"),
    col("Domestic_Abuse"),
    col("Homicide_Offence_Type")
)


# Join Homicide_Proceeded_df with Person_df to obtain Id_Person.
# Offence_Type is selected once; the original selected it twice, producing two
# columns with the same name.
Proceeded_Facts_df = Homicide_Proceeded_df.join(
    Person_df,
    Homicide_Proceeded_df.Agressor_Gender.eqNullSafe(Person_df.Gender) &
    Homicide_Proceeded_df.Uniform_Age_Group.eqNullSafe(Person_df.Age_Group) &
    Homicide_Proceeded_df.Self_Classified_Ethnicity.eqNullSafe(Person_df.Ethnicity),
    "left"
).select(
    col("Homicide_Proceeded_ID").alias("ID_Proceeded"),
    col("Id_Person"),
    col("Proceedings_Year"),
    col("Proceedings_Month"),
    col("Location"),
    col("Borough"),
    col("Offence_Type"),
    col("People_Accused_Count"),
    col("Charged_Summonsed")
)

In [27]:
# ----- Date

# Join with Date_df to obtain Id_Date.
# eqNullSafe lets rows whose year or month is null match the Date row built
# from that same null instead of ending up without an Id_Date.
Homicide_Facts_df = Homicide_Facts_df.join(
    Date_df,
    Homicide_Facts_df.Recorded_Year.eqNullSafe(Date_df.Year) &
    Homicide_Facts_df.Recorded_Month.eqNullSafe(Date_df.Month),
    "left"
).select(
    "ID_Homicide", "Id_Person", "Id_Date",
    "Solved_Status", "Count_of_Victims",
    "Location", "Borough", "Method_of_Killing", "Domestic_Abuse", "Homicide_Offence_Type"
)


# Join with Date_df to obtain Id_Date.
# Offence_Type is listed once; the original listed it twice, which duplicated
# the column in the intermediate result.
Proceeded_Facts_df = Proceeded_Facts_df.join(
    Date_df,
    Proceeded_Facts_df.Proceedings_Year.eqNullSafe(Date_df.Year) &
    Proceeded_Facts_df.Proceedings_Month.eqNullSafe(Date_df.Month),
    "left"
).select(
    "ID_Proceeded", "Id_Person", "Id_Date", "Offence_Type", "Charged_Summonsed",
    "Location", "Borough", "People_Accused_Count"
)

In [30]:
# ----- Location

# Join with Location_df to obtain Id_Location.
# Location_df stores trimmed Location/Borough values, so the fact side is
# trimmed in the join condition as well; otherwise a value with surrounding
# whitespace would not match and the row would get a null Id_Location.
Homicide_Facts_df = Homicide_Facts_df.join(
    Location_df,
    (trim(Homicide_Facts_df.Location) == Location_df.Location) &
    (trim(Homicide_Facts_df.Borough) == Location_df.Borough),
    "left"
).select(
    "ID_Homicide", "Id_Person", "Id_Date", "Id_Location",
    "Solved_Status", "Count_of_Victims",
    "Method_of_Killing", "Domestic_Abuse", "Homicide_Offence_Type"
)


# Join with Location_df to obtain Id_Location (same trimming as above).
Proceeded_Facts_df = Proceeded_Facts_df.join(
    Location_df,
    (trim(Proceeded_Facts_df.Location) == Location_df.Location) &
    (trim(Proceeded_Facts_df.Borough) == Location_df.Borough),
    "left"
).select(
    "ID_Proceeded", "Id_Person", "Id_Date", "Id_Location",
    "Offence_Type", "People_Accused_Count", "Charged_Summonsed"
)



In [31]:
# New fact: share of the total represented by each row, as a percentage.

def _row_share_percentage(total_rows):
    """Return the percentage of `total_rows` that a single row represents.

    Returns 0.0 for an empty table instead of raising ZeroDivisionError.
    """
    if not total_rows:
        return 0.0
    return (1 / total_rows) * 100


# NOTE(review): the same constant (100 / row count) is written to every row;
# Count_of_Victims / People_Accused_Count are not taken into account. Confirm
# this is the intended definition of the measure.
total_rowsHomicide = Homicide_Facts_df.count()
percentage_of_total_victims = _row_share_percentage(total_rowsHomicide)
Homicide_Facts_df = Homicide_Facts_df.withColumn("Percentage_of_Total_Victims", lit(percentage_of_total_victims))

total_rowsProceeded = Proceeded_Facts_df.count()
percentage_of_total_proceeds = _row_share_percentage(total_rowsProceeded)
Proceeded_Facts_df = Proceeded_Facts_df.withColumn("Percentage_of_Total_Proceeds", lit(percentage_of_total_proceeds))

In [45]:
# Write the Person dimension as a Delta table, overwriting previous contents.
(
    Person_df.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    .format("delta")
    .saveAsTable("Projeto.Person_table")
)

In [46]:
# Write the Date dimension as a Delta table, overwriting previous contents.
(
    Date_df.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    .format("delta")
    .saveAsTable("Projeto.Date_table")
)

In [47]:
# Write the Location dimension as a Delta table, overwriting previous contents.
(
    Location_df.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    .format("delta")
    .saveAsTable("Projeto.Location_table")
)

In [48]:
# Write the Homicides dimension as a Delta table, overwriting previous contents.
(
    Homicides_df.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    .format("delta")
    .saveAsTable("Projeto.Homicides_table")
)

In [49]:
# Write the homicide fact table as a Delta table, overwriting previous contents.
(
    Homicide_Facts_df.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    .format("delta")
    .saveAsTable("Projeto.Homicide_Facts_table")
)

In [50]:
# Write the proceedings fact table as a Delta table, overwriting previous contents.
(
    Proceeded_Facts_df.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    .format("delta")
    .saveAsTable("Projeto.Proceeded_Facts_table")
)

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()