In [None]:
# Include shared notebooks
%run "../includes/configuration.ipynb"

In [None]:
%run "../includes/common_functions.ipynb"

In [None]:
# Read the expenditure parquet data from the processed folder and rename
# "eofy" / "jurisdiction" so they cannot be confused with the population
# file's columns once the two datasets are joined.
expnd_df = (
    spark.read.parquet(f"{processed_folder_path}/expenditure")
    .withColumnRenamed("eofy", "end_of_fy")
    .withColumnRenamed("jurisdiction", "expnd_jurisdiction")
)

In [None]:
# Aggregate expenditure by year, jurisdiction (state), sector and end of
# financial year, summing current_amount into total_amount.
#
# The Spark functions module is imported under the alias F rather than doing
# `from pyspark.sql.functions import sum`: that form rebinds the Python
# builtin sum() for the rest of the notebook session, so any later
# sum([...]) on plain Python values would fail.
from pyspark.sql import functions as F

sector_expnd_df = (
    expnd_df
    .groupBy("year", "expnd_jurisdiction", "sector", "end_of_fy")
    .agg(F.sum("current_amount").alias("total_amount"))
)

In [None]:
# Read the population parquet data from the processed folder. Every column is
# kept; only "jurisdiction" is renamed so it does not collide with the
# expenditure jurisdiction column.
pop_df = (
    spark.read.parquet(f"{processed_folder_path}/population")
    .withColumnRenamed("jurisdiction", "pop_jurisdiction")
)

In [None]:
# Join aggregated expenditure with population.
# A row matches when the expenditure end_of_fy equals the population Time
# column AND both jurisdictions are equal (list entries are AND-ed together).
join_condition = [
    sector_expnd_df["end_of_fy"] == pop_df["Time"],
    sector_expnd_df["expnd_jurisdiction"] == pop_df["pop_jurisdiction"],
]

# "inner" is Spark's default join type; it is spelled out here for clarity.
sector_expnd_with_pop_df = sector_expnd_df.join(pop_df, on=join_condition, how="inner")

In [None]:
# Derive the per-person amount and stamp each row with the load time.
from pyspark.sql.functions import current_timestamp, expr

final_df = (
    sector_expnd_with_pop_df
    # total_amount comes from the aggregation; population from the population file
    .withColumn("amount_per_person", expr("total_amount / population"))
    .withColumn("created_date", current_timestamp())
)

# Quick visual check of the result
final_df.show()

In [None]:
### Write to a managed table (so SQL users can query it by name) while the
### underlying data is still stored as parquet (for Spark users).
# The storage location is the one defined when the database was created.
(
    final_df.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("au_health_expnd_db.sector_expnd_with_pop")
)


# ### Alternative (disabled): write the result as a parquet file only, in the presentation layer
# final_df.write.mode("overwrite").parquet(f"{presentation_folder_path}/sector_expnd_with_pop")

In [None]:
%sql
-- List the tables in the database to confirm the managed table is registered.
SHOW TABLES IN au_health_expnd_db;

In [None]:
%sql
-- Preview up to 20 rows of the written table (no ORDER BY, so row order is not guaranteed).
SELECT *
FROM au_health_expnd_db.sector_expnd_with_pop
LIMIT 20;