In [73]:
# Include shared notebooks
%run "../includes/configuration.ipynb"

In [None]:
%run "../includes/common_functions.ipynb"

In [5]:
#### Step 0. Explore the data initially
### TODO: change the file path
# Exploratory read only: header=True takes column names from the first row and
# inferSchema=True lets Spark guess the column types, so the raw file can be inspected
# before the explicit schema is defined in Step 1.
# The result is bound to one descriptive name only, so no generic alias lingers in kernel state.
expnd_df = spark.read.csv("/mnt/auhealthexpnd23dl/raw/au_health_expnd.csv", header=True, inferSchema=True)
expnd_df.show()

In [67]:
#### Step 1. Define file schema and read file
# The read is meant to fail if the input file does not follow the defined schema, so that
# corrupted data is never populated downstream. Spark's CSV reader defaults to PERMISSIVE
# mode, which turns unparsable fields into nulls instead of raising, so FAILFAST is
# requested explicitly below.

from pyspark.sql.types import StructType, StructField, DoubleType, StringType, DecimalType

# Six descriptive string columns followed by two monetary amounts kept as fixed-point decimals.
# NOTE(review): the nullable=False flags are presumably not enforced by the CSV source - confirm,
# and add explicit null checks if empty values must be rejected as well.
expenditure_schema = StructType(fields=[StructField("Year", StringType(), False),
                                    StructField("Jurisdiction", StringType(), False),
                                    StructField("Sector", StringType(), False),
                                    StructField("Area of expenditure", StringType(), False),
                                    StructField("Broad source of funds", StringType(), False),
                                    StructField("Source of funds", StringType(), False),
                                    StructField("Current amount ($)", DecimalType(20,2), False),
                                    StructField("Constant amount ($)", DecimalType(20,2), False)
                                    ])

### TODO: change the file path
# Spark evaluates lazily: a malformed record raises when an action (show / write) runs,
# not when this read is declared.
expenditure_df = spark.read.schema(expenditure_schema) \
    .option("header", True) \
    .option("mode", "FAILFAST") \
    .csv("/mnt/auhealthexpnd23dl/raw/au_health_expnd.csv")

In [68]:
# Preview the DataFrame read with expenditure_schema to check the columns parsed as expected.
expenditure_df.show()

+-------+------------+--------------------+--------------------+---------------------+--------------------+------------------+-------------------+
|   Year|Jurisdiction|              Sector| Area of expenditure|Broad source of funds|     Source of funds|Current amount ($)|Constant amount ($)|
+-------+------------+--------------------+--------------------+---------------------+--------------------+------------------+-------------------+
|2011-12|         ACT| Capital expenditure| Capital expenditure| Australian Govern...|     DoHAC and other|        7845557.56|         7405237.78|
|2011-12|         ACT| Capital expenditure| Capital expenditure|       Non-government|Other source of f...|       25878223.51|        29469301.22|
|2011-12|         ACT| Capital expenditure| Capital expenditure| State and local g...|State and local g...|      174000000.00|       195754403.88|
|2011-12|         ACT|           Hospitals|   Private hospitals| Australian Govern...|Department of Vet...|       2841

In [76]:
#### Step 2. Rename columns
# Source header -> snake_case column name. Dicts preserve insertion order, so the renames
# are applied in the same sequence as the file's columns.
column_renames = {
    "Year": "year",
    "Jurisdiction": "jurisdiction",
    "Sector": "sector",
    "Area of expenditure": "area_of_expenditure",
    "Broad source of funds": "broad_source_of_funds",
    "Source of funds": "source_of_funds",
    "Current amount ($)": "current_amount",
    "Constant amount ($)": "constant_amount",
}

# Always start from expenditure_df so re-running this cell gives the same result.
expnd_renamed_df = expenditure_df
for source_name, target_name in column_renames.items():
    expnd_renamed_df = expnd_renamed_df.withColumnRenamed(source_name, target_name)

In [77]:
#### Step 3. Add column eofy (end of financial year) and capitalise column jurisdiction
# (to match columns "Time" and "Jurisdiction" from the Population file)

from pyspark.sql.functions import col, substring, lit, when, concat, regexp_replace, upper


# "year" holds a financial year such as "2011-12"; the two characters starting at position 6
# are the end year. substring() already yields a Column, so it feeds concat() directly and
# no temporary column has to be added and dropped again.
# eofy is the last month of the financial year, e.g. "Jun-12".
expenditure_eofy_df = (
    expnd_renamed_df
    .withColumn("eofy", concat(lit("Jun-"), substring(col("year"), 6, 2)))
    # upper-case the jurisdiction values in place (column position is unchanged)
    .withColumn("jurisdiction", upper(col("jurisdiction")))
)

# add ingestion_date (add_ingestion_date is not defined in this notebook;
# presumably it comes from the shared common_functions notebook - verify)
expenditure_final_df = add_ingestion_date(expenditure_eofy_df)


expenditure_final_df.show()

+-------+------------+--------------------+--------------------+---------------------+--------------------+--------------+---------------+------+--------------------+
|   year|jurisdiction|              sector| area_of_expenditure|broad_source_of_funds|     source_of_funds|current_amount|constant_amount|  eofy|      ingestion_date|
+-------+------------+--------------------+--------------------+---------------------+--------------------+--------------+---------------+------+--------------------+
|2011-12|         ACT| Capital expenditure| Capital expenditure| Australian Govern...|     DoHAC and other|    7845557.56|     7405237.78|Jun-12|2023-10-31 15:33:...|
|2011-12|         ACT| Capital expenditure| Capital expenditure|       Non-government|Other source of f...|   25878223.51|    29469301.22|Jun-12|2023-10-31 15:33:...|
|2011-12|         ACT| Capital expenditure| Capital expenditure| State and local g...|State and local g...|  174000000.00|   195754403.88|Jun-12|2023-10-31 15:33:...

In [None]:
#### Step 4. Export the dataframe to Parquet file and store in Processed folder
# mode("overwrite") replaces any existing output at the target path, so re-running the
# notebook does not fail on a previous export.
# processed_folder_path is not defined in this notebook; presumably it comes from the
# shared configuration notebook - verify.
expenditure_final_df.write.mode("overwrite").parquet(f"{processed_folder_path}/expenditure")