In [0]:
from pyspark.sql import functions as F, Window, types as T

# Notebook parameters.
# dbutils.widgets.getArgument is deprecated on Databricks; the supported pattern
# is to declare each text widget with its default and then read it with get().
dbutils.widgets.text('catalog', 'rearcquest')
dbutils.widgets.text('schema', 'default')
Catalog = dbutils.widgets.get('catalog')
Schema = dbutils.widgets.get('schema')

# Unity Catalog volume roots; each keeps its trailing slash so a file name
# can be appended directly.
BlsVolume = f'/Volumes/{Catalog}/{Schema}/bls_data_raw/'
PopulationVolume = f'/Volumes/{Catalog}/{Schema}/population/'
ReportsVolume = f'/Volumes/{Catalog}/{Schema}/reports/'

# Input files read by the cells below
BlsFile = f'{BlsVolume}pr.data.0.Current'
PopulationFile = f'{PopulationVolume}population_data.json'

In [0]:
# Load the tab-separated BLS file (with a header row) from the Unity Catalog
# volume, letting Spark infer the column types.
blsReader = (
    spark.read
    .option("header", True)
    .option("sep", "\t")
    .option("inferSchema", True)
)
blsDf = blsReader.csv(BlsFile)

# Strip surrounding whitespace from every header name
blsDf = blsDf.toDF(*map(str.strip, blsDf.columns))

# Strip surrounding whitespace from the values of every string-typed column;
# all other columns pass through untouched and column order is preserved.
stringColumns = [
    f.name for f in blsDf.schema.fields if isinstance(f.dataType, T.StringType)
]
print(stringColumns)
blsDf = blsDf.select(
    *[
        F.trim(F.col(name)).alias(name) if name in stringColumns else F.col(name)
        for name in blsDf.columns
    ]
)

blsDf.printSchema()
blsDf.show()

In [0]:
# Best year per series: sum `value` within each (series_id, year), then keep,
# for every series_id, the year whose sum is the largest.
sumPerYear = blsDf.groupBy("series_id", "year").agg(F.sum("value").alias("yearly_sum"))

# max_by picks the year on the row with the largest yearly_sum; max returns that sum.
# NOTE(review): if two years tie on yearly_sum, which year is reported may not be
# deterministic — confirm that is acceptable.
bestYear = sumPerYear.groupBy("series_id").agg(
    F.max_by("year", "yearly_sum").alias("year"),
    F.max("yearly_sum").alias("yearly_sum")
)

bestYear.show()

# Write to a Delta table in Unity Catalog. The target uses the `Schema` parameter
# (as the notebook's other output tables do) rather than a hard-coded schema name,
# so running against another schema does not overwrite a shared table.
# The name is built once so the write and the log line always agree.
bestYearTable = f"{Catalog}.{Schema}.best_year"
bestYear.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable(bestYearTable)

print(f"✓ Saved to table: {bestYearTable}")

In [0]:
# Load the population JSON document (parsed in multi-line mode) from the
# Unity Catalog volume.
populationRaw = spark.read.option("multiline", True).json(PopulationFile)

# Explode the "data" field into one row per entry, then promote each entry's
# fields to top-level columns.
populationDf = (
    populationRaw
    .select(F.explode("data").alias("data"))
    .select("data.*")
)

populationDf.printSchema()
populationDf.show()

In [0]:
# Mean and standard deviation of Population over a fixed range of years.
# The filtered rows get their own name instead of overwriting populationDf:
# the later cell that left-joins BLS rows against populationDf would otherwise
# only find populations for this year range.
statsFirstYear = 2013
statsLastYear = 2018
populationInRangeDf = populationDf.filter(
    (F.col("Year") >= statsFirstYear) & (F.col("Year") <= statsLastYear)  # both bounds inclusive
)

populationStats = populationInRangeDf.agg(
    F.mean("Population").alias("mean_population"),
    F.stddev("Population").alias("std_population")
)

populationStats.show()

# Write to Delta table in Unity Catalog (replaces any existing table contents)
populationStats.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable(f"{Catalog}.{Schema}.population_stats")

print(f"✓ Saved to table: {Catalog}.{Schema}.population_stats")

In [0]:
# Report: BLS rows for one series and period, each paired with the
# Population value of the matching year.
targetSeries = "PRS30006032"
targetPeriod = "Q01"

isTargetRow = (F.col("series_id") == targetSeries) & (F.col("period") == targetPeriod)
sameYear = blsDf.year == populationDf.Year

# Left join: BLS rows with no matching population year are kept, with a null Population
selectedBls = blsDf.filter(isTargetRow)
joinedWithPopulation = selectedBls.join(populationDf, sameYear, how="left")
report = joinedWithPopulation.select(blsDf["*"], "Population")

report.show()

# Persist the report as a Delta table in Unity Catalog, replacing any previous contents
(
    report.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(f"{Catalog}.{Schema}.bls_with_population")
)

print(f"✓ Saved to table: {Catalog}.{Schema}.bls_with_population")