
# EDA & Data Quality Checks
This notebook section performs baseline EDA and data quality checks on:
1. BLS time-series file (pr.data.0.Current)
2. Population API JSON (population.json)

##### The goal is to validate schema, completeness, uniqueness, and numeric sanity before downstream analytics.

## Config & Paths

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import datetime as dt

CATALOG = "rearc_quest"
SCHEMA  = "lakehouse"

spark.sql(f"USE CATALOG {CATALOG}")
spark.sql(f"USE SCHEMA {SCHEMA}")

BLS_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_bls/pr.data.0.Current"
POP_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_datausa/population.json"

RUN_TS_UTC = dt.datetime.utcnow().isoformat() + "Z"

print("BLS_PATH:", BLS_PATH)
print("POP_PATH:", POP_PATH)
print("RUN_TS_UTC:", RUN_TS_UTC)


BLS_PATH: /Volumes/rearc_quest/lakehouse/raw_bls/pr.data.0.Current
POP_PATH: /Volumes/rearc_quest/lakehouse/raw_datausa/population.json
RUN_TS_UTC: 2026-01-21T04:31:14.320006Z


  RUN_TS_UTC = dt.datetime.utcnow().isoformat() + "Z"


## Load Data

In [0]:
# ------------------------------------------------------------------------------
# Read BLS fixed-width/space-delimited text
# Format (simplified): series_id year period value
# We treat the raw file as append-only snapshot content and normalize into columns
# ------------------------------------------------------------------------------
bls_raw = spark.read.text(BLS_PATH)

# Split on whitespace; trim first to avoid empty tokens
parts = F.split(F.trim(F.col("value")), r"\s+")

bls = (
    bls_raw
    # Remove blank / whitespace-only lines
    .where(F.length(F.trim("value")) > 0)
    .select(
        parts.getItem(0).alias("series_id"),
        parts.getItem(1).try_cast("int").alias("year"),
        parts.getItem(2).alias("period"),
        parts.getItem(3).cast("double").alias("value")
    )
    # Filters: ensure essential fields exist
    .where(
        F.col("series_id").isNotNull() &
        F.col("year").isNotNull() &
        F.col("period").isNotNull() &
        F.col("value").isNotNull()
    )
)

display(bls.limit(10))


series_id,year,period,value
PRS30006011,1995,Q01,2.6
PRS30006011,1995,Q02,2.1
PRS30006011,1995,Q03,0.9
PRS30006011,1995,Q04,0.1
PRS30006011,1995,Q05,1.4
PRS30006011,1996,Q01,-0.2
PRS30006011,1996,Q02,-0.3
PRS30006011,1996,Q03,-0.1
PRS30006011,1996,Q04,0.2
PRS30006011,1996,Q05,-0.1


In [0]:
from pyspark.sql.functions import explode

# Read the nested JSON with multiLine option
population_raw = spark.read.option("multiLine", "true").json("/Volumes/rearc_quest/lakehouse/raw_datausa/population.json")

# Extract the nested 'data' array and explode it
pop_raw = population_raw.select(explode("data").alias("record")).select("record.*")

display(df)


Nation,Nation ID,Population,Year
United States,01000US,316128839,2013
United States,01000US,318857056,2014
United States,01000US,321418821,2015
United States,01000US,323127515,2016
United States,01000US,325719178,2017
United States,01000US,327167439,2018
United States,01000US,328239523,2019
United States,01000US,331893745,2021
United States,01000US,333287562,2022
United States,01000US,334914896,2023


## Row Counts & Key Cardinalities

In [0]:
bls_rows = bls.count()
pop_rows = pop_raw.count()

bls_series = bls.select("series_id").distinct().count()
bls_years  = bls.select("year").distinct().count()
pop_years  = pop_raw.select("Year").distinct().count()

print("=== Basic Counts ===")
print("BLS rows:", bls_rows)
print("Population rows:", pop_rows)
print("Distinct BLS series_id:", bls_series)
print("Distinct BLS years:", bls_years)
print("Distinct Population years:", pop_years)


=== Basic Counts ===
BLS rows: 37521
Population rows: 10
Distinct BLS series_id: 282
Distinct BLS years: 31
Distinct Population years: 10


## Duplicate Checks (Full row + Business keys)

In [0]:
# Full-row duplicates
bls_dup_rows = bls_rows - bls.dropDuplicates().count()
pop_dup_rows = pop_rows - pop_raw.dropDuplicates().count()

print("=== Duplicate Checks ===")
print("BLS full-row duplicates:", bls_dup_rows)
print("Population full-row duplicates:", pop_dup_rows)

# Business key duplicates
bls_key_dups = (
    bls.groupBy("series_id", "year", "period")
       .count()
       .filter(F.col("count") > 1)
)

print("BLS business-key duplicates (should be 0):", bls_key_dups.count())
display(bls_key_dups.orderBy(F.desc("count")).limit(50))


=== Duplicate Checks ===
BLS full-row duplicates: 0
Population full-row duplicates: 0
BLS business-key duplicates (should be 0): 0


series_id,year,period,count


## Null Profile (Counts + Percent)

In [0]:
def null_profile(df, df_name: str):
    total = df.count()
    counts = df.select([
        F.sum(F.col(c).isNull().cast("int")).alias(c) for c in df.columns
    ])
    perc = df.select([
        (F.sum(F.col(c).isNull().cast("int")) / F.lit(total) * 100).alias(c) for c in df.columns
    ])
    print(f"=== Null Counts: {df_name} ===")
    display(counts)
    print(f"=== Null %: {df_name} ===")
    display(perc)

null_profile(bls, "BLS")
null_profile(pop_raw, "Population")


=== Null Counts: BLS ===


series_id,year,period,value
0,0,0,0


=== Null %: BLS ===


series_id,year,period,value
0.0,0.0,0.0,0.0


=== Null Counts: Population ===


Nation,Nation ID,Population,Year
0,0,0,0


=== Null %: Population ===


Nation,Nation ID,Population,Year
0.0,0.0,0.0,0.0


## Domain Validations (Quarters, Year range, basic constraints)

In [0]:
print("=== Domain Checks: BLS period values ===")
display(bls.select("period").distinct().orderBy("period"))

print("=== Domain Checks: year range ===")
display(bls.select(F.min("year").alias("min_year"), F.max("year").alias("max_year")))
display(pop_raw.select(F.min("Year").alias("min_year"), F.max("Year").alias("max_year")))

# Sanity constraints
neg_bls = bls.filter(F.col("value") < 0).count()
bad_pop = pop_raw.filter(F.col("Population") <= 0).count()

print("Negative BLS values:", neg_bls)
print("Non-positive Population values:", bad_pop)


=== Domain Checks: BLS period values ===


period
Q01
Q02
Q03
Q04
Q05


=== Domain Checks: year range ===


min_year,max_year
1995,2025


min_year,max_year
2013,2023


Negative BLS values: 6828
Non-positive Population values: 0


## Completeness Check (Quarter coverage per series/year)

In [0]:
expected_quarters = 4

completeness = (
    bls.groupBy("series_id", "year")
       .agg(F.countDistinct("period").alias("quarters_present"))
       .withColumn("is_complete_year", F.col("quarters_present") == expected_quarters)
)

display(
    completeness.groupBy("quarters_present").count().orderBy("quarters_present")
)

display(
    completeness.filter(~F.col("is_complete_year"))
                .orderBy(F.desc("quarters_present"))
                .limit(50)
)


quarters_present,count
1,1260
3,237
5,7110


series_id,year,quarters_present,is_complete_year
PRS88003092,2018,5,False
PRS30006162,1995,5,False
PRS88003202,2018,5,False
PRS88003141,2010,5,False
PRS84006113,2018,5,False
PRS85006033,2021,5,False
PRS84006033,2017,5,False
PRS84006163,2001,5,False
PRS31006023,2005,5,False
PRS88003172,2024,5,False


## Descriptive Statistics (Numeric)

In [0]:
print("=== Descriptive Stats: BLS value ===")
display(bls.select("value").describe())

print("=== Descriptive Stats: Population ===")
display(pop_raw.select("Population").describe())


=== Descriptive Stats: BLS value ===


summary,value
count,37521.0
mean,32.35007763652368
stddev,44.79083425559319
min,-54.2
max,412.8


=== Descriptive Stats: Population ===


summary,Population
count,10.0
mean,326075457.4
stddev,6248735.873845867
min,316128839.0
max,334914896.0


## Outlier Detection (IQR, Spark-native)

In [0]:
q1, q3 = bls.approxQuantile("value", [0.25, 0.75], 0.01)
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr

outliers = bls.filter((F.col("value") < lower) | (F.col("value") > upper))

print("IQR bounds:", lower, upper)
print("Outlier rows:", outliers.count())

display(outliers.select("series_id", "year", "period", "value").orderBy(F.desc("value")).limit(50))


IQR bounds: -123.63799999999999 207.93
Outlier rows: 10


series_id,year,period,value
PRS88003182,2020,Q03,412.8
PRS88003192,2020,Q03,264.3
PRS88003183,2025,Q03,217.549
PRS88003183,2023,Q04,214.425
PRS88003183,2024,Q04,214.029
PRS88003183,2025,Q02,212.144
PRS88003183,2025,Q01,211.117
PRS88003183,2024,Q03,210.634
PRS88003183,2024,Q02,210.231
PRS88003183,2024,Q05,209.468


## Chart

In [0]:

display(bls.groupBy("year").count().orderBy("year"))

# Population trend (line chart)
display(pop_raw.groupBy("Year")
                 .agg(F.sum("Population").alias("total_population"))
                 .orderBy("Year"))

# BLS value distribution (histogram via display on a single column)
display(bls.select("value"))


year,count
1995,1230
1996,1230
1997,1230
1998,1230
1999,1230
2000,1230
2001,1230
2002,1230
2003,1230
2004,1230


Year,total_population
2013,316128839
2014,318857056
2015,321418821
2016,323127515
2017,325719178
2018,327167439
2019,328239523
2021,331893745
2022,333287562
2023,334914896


Databricks visualization. Run in Databricks to view.

value
2.6
2.1
0.9
0.1
1.4
-0.2
-0.3
-0.1
0.2
-0.1


Databricks visualization. Run in Databricks to view.

## Data Quality Summary

In [0]:
dq_summary = {
    "run_utc": RUN_TS_UTC,
    "bls_rows": bls_rows,
    "bls_distinct_series_id": bls_series,
    "bls_distinct_years": bls_years,
    "bls_full_row_duplicates": bls_dup_rows,
    "population_rows": pop_rows,
    "population_distinct_years": pop_years,
    "population_full_row_duplicates": pop_dup_rows,
    "bls_negative_values": neg_bls,
    "population_non_positive_values": bad_pop,
    "bls_outlier_rows_iqr": outliers.count()
}

dq_summary


{'run_utc': '2026-01-21T04:31:14.320006Z',
 'bls_rows': 37521,
 'bls_distinct_series_id': 282,
 'bls_distinct_years': 31,
 'bls_full_row_duplicates': 0,
 'population_rows': 10,
 'population_distinct_years': 10,
 'population_full_row_duplicates': 0,
 'bls_negative_values': 6828,
 'population_non_positive_values': 0,
 'bls_outlier_rows_iqr': 10}

## Persist EDA Summary as a Delta Table

In [0]:
dq_table = f"{CATALOG}.{SCHEMA}.dq_summary_runlog"

dq_df = spark.createDataFrame([dq_summary])
(
    dq_df.write
        .mode("append")
        .format("delta")
        .saveAsTable(dq_table)
)

print(" Appended DQ summary to:", dq_table)


 Appended DQ summary to: rearc_quest.lakehouse.dq_summary_runlog


In [0]:
%sql
select * from rearc_quest.lakehouse.dq_summary_runlog

bls_distinct_series_id,bls_distinct_years,bls_full_row_duplicates,bls_negative_values,bls_outlier_rows_iqr,bls_rows,population_distinct_years,population_full_row_duplicates,population_non_positive_values,population_rows,run_utc
282,31,0,6828,10,37521,10,0,0,10,2026-01-21T04:31:14.320006Z
