In [0]:
# MAGIC %md
# MAGIC # Week 4: ETL Pipeline for Dashboard Reporting
# MAGIC
# MAGIC This notebook performs the final ETL to prepare the student progress data for consumption by BI tools, saving the result as a Delta table.

# COMMAND
from pyspark.sql.functions import col, round, lit

# --- 1. MOCK Data Loading (stand-in for Week 3's cleaned output) ---
# A production run would read the persisted Week 3 result instead, e.g.:
# df_progress = spark.read.format("delta").load("/delta/week3_analysis_output")

# Column names, listed in the same positional order as the fields of each
# tuple in data_etl below.
columns_etl = [
    "student_id",
    "student_name",
    "course_id",
    "course_name",
    "enrollment_date",
    "modules_completed",
    "total_modules",
    "progress_percentage",
    "status",
]

# MOCK rows combining the data needed from Weeks 1-3. enrollment_date is held
# as an ISO-formatted string here. In every mock row progress_percentage equals
# modules_completed / total_modules * 100.
data_etl = [
    (101, 'Alice Johnson', 501, 'Data Science Fundamentals', '2023-09-01', 4.0, 10, 40.00, 'In Progress'),
    (102, 'Bob Smith', 502, 'Cloud Computing Basics', '2023-10-05', 8.0, 8, 100.00, 'Completed'),
    (103, 'Carol Lee', 501, 'Data Science Fundamentals', '2023-11-10', 1.0, 10, 10.00, 'In Progress'),
    (101, 'Alice Johnson', 503, 'Advanced SQL & Databases', '2023-12-01', 0.0, 12, 0.00, 'Dropped'),
    (104, 'David Brown', 504, 'Intro to Tableau', '2023-12-15', 3.0, 15, 20.00, 'In Progress'),
]

# Column types are inferred by Spark from the Python values in the tuples.
df_raw_cleaned = spark.createDataFrame(data_etl, schema=columns_etl)

# Expose the DataFrame under a view name so SQL can query it.
df_raw_cleaned.createOrReplaceTempView("raw_progress_vw")

print("Cleaned data loaded into Databricks temporary view.")

# COMMAND

Cleaned data loaded into Databricks temporary view.


In [0]:
%sql

-- Reporting view: the dashboard-facing projection of raw_progress_vw.
--   * course_name and status are exposed as course / completion_status
--   * enrollment_date is cast to DATE
--   * progress_percentage is rounded to 2 decimal places
--   * rows with a NULL student_name are excluded
-- The view intentionally has no ORDER BY: row order is a presentation
-- concern, so each consumer that needs a particular order sorts explicitly
-- (as the preview query below does) instead of relying on the view.
CREATE OR REPLACE TEMPORARY VIEW final_report_data_vw AS
SELECT
    student_name,
    course_name AS course,
    CAST(enrollment_date AS DATE) AS enrollment_date,
    ROUND(progress_percentage, 2) AS progress_percentage,
    status AS completion_status
FROM
    raw_progress_vw
WHERE
    student_name IS NOT NULL;

-- Preview the report with an explicit sort so the displayed order is stable.
SELECT
    *
FROM
    final_report_data_vw
ORDER BY
    student_name, enrollment_date;

student_name,course,enrollment_date,progress_percentage,completion_status
Alice Johnson,Data Science Fundamentals,2023-09-01,40.0,In Progress
Alice Johnson,Advanced SQL & Databases,2023-12-01,0.0,Dropped
Bob Smith,Cloud Computing Basics,2023-10-05,100.0,Completed
Carol Lee,Data Science Fundamentals,2023-11-10,10.0,In Progress
David Brown,Intro to Tableau,2023-12-15,20.0,In Progress


In [0]:
# LANGUAGE: Python/PySpark (Databricks Notebook Cell)

# Define the storage path for the Delta table (e.g., in ADLS)
delta_path = "/mnt/datalake/online_course_tracker/final_progress_report_delta"

# Capstone Task: Save results in Delta for dashboarding
print(f"Writing final report data to Delta Lake at: {delta_path}")

# Reference the reporting view once so the write and the verification below
# are guaranteed to be talking about the same source.
report_df = spark.table("final_report_data_vw")

# mode("overwrite") replaces whatever is already stored at delta_path;
# overwriteSchema additionally allows the stored schema to be replaced, so the
# cell can be re-run after the view's columns change.
(report_df
  .write
  .format("delta")
  .mode("overwrite")
  .option("overwriteSchema", "true")
  .save(delta_path)
)

print("\nSuccessfully created FINAL_PROGRESS_REPORT Delta Table.")

# Verification: read the data back from the newly written Delta location
print("\nVerifying final output from Delta Table:")
written_df = spark.read.format("delta").load(delta_path)

# Fail loudly if the stored row count does not match the source view, rather
# than only eyeballing the printed sample.
expected_rows = report_df.count()
actual_rows = written_df.count()
assert actual_rows == expected_rows, (
    f"Row count mismatch after Delta write: expected {expected_rows}, found {actual_rows}"
)

# A table scan has no guaranteed row order, so sort explicitly for display.
written_df.orderBy("student_name", "enrollment_date").show(truncate=False)

Writing final report data to Delta Lake at: /mnt/datalake/online_course_tracker/final_progress_report_delta

Successfully created FINAL_PROGRESS_REPORT Delta Table.

Verifying final output from Delta Table:
+-------------+-------------------------+---------------+-------------------+-----------------+
|student_name |course                   |enrollment_date|progress_percentage|completion_status|
+-------------+-------------------------+---------------+-------------------+-----------------+
|Alice Johnson|Data Science Fundamentals|2023-09-01     |40.0               |In Progress      |
|Alice Johnson|Advanced SQL & Databases |2023-12-01     |0.0                |Dropped          |
|Bob Smith    |Cloud Computing Basics   |2023-10-05     |100.0              |Completed        |
|Carol Lee    |Data Science Fundamentals|2023-11-10     |10.0               |In Progress      |
|David Brown  |Intro to Tableau         |2023-12-15     |20.0               |In Progress      |
+-------------+----------