In [0]:
spark

In [0]:
# 1. Ingestion & Time Fields
# Load into PySpark with inferred schema
df = spark.read.option("header", True).option("inferSchema", True).csv("file:/Workspace/Shared/Coding Assessment Datasets/course_enrollments.csv")
df.printSchema()
df.show()

# Convert EnrollDate and CompletionDate to date type
from pyspark.sql.functions import to_date, datediff

df = df.withColumn("EnrollDate", to_date("EnrollDate", "dd-MM-yyyy")) \
       .withColumn("CompletionDate", to_date("CompletionDate", "dd-MM-yyyy"))
df.show()
df.printSchema()

# Add DaysToComplete column if completed
df = df.withColumn("DaysToComplete", datediff("CompletionDate", "EnrollDate"))
df.show()

root
 |-- EnrollID: string (nullable = true)
 |-- UserID: string (nullable = true)
 |-- CourseID: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: string (nullable = true)
 |-- CompletionDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: integer (nullable = true)

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+
|    E001|  U001|    C001|    Python Basics| Programming|01-04-2024|    2024-04-10|            100|     4|
|    E002|  U002|    C002|Excel for Finance|Productivity|02-04-2024|          NULL|             45|  NULL|
|    E003|  U003|    C003|  ML with PySpark|Data Science|03-04-2024|          NULL|   

In [0]:
# 2.User Learning Path Progress
from pyspark.sql.functions import col, avg, count, expr, when
# Group by UserID : count of courses enrolled
# Avg progress % across all enrollments
user_progress = df.groupBy("UserID").agg(
    count("CourseID").alias("CoursesEnrolled"),
    avg("ProgressPercent").alias("AvgProgressPercent")
)

user_progress.show()

# Flag IsCompleted = ProgressPercent = 100
df = df.withColumn("IsCompleted", when(col("ProgressPercent") == 100, True).otherwise(False)) 
df.show()

+------+---------------+------------------+
|UserID|CoursesEnrolled|AvgProgressPercent|
+------+---------------+------------------+
|  U004|              1|             100.0|
|  U005|              1|             100.0|
|  U002|              1|              45.0|
|  U003|              1|              30.0|
|  U001|              1|             100.0|
+------+---------------+------------------+

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|             9|       true|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02

In [0]:
# 3. Engagement Scoring
# Create a score: ProgressPercent * Rating (if not null)
from pyspark.sql.functions import when, col

#Replace null Rating with 0
df = df.withColumn("Rating", when(col("Rating").isNull(), 0).otherwise(col("Rating")))
df.show()

df = df.withColumn("EngagementScore", col("ProgressPercent") * col("Rating"))
df.show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|             9|       true|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|     0|          NULL|      false|
|    E003|  U003|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|     0|          NULL|      false|
|    E004|  U004|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|     5|            16|       true|
|    E005|  U005|    C004|Digital Marketing|   Marketing|2024-

In [0]:
# 4.Identify Drop-offs
# Filter all records with ProgressPercent < 50 and CompletionDate is null
dropouts_df = df.filter(
    (col("ProgressPercent") < 50) &
    (col("CompletionDate").isNull())
)

# Create a view called Dropouts
dropouts_df.createOrReplaceTempView("Dropouts")

#view content
spark.sql("SELECT * FROM Dropouts").show()


+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|     0|          NULL|      false|              0|
|    E003|  U003|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|     0|          NULL|      false|              0|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+



In [0]:
# 5. Joins with Metadata
# Create course_catalog.csv :
# CourseID,Instructor,DurationHours,Level
# C001,Abdullah Khan,8,Beginner
# C002,Sana Gupta,5,Beginner
# C003,Ibrahim Khan,10,Intermediate
# C004,Zoya Sheikh,6,Beginner

catalog_df = spark.read.option("header", True).option("inferSchema", True).csv("file:/Workspace/Shared/Coding Assessment Datasets/course_catlog.csv")
catalog_df.createOrReplaceTempView("course_catalog")

In [0]:
df.createOrReplaceTempView("enrollments")

spark.sql("""
SELECT 
    c.Instructor, 
    ROUND(AVG(e.ProgressPercent), 2) AS AvgProgress
FROM enrollments e
JOIN course_catalog c 
  ON e.CourseID = c.CourseID
GROUP BY c.Instructor
ORDER BY AvgProgress DESC
""").show()

spark.sql("""
SELECT 
    e.CourseID, 
    c.Instructor,
    COUNT(*) AS TotalEnrollments
FROM enrollments e
JOIN course_catalog c 
  ON e.CourseID = c.CourseID
GROUP BY e.CourseID, c.Instructor
ORDER BY TotalEnrollments DESC
LIMIT 1
""").show()

+-------------+-----------+
|   Instructor|AvgProgress|
+-------------+-----------+
|Abdullah Khan|      100.0|
|  Zoya Sheikh|      100.0|
|   Sana Gupta|       45.0|
|Inbrahim Khan|       30.0|
+-------------+-----------+

+--------+-------------+----------------+
|CourseID|   Instructor|TotalEnrollments|
+--------+-------------+----------------+
|    C001|Abdullah Khan|               2|
+--------+-------------+----------------+



In [0]:
# 6. Delta Lake Practice
df = spark.read.option("header", True).option("inferSchema", True).csv("file:/Workspace/Shared/Coding Assessment Datasets/course_enrollments.csv")
df.show()
from pyspark.sql.functions import to_date

df = df.withColumn("EnrollDate", to_date("EnrollDate", "dd-MM-yyyy")) \
       .withColumn("CompletionDate", to_date("CompletionDate", "dd-MM-yyyy"))

# Save as Delta Table enrollments_delta
df.write.format("delta").mode("overwrite").saveAsTable("enrollments_delta")

# Update: Set all ratings to 5 where Course = 'Python Basics'
spark.sql("""
UPDATE enrollments_delta
SET Rating = 5
WHERE CourseName = 'Python Basics'
""").show()

# Delete: All rows where ProgressPercent = 0
spark.sql("""
DELETE FROM enrollments_delta
WHERE ProgressPercent = 0
""").show()

# Show DESCRIBE HISTORY
spark.sql("DESCRIBE HISTORY enrollments_delta").show(truncate=False)

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+
|    E001|  U001|    C001|    Python Basics| Programming|01-04-2024|    2024-04-10|            100|     4|
|    E002|  U002|    C002|Excel for Finance|Productivity|02-04-2024|          NULL|             45|  NULL|
|    E003|  U003|    C003|  ML with PySpark|Data Science|03-04-2024|          NULL|             30|  NULL|
|    E004|  U004|    C001|    Python Basics| Programming|04-04-2024|    2024-04-20|            100|     5|
|    E005|  U005|    C004|Digital Marketing|   Marketing|05-04-2024|    2024-04-16|            100|     4|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+

+-----------------+
|num_affected_ro

In [0]:
# 7. Window Functions
# Use dense_rank() to rank courses by number of enrollments
df = spark.read.format("delta").table("enrollments_delta")

from pyspark.sql.functions import count, dense_rank
from pyspark.sql.window import Window

# Count enrollments per course
course_counts = df.groupBy("CourseID", "CourseName").agg(count("*").alias("TotalEnrollments"))

# dense rank window
rank_window = Window.orderBy(course_counts["TotalEnrollments"].desc())

ranked_courses = course_counts.withColumn("Rank", dense_rank().over(rank_window))

ranked_courses.show()

# lead() to find next course by each user (sorted by EnrollDate)
from pyspark.sql.functions import lead
from pyspark.sql.window import Window
user_course_window = Window.partitionBy("UserID").orderBy("EnrollDate")
df_with_next = df.withColumn("NextCourse", lead("CourseName").over(user_course_window))
df_with_next.select("UserID", "CourseName", "EnrollDate", "NextCourse").orderBy("UserID", "EnrollDate")

+--------+-----------------+----------------+----+
|CourseID|       CourseName|TotalEnrollments|Rank|
+--------+-----------------+----------------+----+
|    C001|    Python Basics|               2|   1|
|    C004|Digital Marketing|               1|   2|
|    C002|Excel for Finance|               1|   2|
|    C003|  ML with PySpark|               1|   2|
+--------+-----------------+----------------+----+



DataFrame[UserID: string, CourseName: string, EnrollDate: date, NextCourse: string]

In [0]:
# 8. SQL Logic for Dashboard Views
# Create views:
# daily_enrollments
spark.sql("""
CREATE OR REPLACE TEMP VIEW daily_enrollments AS
SELECT 
  EnrollDate,
  COUNT(*) AS TotalEnrollments
FROM enrollments_delta
GROUP BY EnrollDate
ORDER BY EnrollDate
""")

spark.sql("SELECT * FROM daily_enrollments").show()

# category_performance (avg rating by category)
spark.sql("""
CREATE OR REPLACE TEMP VIEW category_performance AS
SELECT 
  Category,
  ROUND(AVG(Rating), 2) AS AvgRating
FROM enrollments_delta
GROUP BY Category
""")

spark.sql("SELECT * FROM category_performance").show()

# top_3_courses
spark.sql("""
CREATE OR REPLACE TEMP VIEW top_3_courses AS
SELECT 
  CourseName,
  COUNT(*) AS TotalEnrollments
FROM enrollments_delta
GROUP BY CourseName
ORDER BY TotalEnrollments DESC
LIMIT 3
""")

spark.sql("SELECT * FROM top_3_courses").show()

+----------+----------------+
|EnrollDate|TotalEnrollments|
+----------+----------------+
|      NULL|               5|
+----------+----------------+

+------------+---------+
|    Category|AvgRating|
+------------+---------+
| Programming|      5.0|
|Productivity|     NULL|
|   Marketing|      4.0|
|Data Science|     NULL|
+------------+---------+

+-----------------+----------------+
|       CourseName|TotalEnrollments|
+-----------------+----------------+
|    Python Basics|               2|
|Digital Marketing|               1|
|Excel for Finance|               1|
+-----------------+----------------+



In [0]:
# 9. Time Travel
# View previous version before update/delete
spark.sql("DESCRIBE HISTORY enrollments_delta").show(truncate=False)

spark.sql("""
SELECT * FROM enrollments_delta VERSION AS OF 0
""").show()
spark.sql("""
SELECT * FROM enrollments_delta VERSION AS OF 1
""").show()

spark.sql("""
SELECT * FROM enrollments_delta
""").show()

spark.sql("""
SELECT * FROM enrollments_delta TIMESTAMP AS OF '2025-06-19T05:57:27Z'
""").show()

spark.sql("""
CREATE OR REPLACE TEMP VIEW enrollments_before_update AS
SELECT * FROM enrollments_delta VERSION AS OF 0
""")
spark.sql("SELECT * FROM enrollments_before_update").show()


+-------+-------------------+----------------+----------------------------------+---------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+----+-----------------+--------------------+-----------+-----------------+-------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+------------------------------------------+
|version|timestamp          |userId          |userName                          |operation                        |operationParameters                                                                                                                                     |job 

In [0]:
#Export Reporting
df = spark.read.format("delta").table("enrollments_delta")

# Write to JSON, partitioned by Category
df.write.mode("overwrite") \
  .partitionBy("Category") \
  .json("dbfs:/Workspace/Shared/Exports/enrollments_json_partitioned")

# Create summary DataFrame: CourseName, TotalEnrollments, AvgRating, AvgProgress
from pyspark.sql.functions import count, avg, round

summary_df = df.groupBy("CourseName").agg(
    count("*").alias("TotalEnrollments"),
    round(avg("Rating"), 2).alias("AvgRating"),
    round(avg("ProgressPercent"), 2).alias("AvgProgress")
)
summary_df.show()

# Save as Parquet
summary_df.write.mode("overwrite") \
  .parquet("dbfs:/Workspace/Shared/Exports/course_summary.parquet")

+-----------------+----------------+---------+-----------+
|       CourseName|TotalEnrollments|AvgRating|AvgProgress|
+-----------------+----------------+---------+-----------+
|Digital Marketing|               1|      4.0|      100.0|
|    Python Basics|               2|      5.0|      100.0|
|Excel for Finance|               1|     NULL|       45.0|
|  ML with PySpark|               1|     NULL|       30.0|
+-----------------+----------------+---------+-----------+

