In [0]:
spark

In [0]:
spark.conf.set(
  "fs.azure.account.key.hexastore1234.blob.core.windows.net",
  "Zo+EUYTy5ACS3dZ3yg01mSV+Gc8Ts4FtPL1zxI9ohStKv1El9YjmtvVTz9Om8q7H9bvfQNMk5rSK+ASt6al4Ng=="
)
 
df = spark.read.option("header", True).option("inferSchema", True).csv(
  "wasbs://data@hexastore1234.blob.core.windows.net/course_enrollments.csv"
)
 
df.show()

+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|  NULL| Inactive|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|   4.2|   Active|
|      ENR006|    Ibrahim|Python for Beginners|Programming|2024-05-18|             90|   4.6|Completed|
+------------+-----------+--------------------+-----------+-----

In [0]:
#Exercise Set – Online Course Use Case
# Data Loading
# 1. Load the data with schema inference enabled.
df_inferred = spark.read.option("header", True).option("inferSchema", True).csv(
  "wasbs://data@hexastore1234.blob.core.windows.net/course_enrollments.csv"
)
df_inferred.printSchema()

# 2. Manually define schema and compare both approaches.
from pyspark.sql.types import *

manual_schema = StructType([
    StructField("EnrollmentID", StringType(), True),
    StructField("StudentName", StringType(), True),
    StructField("CourseName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("EnrollDate", DateType(), True),
    StructField("ProgressPercent", IntegerType(), True),
    StructField("Rating", DoubleType(), True),
    StructField("Status", StringType(), True)
])

df_manual = spark.read.option("header", True).schema(manual_schema).csv(
  "wasbs://data@hexastore1234.blob.core.windows.net/course_enrollments.csv"
)
df_manual.printSchema()

root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status: string (nullable = true)

root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status: string (nullable = true)



In [0]:
# Filtering and Transformation
# 3. Filter records where ProgressPercent < 50 .
ppgt50 = df.filter(df.ProgressPercent < 50)
ppgt50.show()

# 4. Replace null ratings with average rating.
from pyspark.sql.functions import avg

avg_rating = df.select(avg("Rating")).first()[0]
replace_null = df.na.fill({'Rating': avg_rating})
replace_null.show()

# 5. Add column IsActive → 1 if Status is Active, else 0.
from pyspark.sql.functions import when, col

df_status = replace_null.withColumn("IsActive", when(col("Status") == "Active", 1).otherwise(0))
df_status.show()

+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|EnrollmentID|StudentName|         CourseName|   Category|EnrollDate|ProgressPercent|Rating|  Status|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|      ENR003|     Aakash|Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|  Active|
|      ENR004|       Neha|        Java Basics|Programming|2024-05-15|              0|  NULL|Inactive|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+
|      ENR001|     Aditya|Python for Begin

In [0]:
# Aggregations & Metrics
# 6. Find average progress by course.
avg_prog_course = df_status.groupBy("CourseName").avg("ProgressPercent")
avg_prog_course.show()

# 7. Get count of students in each course category.
count_category = df_status.groupBy("Category").count()
count_category.show()

# 8. Identify the most enrolled course.
from pyspark.sql.functions import desc

high_enrolled_course = df_status.groupBy("CourseName").count().orderBy(desc("count")).limit(1)
high_enrolled_course.show()

+--------------------+--------------------+
|          CourseName|avg(ProgressPercent)|
+--------------------+--------------------+
|Data Analysis wit...|               100.0|
|         Java Basics|                 0.0|
|Machine Learning 101|                60.0|
|Python for Beginners|                85.0|
| Power BI Essentials|                30.0|
+--------------------+--------------------+

+-----------+-----+
|   Category|count|
+-----------+-----+
|Programming|    3|
|         AI|    1|
|  Analytics|    2|
+-----------+-----+

+--------------------+-----+
|          CourseName|count|
+--------------------+-----+
|Python for Beginners|    2|
+--------------------+-----+



In [0]:
# Joins
# 9. Create second CSV: course_details.csv
#         CourseName,DurationWeeks,Instructor
#         Python for Beginners,4,Rakesh
#         Data Analysis with Excel,3,Anjali
#         Power BI Essentials,5,Rekha
#         Java Basics,6,Manoj
#         Machine Learning 101,8,Samir
df_course_details = spark.read.option("header", True).csv(
  "wasbs://data@hexastore1234.blob.core.windows.net/course_details.csv"
)
df_course_details.show()

# 10. Join course_enrollments with course_details to include duration and instructor.
df_joined = df_status.join(df_course_details, on="CourseName", how="left")
df_joined.show()

+--------------------+-------------+----------+
|          CourseName|DurationWeeks|Instructor|
+--------------------+-------------+----------+
|Python for Beginners|            4|    Rakesh|
|Data Analysis wit...|            3|    Anjali|
| Power BI Essentials|            5|     Rekha|
|         Java Basics|            6|     Manoj|
|Machine Learning 101|            8|     Samir|
+--------------------+-------------+----------+

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|Python for Beginners|      ENR001|     Aditya|Programming|2024-05-10|             80|              4.5|   Active|      

In [0]:
# Window Functions
# 11. Rank students in each course based on ProgressPercent .
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

window_spec = Window.partitionBy("CourseName").orderBy(col("ProgressPercent").desc())
df_ranked = df_joined.withColumn("Rank", rank().over(window_spec))
df_ranked.select("StudentName", "CourseName", "ProgressPercent", "Rank").show()

# 12. Get lead and lag of EnrollDate by Category.
from pyspark.sql.functions import lead, lag

window_cat = Window.partitionBy("Category").orderBy("EnrollDate")
df_leadlag = df_joined.withColumn("LeadDate", lead("EnrollDate", 1).over(window_cat)) \
                      .withColumn("LagDate", lag("EnrollDate", 1).over(window_cat))
df_leadlag.select("StudentName", "Category", "EnrollDate", "LeadDate", "LagDate").show()

+-----------+--------------------+---------------+----+
|StudentName|          CourseName|ProgressPercent|Rank|
+-----------+--------------------+---------------+----+
|     Simran|Data Analysis wit...|            100|   1|
|       Neha|         Java Basics|              0|   1|
|       Zara|Machine Learning 101|             60|   1|
|     Aakash| Power BI Essentials|             30|   1|
|    Ibrahim|Python for Beginners|             90|   1|
|     Aditya|Python for Beginners|             80|   2|
+-----------+--------------------+---------------+----+

+-----------+-----------+----------+----------+----------+
|StudentName|   Category|EnrollDate|  LeadDate|   LagDate|
+-----------+-----------+----------+----------+----------+
|       Zara|         AI|2024-05-17|      NULL|      NULL|
|     Simran|  Analytics|2024-05-12|2024-05-13|      NULL|
|     Aakash|  Analytics|2024-05-13|      NULL|2024-05-12|
|     Aditya|Programming|2024-05-10|2024-05-15|      NULL|
|       Neha|Programming|2

In [0]:
# Pivoting & Formatting
# 13. Pivot data to show total enrollments by Category and Status.
df_pivot = df_joined.groupBy("Category").pivot("Status").count()
df_pivot.show()

# 14. Extract year and month from EnrollDate .
from pyspark.sql.functions import year, month

df_dated = df_joined.withColumn("EnrollYear", year("EnrollDate")) \
                    .withColumn("EnrollMonth", month("EnrollDate"))
df_dated.select("EnrollmentID", "EnrollDate", "EnrollYear", "EnrollMonth").show()

+-----------+------+---------+--------+
|   Category|Active|Completed|Inactive|
+-----------+------+---------+--------+
|Programming|     1|        1|       1|
|         AI|     1|     NULL|    NULL|
|  Analytics|     1|        1|    NULL|
+-----------+------+---------+--------+

+------------+----------+----------+-----------+
|EnrollmentID|EnrollDate|EnrollYear|EnrollMonth|
+------------+----------+----------+-----------+
|      ENR001|2024-05-10|      2024|          5|
|      ENR002|2024-05-12|      2024|          5|
|      ENR003|2024-05-13|      2024|          5|
|      ENR004|2024-05-15|      2024|          5|
|      ENR005|2024-05-17|      2024|          5|
|      ENR006|2024-05-18|      2024|          5|
+------------+----------+----------+-----------+



In [0]:
# Cleaning and Deduplication
# 15. Drop rows where Status is null or empty.
df_clean = df_dated.filter((col("Status").isNotNull()) & (col("Status") != ""))
df_clean.show()

# 16. Remove duplicate enrollments using dropDuplicates() . 
df_final = df_clean.dropDuplicates(["EnrollmentID"])
df_final.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----------+-----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|EnrollYear|EnrollMonth|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----------+-----------+
|Python for Beginners|      ENR001|     Aditya|Programming|2024-05-10|             80|              4.5|   Active|       1|            4|    Rakesh|      2024|          5|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|      2024|          5|
| Power BI Essentials|      ENR003|     Aakash|  Analytics|2024-05-13|             30|              3.8|   Active|       1|            5|   

In [0]:
# Export
# 17. Write the final cleaned DataFrame to:
# CSV (overwrite mode)
df_final.write.mode("overwrite").option("header", True).csv(
  "wasbs://data@hexastore1234.blob.core.windows.net/output/final_csv"
)

# JSON (overwrite mode)
df_final.write.mode("overwrite").json(
  "wasbs://data@hexastore1234.blob.core.windows.net/output/final_json"
)

# Parquet (snappy compression)
df_final.write.mode("overwrite").option("compression", "snappy").parquet(
  "wasbs://data@hexastore1234.blob.core.windows.net/output/final_parquet"
)
