- **Name:** 001_Introduction
- **Author:** Shamas Imran
- **Description:** Implementing aggregations using DataFrames

In [0]:
from pyspark.sql.types import *
from pyspark.sql import Row
from datetime import date 
import random
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# -------------------------------
# 1. Student DataFrame
# -------------------------------
# Column layout for the student table; only StudentID is declared non-nullable.
student_schema = (
    StructType()
    .add('StudentID', IntegerType(), False)
    .add('StudentName', StringType(), True)
    .add('StudentAge', IntegerType(), True)
)

# One (StudentID, StudentName, StudentAge) tuple per student.
student_data = [
    (1, "Alice", 34),
    (2, "Bob", 45),
    (3, "Charlie", 29),
    (4, "Shamas", 40),
]

# `spark` is the session object supplied by the notebook environment.
df_student = spark.createDataFrame(data=student_data, schema=student_schema)

# -------------------------------
# 2. Course DataFrame
# -------------------------------
# (column name, Spark type, nullable) for each course column, in table order.
course_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ('CourseID', IntegerType(), False),
        ('CourseName', StringType(), True),
        ('CourseTitle', StringType(), True),
    )
])

# One (CourseID, CourseName, CourseTitle) tuple per course.
# NOTE(review): CourseTitle holds digit strings here - presumably a course
# code rather than a title; confirm the intended meaning.
course_data = [
    (1, "Physics", "1111"),
    (2, "Chemistry", "2222"),
    (3, "English", "3333"),
    (4, "Computer Science", "4444"),
]

df_course = spark.createDataFrame(data=course_data, schema=course_schema)

# -------------------------------
# 3. Enrollment DataFrame
# -------------------------------
# Link table between students and courses. The *_FK columns carry the
# StudentID / CourseID values used in the student and course data.
enrollment_schema = (
    StructType()
    .add("EnrollmentID", IntegerType(), False)
    .add("StudentID_FK", IntegerType(), False)
    .add("CourseID_FK", IntegerType(), False)
    .add("EnrollmentDate", DateType(), True)
)

# (EnrollmentID, StudentID_FK, CourseID_FK, EnrollmentDate)
enrollment_data = [
    # Alice -> Physics
    (1, 1, 1, date(2023, 9, 1)),
    # Bob -> Chemistry
    (2, 2, 2, date(2023, 9, 2)),
    # Shamas -> Computer Science
    (3, 4, 4, date(2023, 9, 4)),
    # Alice -> Chemistry
    (4, 1, 2, date(2023, 9, 5)),
]

df_enrollment = spark.createDataFrame(data=enrollment_data, schema=enrollment_schema)

# -------------------------------
# 4. Score DataFrame
# -------------------------------
# Target layout of the score table; each row references one enrollment.
score_schema = (
    StructType()
    .add("ScoreID", IntegerType(), False)
    .add("EnrollmentID_FK", IntegerType(), False)
    .add("Semester", StringType(), True)
    .add("Score", IntegerType(), True)
)

# Pool of semester labels that scores are drawn from.
semesters = ["2023-Spring", "2023-Fall", "2024-Spring", "2024-Fall", "2025-Spring"]

# Pull the enrollment keys back to the driver so scores can be generated locally.
enrollment_ids = [row["EnrollmentID"] for row in df_enrollment.collect()]

# Every enrollment receives scores for 2-4 distinct, randomly chosen semesters.
# Each score is a random integer in [60, 100]; ScoreID is a running counter
# starting at 1. The RNG is not seeded, so the data differs between runs.
score_data = []
score_id = 1
for enrollment_id in enrollment_ids:
    semester_count = random.randint(2, 4)
    selected_semesters = random.sample(semesters, k=semester_count)
    for sem in selected_semesters:
        score_row = Row(
            ScoreID=score_id,
            EnrollmentID_FK=enrollment_id,
            Semester=sem,
            Score=random.randint(60, 100),
        )
        score_data.append(score_row)
        score_id += 1

df_score = spark.createDataFrame(score_data, schema=score_schema)

In [0]:
# Expose each DataFrame to Spark SQL under a session-scoped temp view name so
# the %sql cells can query them.
for view_name, frame in (
    ("Student", df_student),
    ("Course", df_course),
    ("Enrollment", df_enrollment),
    ("Score", df_score),
):
    frame.createOrReplaceTempView(view_name)

In [0]:
%sql
-- Moving average of scores per course.
-- Rows are grouped by course and sorted by score; each row's average covers
-- itself plus up to 2 rows before and 2 rows after it (at most 5 rows, fewer
-- near the start and end of a partition).
SELECT
    e.CourseID_FK,
    s.Score,
    AVG(s.Score) OVER (
        PARTITION BY e.CourseID_FK
        ORDER BY s.Score ASC
        ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING
    ) AS MovingAvg
FROM Score AS s
INNER JOIN Enrollment AS e
    ON e.EnrollmentID = s.EnrollmentID_FK;


In [0]:
%sql
-- Example 1: Running average score per course
-- Rows are partitioned by course and sorted by score; the frame spans from the
-- first row of the partition up to and including the current row, so each row
-- shows the average of all scores seen so far in that ordering.
-- The views are referenced with the exact names they were registered under
-- (Score, Enrollment) so the query does not rely on case-insensitive
-- identifier resolution.
SELECT
    e.CourseID_FK,
    s.Score,
    AVG(s.Score) OVER (
        PARTITION BY e.CourseID_FK
        ORDER BY s.Score
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS RunningAvg
FROM Score s
JOIN Enrollment e
    ON s.EnrollmentID_FK = e.EnrollmentID;

In [0]:
%sql
-- Example 2: Running total per course.
-- Within each course, scores are sorted ascending and summed cumulatively from
-- the first row of the partition through the current row.
SELECT
    e.CourseID_FK,
    s.Score,
    SUM(s.Score) OVER (
        PARTITION BY e.CourseID_FK
        ORDER BY s.Score ASC
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS RunningTotal
FROM Score AS s
INNER JOIN Enrollment AS e
    ON e.EnrollmentID = s.EnrollmentID_FK;

In [0]:
%sql
-- Forward running total per course.
-- The frame starts at the current row and extends to the last row of the
-- partition, so each row shows the sum of its own score and every later score
-- in the ascending-score ordering.
SELECT
    e.CourseID_FK,
    s.Score,
    SUM(s.Score) OVER (
        PARTITION BY e.CourseID_FK
        ORDER BY s.Score ASC
        ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
    ) AS ForwardTotal
FROM Score AS s
INNER JOIN Enrollment AS e
    ON e.EnrollmentID = s.EnrollmentID_FK;

In [0]:
%sql
-- Row difference: current score minus the previous score in the same course.
-- LAG with its default offset of one row looks one row back in the
-- ascending-score ordering; the first row of each partition has no
-- predecessor, so its RowDiff is NULL.
SELECT
    e.CourseID_FK,
    s.Score,
    s.Score - LAG(s.Score) OVER (
        PARTITION BY e.CourseID_FK
        ORDER BY s.Score ASC
    ) AS RowDiff
FROM Score AS s
INNER JOIN Enrollment AS e
    ON e.EnrollmentID = s.EnrollmentID_FK;