- **Name:** 001_Introduction
- **Author:** Shamas Imran
- **Description:** Implementing in SQL the same aggregations that we previously implemented using the DataFrame API

In [0]:
from pyspark.sql.types import *
from pyspark.sql import Row
from datetime import date
import random

# -------------------------------
# 1. Student DataFrame
# -------------------------------
# Schema assembled field by field: StudentID is declared non-nullable, while
# the name and age columns are allowed to be null.
student_schema = (
    StructType()
    .add("StudentID", IntegerType(), False)
    .add("StudentName", StringType(), True)
    .add("StudentAge", IntegerType(), True)
)

# One tuple per student, in schema column order:
# (StudentID, StudentName, StudentAge).
student_data = [
    (1, "Alice", 34),
    (2, "Bob", 45),
    (3, "Charlie", 29),
    (4, "Shamas", 40),
]

# NOTE(review): `spark` is not defined in this notebook; presumably it is the
# SparkSession injected by the notebook runtime -- confirm.
df_student = spark.createDataFrame(data=student_data, schema=student_schema)

# -------------------------------
# 2. Course DataFrame
# -------------------------------
# CourseID is the non-nullable key; both descriptive columns are nullable
# strings.
course_schema = (
    StructType()
    .add("CourseID", IntegerType(), False)
    .add("CourseName", StringType(), True)
    .add("CourseTitle", StringType(), True)
)

# One tuple per course, in schema column order:
# (CourseID, CourseName, CourseTitle).
# NOTE(review): the CourseTitle values look like course codes rather than
# titles -- confirm the intended meaning of this column.
course_data = [
    (1, "Physics", "1111"),
    (2, "Chemistry", "2222"),
    (3, "English", "3333"),
    (4, "Computer Science", "4444"),
]

df_course = spark.createDataFrame(data=course_data, schema=course_schema)

# -------------------------------
# 3. Enrollment DataFrame
# -------------------------------
# Link table between students and courses. The *_FK columns hold StudentID and
# CourseID values and, like the key, are declared non-nullable; only the
# enrollment date may be null.
enrollment_schema = (
    StructType()
    .add("EnrollmentID", IntegerType(), False)
    .add("StudentID_FK", IntegerType(), False)
    .add("CourseID_FK", IntegerType(), False)
    .add("EnrollmentDate", DateType(), True)
)

# One tuple per enrollment, in schema column order:
# (EnrollmentID, StudentID_FK, CourseID_FK, EnrollmentDate).
# Student 3 (Charlie) and course 3 (English) have no enrollment rows, so they
# are absent from any inner join against this table.
enrollment_data = [
    (1, 1, 1, date(2023, 9, 1)),  # Alice  -> Physics
    (2, 2, 2, date(2023, 9, 2)),  # Bob    -> Chemistry
    (3, 4, 4, date(2023, 9, 4)),  # Shamas -> Computer Science
    (4, 1, 2, date(2023, 9, 5)),  # Alice  -> Chemistry
]

df_enrollment = spark.createDataFrame(
    data=enrollment_data,
    schema=enrollment_schema,
)

# -------------------------------
# 4. Score DataFrame
# -------------------------------
# Schema is declared up front so the shape of each generated row is visible
# before the generation loop. The key and the enrollment reference are
# non-nullable; semester and score may be null.
score_schema = StructType([
    StructField("ScoreID", IntegerType(), False),
    StructField("EnrollmentID_FK", IntegerType(), False),
    StructField("Semester", StringType(), True),
    StructField("Score", IntegerType(), True)
])

# Pool of semester labels from which each enrollment draws a random subset.
semesters = ["2023-Spring", "2023-Fall", "2024-Spring", "2024-Fall", "2025-Spring"]

# The enrollment IDs are already available on the driver as the first element
# of every enrollment_data tuple, so read them from there instead of running a
# Spark job (df_enrollment.collect()) just to get the same values back.
enrollment_ids = [enrollment[0] for enrollment in enrollment_data]

score_data = []
score_id = 1  # running ScoreID; after the loop it holds the next unused ID

for enrollment_id in enrollment_ids:
    # Each enrollment is scored in 2 to 4 distinct semesters
    # (random.sample draws without replacement).
    selected_semesters = random.sample(semesters, k=random.randint(2, 4))
    for sem in selected_semesters:
        score_data.append(Row(
            ScoreID=score_id,
            EnrollmentID_FK=enrollment_id,
            Semester=sem,
            Score=random.randint(60, 100)  # inclusive bounds: 60..100
        ))
        score_id += 1

# The random generator is not seeded, so the generated scores differ between
# runs of this cell.
df_score = spark.createDataFrame(score_data, schema=score_schema)

In [0]:
# Expose each DataFrame to Spark SQL under a view name so the %sql cells can
# query it. createOrReplaceTempView overwrites an existing view of the same
# name, so re-running this cell is safe.
for _view_name, _frame in (
    ("student", df_student),
    ("course", df_course),
    ("enrollment", df_enrollment),
    ("score", df_score),
):
    _frame.createOrReplaceTempView(_view_name)


In [0]:
%sql
-- 1. Enrollment count per student.
-- Every row of `enrollment` is one (student, course) registration, so counting
-- rows per StudentID_FK yields that student's number of enrollments.
-- Students without any enrollment row do not appear in the result.
SELECT
    enr.StudentID_FK,
    COUNT(1) AS TotalEnrollments
FROM enrollment AS enr
GROUP BY enr.StudentID_FK;

In [0]:
%sql
-- 2. Average score per student.
-- Scores reference enrollments, and enrollments reference students, so the
-- join attaches a StudentID_FK to every score row. The average is taken over
-- all of a student's score rows across all of their enrollments.
SELECT
    enr.StudentID_FK,
    AVG(sc.Score) AS AvgScore
FROM enrollment AS enr
JOIN score AS sc
    ON enr.EnrollmentID = sc.EnrollmentID_FK
GROUP BY enr.StudentID_FK;

In [0]:
%sql
-- 3. Best score per semester.
-- Highest Score recorded in each semester, across all enrollments.
SELECT
    sc.Semester,
    MAX(sc.Score) AS BestScore
FROM score AS sc
GROUP BY sc.Semester;

In [0]:
%sql
-- 4. Number of unique students per course.
-- Inner joins keep only courses that have at least one enrollment whose
-- student exists in `student`; courses with no enrollments are not listed.
-- Rows are grouped by CourseName, so courses sharing a name would be
-- counted together.
SELECT
    crs.CourseName,
    COUNT(DISTINCT stu.StudentID) AS UniqueStudents
FROM course AS crs
JOIN enrollment AS enr
    ON enr.CourseID_FK = crs.CourseID
JOIN student AS stu
    ON stu.StudentID = enr.StudentID_FK
GROUP BY crs.CourseName;