# Problem Statement
Write a solution to find the students who meet the following criteria:

Have taken all mandatory courses and at least two elective courses offered in their major.
Achieved a grade of A in every mandatory course and a grade of B or better in every elective course they took in their major.
Maintained an average GPA of at least 2.5 across all their courses (including those outside their major).
Return the result table ordered by student_id in ascending order.

### PySpark

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sum, when

# Get the Spark session for this notebook (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName("University Data Analysis")
    .getOrCreate()
)

# students rows: (student_id, name, major)
students_data = [
    (1, 'Alice', 'Computer Science'),
    (2, 'Bob', 'Computer Science'),
    (3, 'Charlie', 'Mathematics'),
    (4, 'David', 'Mathematics'),
]

# courses rows: (course_id, name, credits, major, mandatory)
# `mandatory` is the string 'yes' for required courses and 'no' for electives.
courses_data = [
    (101, 'Algorithms', 3, 'Computer Science', 'yes'),
    (102, 'Data Structures', 3, 'Computer Science', 'yes'),
    (103, 'Calculus', 4, 'Mathematics', 'yes'),
    (104, 'Linear Algebra', 4, 'Mathematics', 'yes'),
    (105, 'Machine Learning', 3, 'Computer Science', 'no'),
    (106, 'Probability', 3, 'Mathematics', 'no'),
    (107, 'Operating Systems', 3, 'Computer Science', 'no'),
    (108, 'Statistics', 3, 'Mathematics', 'no'),
]

# enrollments rows: (student_id, course_id, semester, grade, GPA)
enrollments_data = [
    (1, 101, 'Fall 2023', 'A', 4.0),
    (1, 102, 'Spring 2023', 'A', 4.0),
    (1, 105, 'Spring 2023', 'A', 4.0),
    (1, 107, 'Fall 2023', 'B', 3.5),
    (2, 101, 'Fall 2023', 'A', 4.0),
    (2, 102, 'Spring 2023', 'B', 3.0),
    (3, 103, 'Fall 2023', 'A', 4.0),
    (3, 104, 'Spring 2023', 'A', 4.0),
    (3, 106, 'Spring 2023', 'A', 4.0),
    (3, 108, 'Fall 2023', 'B', 3.5),
    (4, 103, 'Fall 2023', 'B', 3.0),
    (4, 104, 'Spring 2023', 'B', 3.0),
]

# Turn each list of tuples into a DataFrame; column types are inferred from
# the Python values, column names come from the schema lists.
students_df = spark.createDataFrame(
    students_data,
    schema=["student_id", "name", "major"],
)
courses_df = spark.createDataFrame(
    courses_data,
    schema=["course_id", "name", "credits", "major", "mandatory"],
)
enrollments_df = spark.createDataFrame(
    enrollments_data,
    schema=["student_id", "course_id", "semester", "grade", "GPA"],
)

In [0]:
# Render the three input tables: students, then courses, then enrollments.
for input_table in (students_df, courses_df, enrollments_df):
    input_table.display()


student_id,name,major
1,Alice,Computer Science
2,Bob,Computer Science
3,Charlie,Mathematics
4,David,Mathematics


course_id,name,credits,major,mandatory
101,Algorithms,3,Computer Science,yes
102,Data Structures,3,Computer Science,yes
103,Calculus,4,Mathematics,yes
104,Linear Algebra,4,Mathematics,yes
105,Machine Learning,3,Computer Science,no
106,Probability,3,Mathematics,no
107,Operating Systems,3,Computer Science,no
108,Statistics,3,Mathematics,no


student_id,course_id,semester,grade,GPA
1,101,Fall 2023,A,4.0
1,102,Spring 2023,A,4.0
1,105,Spring 2023,A,4.0
1,107,Fall 2023,B,3.5
2,101,Fall 2023,A,4.0
2,102,Spring 2023,B,3.0
3,103,Fall 2023,A,4.0
3,104,Spring 2023,A,4.0
3,106,Spring 2023,A,4.0
3,108,Fall 2023,B,3.5
4,103,Fall 2023,B,3.0
4,104,Spring 2023,B,3.0


In [0]:

# Step 1: keep only students whose mean GPA over all of their enrollment rows
# (every course, not just those in their major) is at least 2.5.
gpa_per_student = enrollments_df.groupBy("student_id").agg(avg("GPA").alias("avg_GPA"))
filtered_students = gpa_per_student.filter(col("avg_GPA") >= 2.5)

# Step 2: pair each remaining student with every course offered in their major,
# then attach the matching enrollment. The left join keeps courses the student
# never enrolled in; those rows carry a null grade.
major_courses = students_df.join(courses_df, students_df.major == courses_df.major)
student_courses = (
    major_courses
    .join(filtered_students, "student_id")
    .join(enrollments_df, ["student_id", "course_id"], "left")
)


def _count_where(condition):
    """Return an aggregate that counts the rows for which `condition` is true."""
    # A null condition (e.g. a comparison against a null grade) counts as 0.
    return sum(when(condition, 1).otherwise(0))


# Reusable row predicates for the per-student tallies.
is_mandatory = col("mandatory") == 'yes'
is_elective = col("mandatory") == 'no'
was_taken = col("grade").isNotNull()

# Step 3: tally, per student, the mandatory and elective courses in the major.
course_counts = student_courses.groupBy("student_id").agg(
    _count_where(is_mandatory & (col("grade") == 'A')).alias("mandatory_A_count"),
    _count_where(is_mandatory).alias("mandatory_count"),
    _count_where(is_elective & was_taken).alias("non_mandatory_count"),
    _count_where(is_elective & col("grade").isin("A", "B")).alias("non_mandatory_AB_count"),
)

# Step 4: a student qualifies when
#   - every mandatory course in the major has an A (untaken ones have no A),
#   - at least two electives in the major were taken, and
#   - every elective taken was graded A or B.
qualifies = (
    (col("mandatory_A_count") == col("mandatory_count"))
    & (col("non_mandatory_count") >= 2)
    & (col("non_mandatory_count") == col("non_mandatory_AB_count"))
)
result_df = course_counts.filter(qualifies).select("student_id").orderBy("student_id")
result_df.display()


student_id
1
3


In [0]:
# Register each DataFrame as a temp view so SQL can query it by table name.
for view_name, source_df in (
    ("students", students_df),
    ("courses", courses_df),
    ("enrollments", enrollments_df),
):
    source_df.createOrReplaceTempView(view_name)

### Spark SQL

In [0]:
%sql
-- eligible_gpa: students whose average GPA over all their enrollments is >= 2.5.
WITH eligible_gpa AS (
    SELECT student_id
    FROM enrollments
    GROUP BY student_id
    HAVING AVG(GPA) >= 2.5
),
-- course_tally: for each eligible student, counts over the courses offered in
-- their major. The LEFT JOIN keeps courses the student never enrolled in
-- (e.grade is NULL for those), so an untaken mandatory course still counts
-- toward mandatory_count but not toward mandatory_a_count.
course_tally AS (
    SELECT
        g.student_id,
        SUM(CASE WHEN c.mandatory = 'yes' AND e.grade = 'A' THEN 1 ELSE 0 END) AS mandatory_a_count,
        SUM(CASE WHEN c.mandatory = 'yes' THEN 1 ELSE 0 END) AS mandatory_count,
        SUM(CASE WHEN c.mandatory = 'no' AND e.grade IS NOT NULL THEN 1 ELSE 0 END) AS elective_taken_count,
        SUM(CASE WHEN c.mandatory = 'no' AND e.grade IN ('A', 'B') THEN 1 ELSE 0 END) AS elective_ab_count
    FROM eligible_gpa g
    JOIN students s ON s.student_id = g.student_id
    JOIN courses c ON c.major = s.major
    LEFT JOIN enrollments e
        ON e.student_id = g.student_id
       AND e.course_id = c.course_id
    GROUP BY g.student_id
)
-- Keep students with an A in every mandatory course, at least two electives
-- taken, and an A or B in every elective taken.
SELECT student_id
FROM course_tally
WHERE mandatory_a_count = mandatory_count
  AND elective_taken_count >= 2
  AND elective_taken_count = elective_ab_count
ORDER BY student_id;


student_id
1
3
