In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, max, min, count

# Obtain the Spark session for this notebook (application name "StudentsCourses").
# getOrCreate() hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("StudentsCourses")
    .getOrCreate()
)

# Students table: one tuple per student, in the column order given by students_cols.
# Only column names are supplied, so Spark infers the column types from the rows.
students_cols = ["student_id", "name", "age", "city"]
students_data = [
    (1, "Rahul Sharma", 20, "Bangalore"),
    (2, "Priya Singh", 21, "Delhi"),
    (3, "Aman Kumar", 19, "Hyderabad"),
    (4, "Sneha Reddy", 22, "Chennai"),
    (5, "Arjun Mehta", 23, "Mumbai"),
    (6, "Divya Nair", 20, None),  # city deliberately missing -> NULL in the DataFrame
]
students_df = spark.createDataFrame(data=students_data, schema=students_cols)

# Courses table: one tuple per course, in the column order given by courses_cols.
courses_cols = ["course_id", "course_name", "category"]
courses_data = [
    (101, "Python", "Programming"),
    (102, "Data Science", "Analytics"),
    (103, "Databases", "Technology"),
    (104, "Business Studies", "Management"),
]
courses_df = spark.createDataFrame(data=courses_data, schema=courses_cols)

# Enrollment table linking a student_id to a course_id with the grade obtained.
enrollment_cols = ["student_id", "course_id", "grade"]
enrollment_data = [
    (1, 101, "A"),
    (2, 101, "B"),
    (3, 102, "A"),
    (4, 103, "C"),
    (5, 102, "B"),
    (7, 104, "A"),  # student_id 7 has no row in students_data
]
enrollment_df = spark.createDataFrame(data=enrollment_data, schema=enrollment_cols)

# Projection: keep only the name and city columns.
print("Student names and cities:")
students_df.select(["name", "city"]).show()

# Row filter: age strictly greater than 20.
print("Students older than 20:")
students_df.where(students_df["age"] > 20).show()

# Row filter: exact match on the category column.
print("Courses under Analytics category:")
courses_df.where(courses_df["category"] == "Analytics").show()



Student names and cities:
+------------+---------+
|        name|     city|
+------------+---------+
|Rahul Sharma|Bangalore|
| Priya Singh|    Delhi|
|  Aman Kumar|Hyderabad|
| Sneha Reddy|  Chennai|
| Arjun Mehta|   Mumbai|
|  Divya Nair|     NULL|
+------------+---------+

Students older than 20:
+----------+-----------+---+-------+
|student_id|       name|age|   city|
+----------+-----------+---+-------+
|         2|Priya Singh| 21|  Delhi|
|         4|Sneha Reddy| 22|Chennai|
|         5|Arjun Mehta| 23| Mumbai|
+----------+-----------+---+-------+

Courses under Analytics category:
+---------+------------+---------+
|course_id| course_name| category|
+---------+------------+---------+
|      102|Data Science|Analytics|
+---------+------------+---------+



In [2]:
# Number of enrollment rows (non-null student_id) for each course_id.
print("Students enrolled in each course:")
(
    enrollment_df
    .groupBy(col("course_id"))
    .agg(count(col("student_id")).alias("student_count"))
    .show()
)

# Mean age per city; students without a city form their own NULL group.
print("Average age of students per city:")
(
    students_df
    .groupBy(col("city"))
    .agg(avg(col("age")).alias("avg_age"))
    .show()
)

# Global aggregate over every student. `max` and `min` here are the
# pyspark.sql.functions versions imported at the top, not the Python builtins.
print("Max and Min age of students:")
(
    students_df
    .groupBy()
    .agg(
        max(col("age")).alias("max_age"),
        min(col("age")).alias("min_age"),
    )
    .show()
)


Students enrolled in each course:
+---------+-------------+
|course_id|student_count|
+---------+-------------+
|      101|            2|
|      102|            2|
|      103|            1|
|      104|            1|
+---------+-------------+

Average age of students per city:
+---------+-------+
|     city|avg_age|
+---------+-------+
|Bangalore|   20.0|
|    Delhi|   21.0|
|Hyderabad|   19.0|
|  Chennai|   22.0|
|     NULL|   20.0|
|   Mumbai|   23.0|
+---------+-------+

Max and Min age of students:
+-------+-------+
|max_age|min_age|
+-------+-------+
|     23|     19|
+-------+-------+



In [3]:

# Inner joins: students -> enrollments -> courses. Rows without a match on
# either side are dropped.
print("Students with their enrolled courses:")
students_courses = (
    students_df
    .join(enrollment_df, on="student_id", how="inner")
    .join(courses_df, on="course_id", how="inner")
)
students_courses.select(["student_id", "name", "course_name", "grade"]).show()

# Left join: every enrollment row is kept, with course details attached where found.
print("Left join enrollments with courses:")
enrollment_courses = enrollment_df.join(courses_df, on="course_id", how="left")
enrollment_courses.show()

# Anti join: students whose student_id never appears in the enrollments.
print("Students not enrolled in any course:")
students_not_enrolled = students_df.join(
    enrollment_df, on="student_id", how="left_anti"
)
students_not_enrolled.show()

# Anti join: courses whose course_id never appears in the enrollments.
print("Courses with no students enrolled:")
courses_not_enrolled = courses_df.join(
    enrollment_df, on="course_id", how="left_anti"
)
courses_not_enrolled.show()



Students with their enrolled courses:
+----------+------------+------------+-----+
|student_id|        name| course_name|grade|
+----------+------------+------------+-----+
|         2| Priya Singh|      Python|    B|
|         1|Rahul Sharma|      Python|    A|
|         5| Arjun Mehta|Data Science|    B|
|         3|  Aman Kumar|Data Science|    A|
|         4| Sneha Reddy|   Databases|    C|
+----------+------------+------------+-----+

Left join enrollments with courses:
+---------+----------+-----+----------------+-----------+
|course_id|student_id|grade|     course_name|   category|
+---------+----------+-----+----------------+-----------+
|      101|         1|    A|          Python|Programming|
|      101|         2|    B|          Python|Programming|
|      102|         3|    A|    Data Science|  Analytics|
|      103|         4|    C|       Databases| Technology|
|      104|         7|    A|Business Studies| Management|
|      102|         5|    B|    Data Science|  Analytics

In [4]:
# Expose each DataFrame to spark.sql() under a table name
# (an existing view with the same name is replaced).
for view_name, frame in (
    ("students", students_df),
    ("courses", courses_df),
    ("enrollments", enrollment_df),
):
    frame.createOrReplaceTempView(view_name)

# SQL counterpart of the students -> enrollments -> courses inner join.
students_courses_sql = """
    SELECT s.student_id, s.name, c.course_name, e.grade
    FROM students s
    JOIN enrollments e ON s.student_id = e.student_id
    JOIN courses c ON e.course_id = c.course_id
"""
print("SQL - All students with course names and grades:")
spark.sql(students_courses_sql).show()

# Count grade-'A' enrollment rows per course name; courses with no 'A' grade
# do not appear in the result.
grade_a_counts_sql = """
    SELECT c.course_name, COUNT(*) AS num_A_students
    FROM enrollments e
    JOIN courses c ON e.course_id = c.course_id
    WHERE e.grade = 'A'
    GROUP BY c.course_name
"""
print("SQL - Number of students who got grade A in each course:")
spark.sql(grade_a_counts_sql).show()

# Find the city with the largest number of enrolled students.
#  - COUNT(DISTINCT s.student_id): the join yields one row per enrollment, so
#    COUNT(*) would count a student once for every course they take.
#  - Students with a NULL city are excluded from the ranking.
#  - The secondary sort on city makes LIMIT 1 deterministic when several
#    cities tie on the count (otherwise any tied city may be returned).
print("SQL - Top city with most students enrolled:")
spark.sql("""
    SELECT s.city, COUNT(DISTINCT s.student_id) AS total_students
    FROM students s
    JOIN enrollments e ON s.student_id = e.student_id
    WHERE s.city IS NOT NULL
    GROUP BY s.city
    ORDER BY total_students DESC, s.city ASC
    LIMIT 1
""").show()

SQL - All students with course names and grades:
+----------+------------+------------+-----+
|student_id|        name| course_name|grade|
+----------+------------+------------+-----+
|         2| Priya Singh|      Python|    B|
|         1|Rahul Sharma|      Python|    A|
|         5| Arjun Mehta|Data Science|    B|
|         3|  Aman Kumar|Data Science|    A|
|         4| Sneha Reddy|   Databases|    C|
+----------+------------+------------+-----+

SQL - Number of students who got grade A in each course:
+----------------+--------------+
|     course_name|num_A_students|
+----------------+--------------+
|Business Studies|             1|
|          Python|             1|
|    Data Science|             1|
+----------------+--------------+

SQL - Top city with most students enrolled:
+---------+--------------+
|     city|total_students|
+---------+--------------+
|Bangalore|             1|
+---------+--------------+

