- **Name:** 001_Introduction
- **Author:** Shamas Imran
- **Description:** Implementing joins between dataframes

In [0]:

from pyspark.sql.types import *
from datetime import date


# Student table. StudentID is declared non-nullable; name and age may be null.
student_schema = (
    StructType()
    .add('StudentID', IntegerType(), False)
    .add('StudentName', StringType(), True)
    .add('StudentAge', IntegerType(), True)
)

# Rows are (StudentID, StudentName, StudentAge).
student_data = [
    (1, "Alice", 34),
    (2, "Bob", 45),
    (3, "Charlie", 29),
    (4, "Shamas", 40),
]

df_student = spark.createDataFrame(data=student_data, schema=student_schema)


# Course table. CourseID is declared non-nullable; the two string columns may be null.
course_schema = (
    StructType()
    .add('CourseID', IntegerType(), False)
    .add('CourseName', StringType(), True)
    .add('CourseTitle', StringType(), True)
)

# Rows are (CourseID, CourseName, CourseTitle); CourseTitle is stored as a string.
course_data = [
    (1, "Physics", "1111"),
    (2, "Chemistry", "2222"),
    (3, "English", "3333"),
    (4, "Computer Science", "4444"),
]

df_course = spark.createDataFrame(data=course_data, schema=course_schema)


# Enrollment table: links a student to a course on a given date.
# StudentID_FK and CourseID_FK are the join keys used against
# df_student.StudentID and df_course.CourseID in the cells below.
enrollment_schema = (
    StructType()
    .add("EnrollmentID", IntegerType(), False)
    .add("StudentID_FK", IntegerType(), False)
    .add("CourseID_FK", IntegerType(), False)
    .add("EnrollmentDate", DateType(), True)
)

# Rows are (EnrollmentID, StudentID_FK, CourseID_FK, EnrollmentDate).
enrollment_data = [
    (1, 1, 1, date(2023, 9, 1)),   # Alice -> Physics
    (2, 2, 2, date(2023, 9, 2)),   # Bob -> Chemistry
    (3, 4, 4, date(2023, 9, 4)),   # Shamas -> Computer Science
    (4, 1, 2, date(2023, 9, 5)),   # Alice -> Chemistry
]

df_enrollment = spark.createDataFrame(data=enrollment_data, schema=enrollment_schema)


In [0]:
# INNER JOIN: keep only enrollment rows that match both a student and a course,
# then show each enrollment with the student's name and the course name.
s = df_student.alias("s")
c = df_course.alias("c")
e = df_enrollment.alias("e")

df_inner = (
    e.join(s, on=e["StudentID_FK"] == s["StudentID"], how="inner")
     .join(c, on=e["CourseID_FK"] == c["CourseID"], how="inner")
)

df_inner.select(
    "e.EnrollmentID",
    "s.StudentName",
    "c.CourseName",
    "e.EnrollmentDate",
).show()

In [0]:
# LEFT JOIN: start from students and keep every one of them. A student with no
# enrollment still appears, with null in the enrollment and course columns.
s = df_student.alias("s")
c = df_course.alias("c")
e = df_enrollment.alias("e")

df_left = (
    s.join(e, on=s["StudentID"] == e["StudentID_FK"], how="left")
     .join(c, on=e["CourseID_FK"] == c["CourseID"], how="left")
)

df_left.select(
    "s.StudentName",
    "c.CourseName",
    "e.EnrollmentDate",
).show()

In [0]:
# RIGHT JOIN (chained twice):
#   1. students RIGHT JOIN enrollments keeps every enrollment; StudentName would
#      be null if an enrollment had no matching student (hypothetical here).
#   2. that result RIGHT JOIN courses keeps every course, so a course with no
#      enrollment (English in this data) appears with null EnrollmentID,
#      StudentName and EnrollmentDate.
s = df_student.alias("s")
c = df_course.alias("c")
e = df_enrollment.alias("e")

df_right = (
    s.join(e, on=s["StudentID"] == e["StudentID_FK"], how="right")
     .join(c, on=e["CourseID_FK"] == c["CourseID"], how="right")
)

df_right.select(
    "e.EnrollmentID",
    "s.StudentName",
    "c.CourseName",
    "e.EnrollmentDate",
).show()


In [0]:
# FULL OUTER JOIN: keep every student and every enrollment, pairing them where
# the keys match; a row with no partner on the other side gets nulls there.
s = df_student.alias("s")
c = df_course.alias("c")  # not used in this cell; aliased as in the other cells
e = df_enrollment.alias("e")

df_full = s.join(e, on=s["StudentID"] == e["StudentID_FK"], how="outer")

df_full.select(
    "s.StudentName",
    "e.EnrollmentID",
    "e.EnrollmentDate",
).show()

In [0]:
# CROSS JOIN: Cartesian product, pairing each student with each course.
s = df_student.alias("s")
c = df_course.alias("c")
e = df_enrollment.alias("e")  # not used in this cell; aliased as in the other cells

df_cross = s.crossJoin(c)

# Only the first 10 pairs are printed, with full (untruncated) column values.
df_cross.select("s.StudentName", "c.CourseName").show(n=10, truncate=False)


In [0]:
# LEFT SEMI JOIN: keep the students that have at least one matching enrollment.
# Only the left-hand (student) columns are returned.
s = df_student.alias("s")
c = df_course.alias("c")  # not used in this cell; aliased as in the other cells
e = df_enrollment.alias("e")

df_semi = s.join(e, on=s["StudentID"] == e["StudentID_FK"], how="left_semi")

df_semi.select(
    "s.StudentID",
    "s.StudentName",
    "s.StudentAge",
).show()

In [0]:
# LEFT ANTI JOIN: keep the students that have no matching enrollment at all.
# Only the left-hand (student) columns are returned.
s = df_student.alias("s")
c = df_course.alias("c")  # not used in this cell; aliased as in the other cells
e = df_enrollment.alias("e")

df_anti = s.join(e, on=s["StudentID"] == e["StudentID_FK"], how="left_anti")

df_anti.select(
    "s.StudentID",
    "s.StudentName",
    "s.StudentAge",
).show()
