# Ex-2200 - types of joins


In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain the Spark session for this notebook (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName("JoinExamples")
    .getOrCreate()
)

# Left-hand table for the join examples: one (EmployeeID, Name, Department) tuple per employee.
employees_data = [
    (1, "Alice", "HR"),
    (2, "Bob", "Finance"),
    (3, "Charlie", "IT"),
    (4, "David", "HR"),
    (5, "Emma", "Finance"),
]

# Column names are given positionally matching the tuples; types are inferred from the data.
df_employees = spark.createDataFrame(
    employees_data,
    schema=["EmployeeID", "Name", "Department"],
)
df_employees.show()

+----------+-------+----------+
|EmployeeID|   Name|Department|
+----------+-------+----------+
|         1|  Alice|        HR|
|         2|    Bob|   Finance|
|         3|Charlie|        IT|
|         4|  David|        HR|
|         5|   Emma|   Finance|
+----------+-------+----------+



In [12]:
# Right-hand table for the join examples: one (DeptCode, DeptName) tuple per department.
# "Sales" has no employee in df_employees, so it only appears in right/full join results.
departments_data = [
    ("HR", "Human Resources"),
    ("Finance", "Financial Services"),
    ("IT", "Information Technology"),
    ("Sales", "Sales & Marketing"),
]

df_departments = spark.createDataFrame(
    departments_data,
    schema=["DeptCode", "DeptName"],
)
df_departments.show()

+--------+--------------------+
|DeptCode|            DeptName|
+--------+--------------------+
|      HR|     Human Resources|
| Finance|  Financial Services|
|      IT|Information Techn...|
|   Sales|   Sales & Marketing|
+--------+--------------------+



In [13]:
# Inner join: keep only the employee/department pairs where Department equals DeptCode.
df_inner = df_employees.join(
    df_departments,
    on=df_employees["Department"] == df_departments["DeptCode"],
    how="inner",
)
df_inner.show()

+----------+-------+----------+--------+--------------------+
|EmployeeID|   Name|Department|DeptCode|            DeptName|
+----------+-------+----------+--------+--------------------+
|         2|    Bob|   Finance| Finance|  Financial Services|
|         5|   Emma|   Finance| Finance|  Financial Services|
|         1|  Alice|        HR|      HR|     Human Resources|
|         4|  David|        HR|      HR|     Human Resources|
|         3|Charlie|        IT|      IT|Information Techn...|
+----------+-------+----------+--------+--------------------+



In [14]:
# Left (outer) join: keep every employee; an employee without a matching DeptCode would
# get NULL department columns (none occur with this sample data).
df_left = df_employees.join(
    df_departments,
    on=df_employees["Department"] == df_departments["DeptCode"],
    how="left",
)
df_left.show()

+----------+-------+----------+--------+--------------------+
|EmployeeID|   Name|Department|DeptCode|            DeptName|
+----------+-------+----------+--------+--------------------+
|         1|  Alice|        HR|      HR|     Human Resources|
|         2|    Bob|   Finance| Finance|  Financial Services|
|         4|  David|        HR|      HR|     Human Resources|
|         5|   Emma|   Finance| Finance|  Financial Services|
|         3|Charlie|        IT|      IT|Information Techn...|
+----------+-------+----------+--------+--------------------+



In [15]:
# Right (outer) join: keep every department; "Sales" has no employees, so its
# employee columns come back as NULL.
df_right = df_employees.join(
    df_departments,
    on=df_employees["Department"] == df_departments["DeptCode"],
    how="right",
)
df_right.show()

+----------+-------+----------+--------+--------------------+
|EmployeeID|   Name|Department|DeptCode|            DeptName|
+----------+-------+----------+--------+--------------------+
|         4|  David|        HR|      HR|     Human Resources|
|         1|  Alice|        HR|      HR|     Human Resources|
|         5|   Emma|   Finance| Finance|  Financial Services|
|         2|    Bob|   Finance| Finance|  Financial Services|
|      NULL|   NULL|      NULL|   Sales|   Sales & Marketing|
|         3|Charlie|        IT|      IT|Information Techn...|
+----------+-------+----------+--------+--------------------+



In [16]:
# Full (outer) join: keep unmatched rows from both sides, padding the missing side with NULLs.
# With this sample data only the "Sales" department is unmatched.
df_full = df_employees.join(
    df_departments,
    on=df_employees["Department"] == df_departments["DeptCode"],
    how="full",
)
df_full.show()

+----------+-------+----------+--------+--------------------+
|EmployeeID|   Name|Department|DeptCode|            DeptName|
+----------+-------+----------+--------+--------------------+
|         2|    Bob|   Finance| Finance|  Financial Services|
|         5|   Emma|   Finance| Finance|  Financial Services|
|         1|  Alice|        HR|      HR|     Human Resources|
|         4|  David|        HR|      HR|     Human Resources|
|         3|Charlie|        IT|      IT|Information Techn...|
|      NULL|   NULL|      NULL|   Sales|   Sales & Marketing|
+----------+-------+----------+--------+--------------------+



In [17]:
# Cross join: Cartesian product with no join condition, pairing every employee with
# every department (5 employees x 4 departments = 20 rows).
df_cross = df_employees.crossJoin(other=df_departments)
df_cross.show()

+----------+-------+----------+--------+--------------------+
|EmployeeID|   Name|Department|DeptCode|            DeptName|
+----------+-------+----------+--------+--------------------+
|         1|  Alice|        HR|      HR|     Human Resources|
|         1|  Alice|        HR| Finance|  Financial Services|
|         2|    Bob|   Finance|      HR|     Human Resources|
|         2|    Bob|   Finance| Finance|  Financial Services|
|         1|  Alice|        HR|      IT|Information Techn...|
|         1|  Alice|        HR|   Sales|   Sales & Marketing|
|         2|    Bob|   Finance|      IT|Information Techn...|
|         2|    Bob|   Finance|   Sales|   Sales & Marketing|
|         3|Charlie|        IT|      HR|     Human Resources|
|         3|Charlie|        IT| Finance|  Financial Services|
|         4|  David|        HR|      HR|     Human Resources|
|         4|  David|        HR| Finance|  Financial Services|
|         5|   Emma|   Finance|      HR|     Human Resources|
|       

In [18]:
# Left semi join: keep employees that have a matching department, returning only the
# employee columns (no columns from df_departments are added).
df_semi = df_employees.join(
    df_departments,
    on=df_employees["Department"] == df_departments["DeptCode"],
    how="leftsemi",
)
df_semi.show()

+----------+-------+----------+
|EmployeeID|   Name|Department|
+----------+-------+----------+
|         2|    Bob|   Finance|
|         5|   Emma|   Finance|
|         1|  Alice|        HR|
|         4|  David|        HR|
|         3|Charlie|        IT|
+----------+-------+----------+



In [19]:
# Left anti join: keep employees that have NO matching department. The result is empty
# here because every Department value in df_employees also appears as a DeptCode.
df_anti = df_employees.join(
    df_departments,
    on=df_employees["Department"] == df_departments["DeptCode"],
    how="leftanti",
)
df_anti.show()

+----------+----+----------+
|EmployeeID|Name|Department|
+----------+----+----------+
+----------+----+----------+

