## Writing effective join conditions

In [1]:
# Create Spark Session

from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) a session named "Join Condition"
# that runs against the local[*] master
spark = (
    SparkSession.builder
    .appName("Join Condition")
    .master("local[*]")
    .getOrCreate()
)

# Bare last expression so the notebook displays the session as cell output
spark

In [3]:
# Create the raw datasets (plain Python lists) used by the join examples
# Employee rows, one inner list per employee, ordered as in _emp_schema
_emp_data = [
    ["Subham", "D01", 5000, 1],
    ["Rakesh", "D02", 6000, 0],
    ["Rohit", "D02", 7000, 1],
    ["Dinesh", "D03", 10000, 1]
]

# Employee column names; `active` holds 1 or 0 in the rows above
# (presumably a flag where 1 means active -- TODO confirm)
_emp_schema = ["name", "dept_id", "salary", "active"]

# Department rows ordered as in _dept_schema; the ids here are the same
# values that appear in the employees' `dept_id` column
_dept_data = [
    ["D01", "Economics"],
    ["D02", "Science"],
    ["D03", "Arts"]
]

# Department column names.
# NOTE: "name" is also a column in _emp_schema, so a frame that joins both
# datasets ends up with two columns called "name".
_dept_schema = ["id", "name"]


In [4]:
# Turn the raw lists into Spark DataFrames (data first, column names second)
# Employee DataFrame
df_emp = spark.createDataFrame(_emp_data, _emp_schema)

# Department DataFrame
df_dept = spark.createDataFrame(_dept_data, _dept_schema)

# Preview both frames: employees first, then departments
df_emp.show()
df_dept.show()

+------+-------+------+------+
|  name|dept_id|salary|active|
+------+-------+------+------+
|Subham|    D01|  5000|     1|
|Rakesh|    D02|  6000|     0|
| Rohit|    D02|  7000|     1|
|Dinesh|    D03| 10000|     1|
+------+-------+------+------+

+---+---------+
| id|     name|
+---+---------+
|D01|Economics|
|D02|  Science|
|D03|     Arts|
+---+---------+



In [11]:
# Join datasets
# Condition 1 = left outer join Emp -> Dept on matching dept id AND active == 1

# Every entry of the list has to hold for a department row to be attached;
# employees without a match are kept with null department columns
join_cond = [
    df_emp["dept_id"] == df_dept["id"],
    df_emp["active"] == 1,
]

df_join_1 = df_emp.join(other=df_dept, on=join_cond, how="left_outer")
df_join_1.show()

+------+-------+------+------+----+---------+
|  name|dept_id|salary|active|  id|     name|
+------+-------+------+------+----+---------+
|Subham|    D01|  5000|     1| D01|Economics|
|Rakesh|    D02|  6000|     0|null|     null|
| Rohit|    D02|  7000|     1| D02|  Science|
|Dinesh|    D03| 10000|     1| D03|     Arts|
+------+-------+------+------+----+---------+



In [12]:
# Condition 2 = left outer join Emp -> Dept on dept id only
# Reuses just the first entry (the dept id equality) of the join_cond list
# built in the previous cell

df_join_2 = df_emp.join(
    other=df_dept,
    on=join_cond[0],
    how="left_outer",
)
df_join_2.show()

+------+-------+------+------+---+---------+
|  name|dept_id|salary|active| id|     name|
+------+-------+------+------+---+---------+
|Subham|    D01|  5000|     1|D01|Economics|
|Rakesh|    D02|  6000|     0|D02|  Science|
| Rohit|    D02|  7000|     1|D02|  Science|
|Dinesh|    D03| 10000|     1|D03|     Arts|
+------+-------+------+------+---+---------+



In [14]:
# Condition 3 = left outer join Emp -> Dept on
#               dept id match AND (active == 1 OR salary > 5000)

# The OR is a single list entry, so it is evaluated as one unit and then
# combined with the dept id equality
join_cond = [
    df_emp["dept_id"] == df_dept["id"],
    (df_emp["active"] == 1) | (df_emp["salary"] > 5000),
]

df_join_3 = df_emp.join(other=df_dept, on=join_cond, how="left_outer")
df_join_3.show()

+------+-------+------+------+---+---------+
|  name|dept_id|salary|active| id|     name|
+------+-------+------+------+---+---------+
|Subham|    D01|  5000|     1|D01|Economics|
|Rakesh|    D02|  6000|     0|D02|  Science|
| Rohit|    D02|  7000|     1|D02|  Science|
|Dinesh|    D03| 10000|     1|D03|     Arts|
+------+-------+------+------+---+---------+



In [20]:
# Condition 4 = left outer join Emp -> Dept on
#               dept id match AND active == 1 AND salary > 5000

# Three separate list entries; all of them must hold for a match
join_cond = [
    df_emp["dept_id"] == df_dept["id"],
    df_emp["active"] == 1,
    df_emp["salary"] > 5000,
]

df_join_4 = df_emp.join(other=df_dept, on=join_cond, how="left_outer")
df_join_4.show()

+------+-------+------+------+----+-------+
|  name|dept_id|salary|active|  id|   name|
+------+-------+------+------+----+-------+
|Subham|    D01|  5000|     1|null|   null|
|Rakesh|    D02|  6000|     0|null|   null|
| Rohit|    D02|  7000|     1| D02|Science|
|Dinesh|    D03| 10000|     1| D03|   Arts|
+------+-------+------+------+----+-------+



In [23]:
# Condition 5 = left outer join Emp -> Dept on dept id match AND active == 1
# The full three-entry list is defined, but only its first two entries
# (dept id equality and the active check) are passed to the join
join_cond = [
    df_emp["dept_id"] == df_dept["id"],
    df_emp["active"] == 1,
    df_emp["salary"] > 5000,
]

df_join_5 = df_emp.join(
    other=df_dept,
    on=join_cond[:2],
    how="left_outer",
)
df_join_5.show()

+------+-------+------+------+----+---------+
|  name|dept_id|salary|active|  id|     name|
+------+-------+------+------+----+---------+
|Subham|    D01|  5000|     1| D01|Economics|
|Rakesh|    D02|  6000|     0|null|     null|
| Rohit|    D02|  7000|     1| D02|  Science|
|Dinesh|    D03| 10000|     1| D03|     Arts|
+------+-------+------+------+----+---------+

