## Joins

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import struct
from pyspark.sql.types import StructField,StructType,StringType,IntegerType



# Create a Spark session
spark = SparkSession.builder.appName("day3").getOrCreate()



Question: You are given two DataFrames: employees_df and departments_df, which contain information about employees and their respective departments. The schema for the DataFrames is as follows:

employees_df schema:
|-- employee_id: integer (nullable = true)
|-- employee_name: string (nullable = true)
|-- department_id: integer (nullable = true)

departments_df schema:

|-- department_id: integer (nullable = true)
|-- department_name: string (nullable = true)

Employees DataFrame:
                                                                                
+-----------+-------------+-------------+
|employee_id|employee_name|department_id|
+-----------+-------------+-------------+
|1          |Pallavi mam  |101          |
|2          |Bob          |102          |
|3          |Cathy        |101          |
|4          |David        |103          |
|5          |Amrit Sir    |104          |
|6          |Alice        |null         |
|7          |Eva          |null         |
|8          |Frank        |110          |
|9          |Grace        |109          |
|10         |Henry        |null         |
+-----------+-------------+-------------+



Departments DataFrame:
+-------------+------------------------+
|department_id|department_name         |
+-------------+------------------------+
|101          |HR                      |
|102          |Engineering             |
|103          |Finance                 |
|104          |Marketing               |
|105          |Operations              |
|106          |null                    |
|107          |Operations              |
|108          |Production              |
|null         |Finance                 |
|110          |Research and Development|
+-------------+----------------------


In [10]:
# Creating the data for employees_df
employee_data = [
    (1, "Pallavi mam", 101),
    (2, "Bob", 102),
    (3, "Cathy", 101),
    (4, "David", 103),
    (5, "Amrit Sir", 104),
    (6, "Alice", None),
    (7, "Eva", None),
    (8, "Frank", 110),
    (9, "Grace", 109),
    (10, "Henry", None)
]

# Defining the schema for employees_df
employee_schema = StructType([
    StructField("Employee_Id",IntegerType(),True),
    StructField("Employee_name",StringType(),True),
    StructField("department_id",IntegerType(),True)
])

# Creating the employees DataFrame using struct
employees_df = spark.createDataFrame(employee_data, employee_schema)

# Creating the data for departments_df
department_data = [
    (101, "HR"),
    (102, "Engineering"),
    (103, "Finance"),
    (104, "Marketing"),
    (105, "Operations"),
    (106, None),
    (107, "Operations"),
    (108, "Production"),
    (None, "Finance"),
    (110, "Research and Development")
]

# Creating the departments DataFrame using struct
department_schema = StructType([
    StructField("department_id",IntegerType(),True),
    StructField("department_name",StringType(),True)
])

department_df = spark.createDataFrame(data=department_data, schema=department_schema)


print("Employees DataFrame:")
employees_df.show()

print("Departments DataFrame:")
department_df.show()


Employees DataFrame:
+-----------+-------------+-------------+
|Employee_Id|Employee_name|department_id|
+-----------+-------------+-------------+
|          1|  Pallavi mam|          101|
|          2|          Bob|          102|
|          3|        Cathy|          101|
|          4|        David|          103|
|          5|    Amrit Sir|          104|
|          6|        Alice|         null|
|          7|          Eva|         null|
|          8|        Frank|          110|
|          9|        Grace|          109|
|         10|        Henry|         null|
+-----------+-------------+-------------+

Departments DataFrame:
+-------------+--------------------+
|department_id|     department_name|
+-------------+--------------------+
|          101|                  HR|
|          102|         Engineering|
|          103|             Finance|
|          104|           Marketing|
|          105|          Operations|
|          106|                null|
|          107|          Operation

In [27]:
#CREATE TEMP TAVLE FOR EMPLOYEE AND DEPARTMENT
department_df.createOrReplaceTempView("department")
employees_df.createOrReplaceTempView("employee")


### Join Expressions

Question: How can you combine the employees_df and departments_df DataFrames based on the common "department_id" column to get a combined DataFrame with employee names and their respective department names?

In [11]:
#pyspark

joinExpression = department_df['department_id'] == employees_df['department_id']
department_df.join(employees_df, joinExpression).select(department_df['department_id'],employees_df['employee_id'],employees_df['employee_name'],department_df['department_name']).show()



+-------------+-----------+-------------+--------------------+
|department_id|employee_id|employee_name|     department_name|
+-------------+-----------+-------------+--------------------+
|          101|          1|  Pallavi mam|                  HR|
|          101|          3|        Cathy|                  HR|
|          102|          2|          Bob|         Engineering|
|          103|          4|        David|             Finance|
|          104|          5|    Amrit Sir|           Marketing|
|          110|          8|        Frank|Research and Deve...|
+-------------+-----------+-------------+--------------------+



                                                                                

In [30]:
#sql

join_sql = spark.sql("""
                    select d.department_id, e.employee_id, e.employee_name, d.department_name
                    from department d
                    join employee e
                    on d.department_id = e.department_id
""")
                     
join_sql.show()

+-------------+-----------+-------------+--------------------+
|department_id|employee_id|employee_name|     department_name|
+-------------+-----------+-------------+--------------------+
|          101|          1|  Pallavi mam|                  HR|
|          101|          3|        Cathy|                  HR|
|          102|          2|          Bob|         Engineering|
|          103|          4|        David|             Finance|
|          104|          5|    Amrit Sir|           Marketing|
|          110|          8|        Frank|Research and Deve...|
+-------------+-----------+-------------+--------------------+



### Inner Joins

Question: How can you retrieve employee names and their respective department names for employees belonging to the "Engineering" department?

In [29]:
#pyspark

#inner join expression
joinExpression = department_df['department_id'] == employees_df['department_id']
#filtering engineering department using where
department_df.join(employees_df, joinExpression).select(employees_df['employee_name'],department_df['department_name']).where(department_df['department_name']=='Engineering').show()

+-------------+---------------+
|employee_name|department_name|
+-------------+---------------+
|          Bob|    Engineering|
+-------------+---------------+



In [31]:
#sql

inner_join_sql = spark.sql("""
                           select e.employee_name, d.department_name
                           from department d
                           join employee e
                           on e.department_id = d.department_id
                           where d.department_name = 'Engineering' 
 """)
inner_join_sql.show()


+-------------+---------------+
|employee_name|department_name|
+-------------+---------------+
|          Bob|    Engineering|
+-------------+---------------+



### Outer Joins

Question: Retrieve a DataFrame that contains all employees along with their department names. If an employee doesn't have a department assigned, display "No Department".

In [19]:
#pyspark

#outer join expression
joinTYpe='outer'
Outer_join_df=department_df.join(employees_df, joinExpression, joinTYpe).select(employees_df['employee_name'],department_df['department_name'])

#fill null values with "No department"

Outer_join_df.na.fill("No department").show()

+-------------+--------------------+
|employee_name|     department_name|
+-------------+--------------------+
|No department|             Finance|
|        Alice|       No department|
|          Eva|       No department|
|        Henry|       No department|
|  Pallavi mam|                  HR|
|        Cathy|                  HR|
|          Bob|         Engineering|
|        David|             Finance|
|    Amrit Sir|           Marketing|
|No department|          Operations|
|No department|       No department|
|No department|          Operations|
|No department|          Production|
|        Grace|       No department|
|        Frank|Research and Deve...|
+-------------+--------------------+



In [32]:
#sql

outer_join_sql = spark.sql("""
                            select coalesce(e.employee_name, 'no Employee') as employee_name, 
                                   coalesce(d.department_name, 'no department') as department_name
                           from employee e
                           full outer join department d
                           on d.department_id = e.department_id
""")
outer_join_sql.show()

+-------------+--------------------+
|employee_name|     department_name|
+-------------+--------------------+
|        Alice|       no department|
|          Eva|       no department|
|        Henry|       no department|
|  no Employee|             Finance|
|  Pallavi mam|                  HR|
|        Cathy|                  HR|
|          Bob|         Engineering|
|        David|             Finance|
|    Amrit Sir|           Marketing|
|  no Employee|          Operations|
|  no Employee|       no department|
|  no Employee|          Operations|
|  no Employee|          Production|
|        Grace|       no department|
|        Frank|Research and Deve...|
+-------------+--------------------+



### Left Outer Joins

Question: List all employees along with their department names. If an employee doesn't have a department assigned, display "No Department".

In [20]:
#pyspark

joinType="left_outer"
#left joining two dfs and replacing na with "No department"
employees_df.join(department_df,joinExpression,joinType).select(employees_df['employee_name'],department_df['department_name']).na.fill('No Department').show()

+-------------+--------------------+
|employee_name|     department_name|
+-------------+--------------------+
|  Pallavi mam|                  HR|
|          Bob|         Engineering|
|        Cathy|                  HR|
|        David|             Finance|
|    Amrit Sir|           Marketing|
|        Alice|       No Department|
|          Eva|       No Department|
|        Frank|Research and Deve...|
|        Grace|       No Department|
|        Henry|       No Department|
+-------------+--------------------+



In [33]:

#SQL

left_outer_join_sql = spark.sql("""
                                    select e.employee_name,coalesce(d.department_name, 'No Department') as department_name
                                    from employee e
                                    left outer join department d
                                    on e.department_id = d.department_id       

                                """)
left_outer_join_sql.show()

+-------------+--------------------+
|employee_name|     department_name|
+-------------+--------------------+
|  Pallavi mam|                  HR|
|          Bob|         Engineering|
|        Cathy|                  HR|
|        David|             Finance|
|    Amrit Sir|           Marketing|
|        Alice|       No Department|
|          Eva|       No Department|
|        Frank|Research and Deve...|
|        Grace|       No Department|
|        Henry|       No Department|
+-------------+--------------------+



### Right Outer Joins

Question: Display a list of departments along with employee names. If a department has no employees, display "No Employees".



In [21]:
#pyspark

joinType="right_outer"
#right joining two dfs and replacing na with "No Employees"
employees_df.join(department_df,joinExpression,joinType).select(department_df['department_name'],employees_df['employee_name']).na.fill('No Employees').show()

+--------------------+-------------+
|     department_name|employee_name|
+--------------------+-------------+
|                  HR|        Cathy|
|                  HR|  Pallavi mam|
|         Engineering|          Bob|
|             Finance|        David|
|           Marketing|    Amrit Sir|
|          Operations| No Employees|
|        No Employees| No Employees|
|          Operations| No Employees|
|          Production| No Employees|
|             Finance| No Employees|
|Research and Deve...|        Frank|
+--------------------+-------------+



In [34]:
#sql

right_outer_join_sql = spark.sql("""
                                select d.department_name as department_name,
                                       coalesce(e.employee_name,'No Employees') as employee_name
                                 from employee e
                                 right outer join department d
                                 on e.department_id = d.department_id
                                """)

right_outer_join_sql.show()

+--------------------+-------------+
|     department_name|employee_name|
+--------------------+-------------+
|                  HR|        Cathy|
|                  HR|  Pallavi mam|
|         Engineering|          Bob|
|             Finance|        David|
|           Marketing|    Amrit Sir|
|          Operations| No Employees|
|                null| No Employees|
|          Operations| No Employees|
|          Production| No Employees|
|             Finance| No Employees|
|Research and Deve...|        Frank|
+--------------------+-------------+



### Left Semi Joins

Question: Retrieve a DataFrame that includes employee names for departments that have employees.



In [23]:
#pyspark

joinType="left_semi"
#shows employee name who has matching records in department df
employees_df.join(department_df,joinExpression,joinType).select(employees_df['employee_name']).show()

+-------------+
|employee_name|
+-------------+
|  Pallavi mam|
|        Cathy|
|          Bob|
|        David|
|    Amrit Sir|
|        Frank|
+-------------+



                                                                                

In [35]:
#sql

left_semi_join_sql = spark.sql("""
                               select e.employee_name
                               from employee e
                               left semi join department d
                               on e.department_id = d.department_id  
                               """ )

left_semi_join_sql.show()


+-------------+
|employee_name|
+-------------+
|  Pallavi mam|
|        Cathy|
|          Bob|
|        David|
|    Amrit Sir|
|        Frank|
+-------------+



### Left Anti Joins

Question: Find the employees who don't belong to any department.

In [24]:
#pyspark

joinType="left_anti"
#shows employee name who doesnot have matching records in department df
employees_df.join(department_df,joinExpression,joinType).select(employees_df['employee_name']).show()

+-------------+
|employee_name|
+-------------+
|        Alice|
|          Eva|
|        Grace|
|        Henry|
+-------------+



In [36]:
#sql

left_anti_join_sql = spark.sql("""
                               select e.employee_name
                               from employee e
                               left anti join department d
                               on e.department_id = d.department_id       
                               """ )

left_anti_join_sql.show()


+-------------+
|employee_name|
+-------------+
|        Alice|
|          Eva|
|        Grace|
|        Henry|
+-------------+



### Cross (Cartesian) Joins

Question: Create a DataFrame that contains all possible combinations of employees and departments.

In [25]:
#pyspark

joinType = 'cross'
#Cartesian product of two df
#Be careful while using cross joins
employees_df.crossJoin(department_df).show()



+-----------+-------------+-------------+-------------+--------------------+
|Employee_Id|Employee_name|department_id|department_id|     department_name|
+-----------+-------------+-------------+-------------+--------------------+
|          1|  Pallavi mam|          101|          101|                  HR|
|          1|  Pallavi mam|          101|          102|         Engineering|
|          1|  Pallavi mam|          101|          103|             Finance|
|          1|  Pallavi mam|          101|          104|           Marketing|
|          1|  Pallavi mam|          101|          105|          Operations|
|          1|  Pallavi mam|          101|          106|                null|
|          1|  Pallavi mam|          101|          107|          Operations|
|          1|  Pallavi mam|          101|          108|          Production|
|          1|  Pallavi mam|          101|         null|             Finance|
|          1|  Pallavi mam|          101|          110|Research and Deve...|

                                                                                

In [37]:
#sql

cross_join_sql = spark.sql("""
                           select * 
                           from employee
                           cross join department
                           """)
                           
cross_join_sql.show()




+-----------+-------------+-------------+-------------+--------------------+
|Employee_Id|Employee_name|department_id|department_id|     department_name|
+-----------+-------------+-------------+-------------+--------------------+
|          1|  Pallavi mam|          101|          101|                  HR|
|          1|  Pallavi mam|          101|          102|         Engineering|
|          1|  Pallavi mam|          101|          103|             Finance|
|          1|  Pallavi mam|          101|          104|           Marketing|
|          1|  Pallavi mam|          101|          105|          Operations|
|          1|  Pallavi mam|          101|          106|                null|
|          1|  Pallavi mam|          101|          107|          Operations|
|          1|  Pallavi mam|          101|          108|          Production|
|          1|  Pallavi mam|          101|         null|             Finance|
|          1|  Pallavi mam|          101|          110|Research and Deve...|

                                                                                