In [63]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import time

spark = SparkSession.builder.appName('Join exercise').getOrCreate()
spark


In [64]:
# Sample data for df1
data1 = [("Alice", 1), ("Bob", 2), ("Bob", 7), ("Charlie", 3),("Charlie", 3)]
columns1 = ["name", "id"]
df1 = spark.createDataFrame(data=data1, schema=columns1)
df1.show()


# Sample data for df2
data2 = [("Alice", "Engineer"), ("Bob", "Doctor"), ("David", "Scientist")]
columns2 = ["name", "profession"]
df2 = spark.createDataFrame(data=data2, schema=columns2)
df2.show()

+-------+---+
|   name| id|
+-------+---+
|  Alice|  1|
|    Bob|  2|
|    Bob|  7|
|Charlie|  3|
|Charlie|  3|
+-------+---+

+-----+----------+
| name|profession|
+-----+----------+
|Alice|  Engineer|
|  Bob|    Doctor|
|David| Scientist|
+-----+----------+



In [65]:
df1.createOrReplaceTempView('df1')
df2.createOrReplaceTempView('df2')

In [66]:
# Inner join: This joins both DataFrames on the identifier and retains only the common rows and columns
start_time = time.time()

inner_join = df1.join(df2, df1['name'] == df2['name'], how = 'inner')
inner_join.show()

end_time = time.time()
elapsed_time = start_time - end_time
print(f'Time taken to complete the dataframe inner join is {elapsed_time}.')

+-----+---+-----+----------+
| name| id| name|profession|
+-----+---+-----+----------+
|Alice|  1|Alice|  Engineer|
|  Bob|  2|  Bob|    Doctor|
|  Bob|  7|  Bob|    Doctor|
+-----+---+-----+----------+

Time taken to complete the dataframe inner join is -6.343767166137695.


In [67]:
start_time = time.time()

inner_join_sql = spark.sql('select a.*, b.* from df1 a inner join df2 b on a.name = b.name')
inner_join_sql.show()

end_time = time.time()
elapsed_time = start_time - end_time
print(f'Time taken to complete the sql inner join is {elapsed_time}.')

+-----+---+-----+----------+
| name| id| name|profession|
+-----+---+-----+----------+
|Alice|  1|Alice|  Engineer|
|  Bob|  2|  Bob|    Doctor|
|  Bob|  7|  Bob|    Doctor|
+-----+---+-----+----------+

Time taken to complete the sql inner join is -6.351756811141968.


In [68]:
# Left/Left outer join: A left join merges both the DataFrames on the
# identifier and retains all the rows of the left-hand DataFrame and
# matches any rows that are common with the right-hand DataFrame.
# If there is no equivalent row in the right-hand DataFrame, Spark will
# insert a null for all the columns.

start_time = time.time()

left_join = df1.join(df2, df1['name'] == df2['name'], how = 'left')
left_join.show()

end_time = time.time()
elapsed_time = start_time - end_time
print(f'Time taken to complete the sql left join is {elapsed_time}.')

+-------+---+-----+----------+
|   name| id| name|profession|
+-------+---+-----+----------+
|  Alice|  1|Alice|  Engineer|
|    Bob|  2|  Bob|    Doctor|
|    Bob|  7|  Bob|    Doctor|
|Charlie|  3| null|      null|
|Charlie|  3| null|      null|
+-------+---+-----+----------+

Time taken to complete the sql left join is -6.247480154037476.


In [69]:
start_time = time.time()

left_join_sql = spark.sql('select a.*, b.* from df1 a left join df2 b on a.name = b.name')
left_join_sql.show()

end_time = time.time()
elapsed_time = start_time - end_time
print(f'Time taken to complete the sql left join is {elapsed_time}.')

+-------+---+-----+----------+
|   name| id| name|profession|
+-------+---+-----+----------+
|  Alice|  1|Alice|  Engineer|
|    Bob|  2|  Bob|    Doctor|
|    Bob|  7|  Bob|    Doctor|
|Charlie|  3| null|      null|
|Charlie|  3| null|      null|
+-------+---+-----+----------+

Time taken to complete the sql left join is -6.123517036437988.


In [70]:
# Right/Right outer join: This merges both DataFrames on the
# identifier and retains all the rows of the right-hand DataFrame and
# matches any rows that are common with the left-hand DataFrame.
# If there is no equivalent row in the left-hand DataFrame, Spark will
# insert a null for all the columns.

right_join = df1.join(df2, df1['name'] == df2['name'], how = 'right')
right_join.show()

+-----+----+-----+----------+
| name|  id| name|profession|
+-----+----+-----+----------+
|Alice|   1|Alice|  Engineer|
|  Bob|   7|  Bob|    Doctor|
|  Bob|   2|  Bob|    Doctor|
| null|null|David| Scientist|
+-----+----+-----+----------+



In [71]:
rightt_join_sql = spark.sql('select a.*, b.* from df1 a right join df2 b on a.name = b.name')
rightt_join_sql.show()

+-----+----+-----+----------+
| name|  id| name|profession|
+-----+----+-----+----------+
|Alice|   1|Alice|  Engineer|
|  Bob|   7|  Bob|    Doctor|
|  Bob|   2|  Bob|    Doctor|
| null|null|David| Scientist|
+-----+----+-----+----------+



In [72]:
# Full outer join: This joins both DataFrames on the identifier and
# retains all the rows of both the left and right DataFrames that have
# the same identifier. If there are no equivalent rows in either of the
# DataFrames, Spark will insert a null for all the columns.

full_join = df1.join(df2, df1['name'] == df2['name'], how = 'full')
full_join.show()

+-------+----+-----+----------+
|   name|  id| name|profession|
+-------+----+-----+----------+
|  Alice|   1|Alice|  Engineer|
|    Bob|   2|  Bob|    Doctor|
|    Bob|   7|  Bob|    Doctor|
|Charlie|   3| null|      null|
|Charlie|   3| null|      null|
|   null|null|David| Scientist|
+-------+----+-----+----------+



In [73]:
full_join_sql = spark.sql('select a.*, b.* from df1 a full join df2 b on a.name = b.name')
full_join_sql.show()

+-------+----+-----+----------+
|   name|  id| name|profession|
+-------+----+-----+----------+
|  Alice|   1|Alice|  Engineer|
|    Bob|   2|  Bob|    Doctor|
|    Bob|   7|  Bob|    Doctor|
|Charlie|   3| null|      null|
|Charlie|   3| null|      null|
|   null|null|David| Scientist|
+-------+----+-----+----------+



In [74]:
# Left Anti join: This joins both the DataFrames on the identifier and retains all the rows of left-hand DataFrame that are 
# not present in the right-hand DataFrame. It also only retains the left-hand DataFrame schema..


anti_join = df1.join(df2, on="name", how="left_anti")
anti_join.show()

+-------+---+
|   name| id|
+-------+---+
|Charlie|  3|
|Charlie|  3|
+-------+---+



In [75]:
anti_join_sql = spark.sql('select a.* from df1 a left join df2 b on a.name = b.name where b.name is NULL')
anti_join_sql.show()

+-------+---+
|   name| id|
+-------+---+
|Charlie|  3|
|Charlie|  3|
+-------+---+



In [76]:
# Left Semi join: This is similar to an inner join, except it would not yield the columns from the right-hand DataFrame..

left_semi = df1.join(df2, on="name", how="left_semi")
left_semi.show()

+-----+---+
| name| id|
+-----+---+
|Alice|  1|
|  Bob|  2|
|  Bob|  7|
+-----+---+



In [77]:
left_semi_sql = spark.sql('select a.* from df1 a inner join df2 b on a.name = b.name')
left_semi_sql.show()

+-----+---+
| name| id|
+-----+---+
|Alice|  1|
|  Bob|  2|
|  Bob|  7|
+-----+---+

