In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [0]:
# Reuse the active Spark session if one exists; otherwise create one
# registered under the application name "Spark DataFrames".
spark = (
    SparkSession.builder
    .appName("Spark DataFrames")
    .getOrCreate()
)

In [0]:
# People. `graduate_program` holds the id of a row in the graduate_program
# DataFrame; `spark_status` holds a list of ids from the spark_status DataFrame.
person = spark.createDataFrame(
    [
        (0, "Bill Chambers", 0, [100]),
        (1, "Matei Zaharia", 1, [500, 250, 100]),
        (2, "Michael Armbrust", 1, [250, 100]),
    ],
    schema=["id", "name", "graduate_program", "spark_status"],
)

# Graduate programs. Note that no person row references program id 2,
# which is what makes the outer/anti join examples interesting.
graduate_program = spark.createDataFrame(
    [
        (0, "Masters", "School of Information", "UC Berkeley"),
        (2, "Masters", "EECS", "UC Berkeley"),
        (1, "Ph.D.", "EECS", "UC Berkeley"),
    ],
    schema=["id", "degree", "department", "school"],
)

# Lookup table mapping a status id to its human-readable label.
spark_status = spark.createDataFrame(
    [
        (500, "Vice President"),
        (250, "PMC Member"),
        (100, "Contributor"),
    ],
    schema=["id", "status"],
)

In [0]:
# Register each DataFrame as a temp view named after its Python variable so
# the %sql cells can query it.
for view_name, frame in (
    ("person", person),
    ("graduate_program", graduate_program),
    ("spark_status", spark_status),
):
    frame.createOrReplaceTempView(view_name)

Inner join

In [0]:
# Join condition: a person's program reference equals the program's id.
# This is an unevaluated Column expression; nothing runs until it is used in a join.
join_expression = person["graduate_program"] == graduate_program["id"]
join_expression

Column<'`=`(graduate_program, id)'>

In [0]:
# Deliberately mismatched keys: person names never equal school names in the
# sample data, so a join on this condition would match no rows.
wrongJoinExpression = (
    person["name"] == graduate_program["school"]
)
wrongJoinExpression

Column<'`=`(name, school)'>

In [0]:
# Join strategy handed to DataFrame.join as its third (`how`) argument.
join_type = "inner"

In [0]:
# Keep only the rows whose keys match on both sides.
person.join(graduate_program, on=join_expression, how=join_type).show()

+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+



In [0]:
%sql
-- Inner join: only people whose graduate_program matches a program id.
SELECT *
FROM person
INNER JOIN graduate_program
  ON person.graduate_program = graduate_program.id

id,name,graduate_program,spark_status,id.1,degree,department,school
0,Bill Chambers,0,List(100),0,Masters,School of Information,UC Berkeley
1,Matei Zaharia,1,"List(500, 250, 100)",1,Ph.D.,EECS,UC Berkeley
2,Michael Armbrust,1,"List(250, 100)",1,Ph.D.,EECS,UC Berkeley


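Full outer join -> Keep every row from both sides; where there is no match, the missing side is filled with NULL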

In [0]:
# Full outer join: unmatched rows from either side are kept, with NULLs
# filling the columns of the side that has no match.
join_type = "outer"
person.join(graduate_program, on=join_expression, how=join_type).show()


+----+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+----+----------------+----------------+---------------+---+-------+--------------------+-----------+
|   0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|   1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|   2|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|NULL|            NULL|            NULL|           NULL|  2|Masters|                EECS|UC Berkeley|
+----+----------------+----------------+---------------+---+-------+--------------------+-----------+



In [0]:
%sql
-- Full outer join: keep unmatched rows from both tables.
SELECT *
FROM person
FULL OUTER JOIN graduate_program
  ON person.graduate_program = graduate_program.id

id,name,graduate_program,spark_status,id.1,degree,department,school
0.0,Bill Chambers,0.0,List(100),0,Masters,School of Information,UC Berkeley
1.0,Matei Zaharia,1.0,"List(500, 250, 100)",1,Ph.D.,EECS,UC Berkeley
2.0,Michael Armbrust,1.0,"List(250, 100)",1,Ph.D.,EECS,UC Berkeley
,,,,2,Masters,EECS,UC Berkeley


Left Outer join

In [0]:
# Left outer join with graduate_program on the left: every program is kept,
# and programs nobody is enrolled in get NULL person columns.
join_type = "left_outer"
graduate_program.join(person, on=join_expression, how=join_type).show()

+---+-------+--------------------+-----------+----+----------------+----------------+---------------+
| id| degree|          department|     school|  id|            name|graduate_program|   spark_status|
+---+-------+--------------------+-----------+----+----------------+----------------+---------------+
|  0|Masters|School of Informa...|UC Berkeley|   0|   Bill Chambers|               0|          [100]|
|  2|Masters|                EECS|UC Berkeley|NULL|            NULL|            NULL|           NULL|
|  1|  Ph.D.|                EECS|UC Berkeley|   2|Michael Armbrust|               1|     [250, 100]|
|  1|  Ph.D.|                EECS|UC Berkeley|   1|   Matei Zaharia|               1|[500, 250, 100]|
+---+-------+--------------------+-----------+----+----------------+----------------+---------------+



In [0]:
%sql
-- Left outer join: every graduate_program row is kept, matched or not.
SELECT *
FROM graduate_program
LEFT OUTER JOIN person
  ON graduate_program.id = person.graduate_program

id,degree,department,school,id.1,name,graduate_program,spark_status
0,Masters,School of Information,UC Berkeley,0.0,Bill Chambers,0.0,List(100)
2,Masters,EECS,UC Berkeley,,,,
1,Ph.D.,EECS,UC Berkeley,2.0,Michael Armbrust,1.0,"List(250, 100)"
1,Ph.D.,EECS,UC Berkeley,1.0,Matei Zaharia,1.0,"List(500, 250, 100)"


Right Outer join

In [0]:
# Right outer join with graduate_program on the right: every program is kept,
# and unmatched programs get NULL person columns.
join_type = "right_outer"
person.join(graduate_program, on=join_expression, how=join_type).show()

+----+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+----+----------------+----------------+---------------+---+-------+--------------------+-----------+
|   0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|   1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|   2|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|NULL|            NULL|            NULL|           NULL|  2|Masters|                EECS|UC Berkeley|
+----+----------------+----------------+---------------+---+-------+--------------------+-----------+



In [0]:
%sql
-- Right outer join: every graduate_program row is kept, matched or not.
SELECT *
FROM person
RIGHT OUTER JOIN graduate_program
  ON person.graduate_program = graduate_program.id

id,name,graduate_program,spark_status,id.1,degree,department,school
0.0,Bill Chambers,0.0,List(100),0,Masters,School of Information,UC Berkeley
,,,,2,Masters,EECS,UC Berkeley
2.0,Michael Armbrust,1.0,"List(250, 100)",1,Ph.D.,EECS,UC Berkeley
1.0,Matei Zaharia,1.0,"List(500, 250, 100)",1,Ph.D.,EECS,UC Berkeley


Left Semi join

In [0]:
# Left semi join: acts as a filter on the left side. Only graduate_program
# columns are returned, for programs that have at least one matching person.
join_type = "left_semi"
graduate_program.join(person, on=join_expression, how=join_type).show()

+---+-------+--------------------+-----------+
| id| degree|          department|     school|
+---+-------+--------------------+-----------+
|  0|Masters|School of Informa...|UC Berkeley|
|  1|  Ph.D.|                EECS|UC Berkeley|
+---+-------+--------------------+-----------+



In [0]:
# Append one extra row that reuses program id 0, to show how semi/anti joins
# treat duplicate keys on the left side. union() lines columns up by position,
# so the new single-row DataFrame does not need matching column names.
graduate_program2 = graduate_program.union(
    spark.createDataFrame(
        [(0, "Masters", "Duplicated Row", "Duplicated School")]
    )
)
# Make the extended table available to the %sql cells as well.
graduate_program2.createOrReplaceTempView("graduate_program2")

In [0]:
# Left semi join against the table that contains a duplicated program id.
# A semi join only filters the left side, so both id-0 rows are returned.
#
# join_type is set here, as in the other join cells, so this cell does not
# depend on whichever join cell happened to run last.
# NOTE(review): join_expression was built from graduate_program["id"]; it
# resolves here because graduate_program2 is a union on top of
# graduate_program. Confirm if that lineage ever changes.
join_type = 'left_semi'
(
    graduate_program2
    .join(person, join_expression, join_type)
    .show()
)

+---+-------+--------------------+-----------------+
| id| degree|          department|           school|
+---+-------+--------------------+-----------------+
|  0|Masters|School of Informa...|      UC Berkeley|
|  1|  Ph.D.|                EECS|      UC Berkeley|
|  0|Masters|      Duplicated Row|Duplicated School|
+---+-------+--------------------+-----------------+



In [0]:
%sql
-- Left semi join: return graduate_program2 rows that have a matching person.
SELECT *
FROM graduate_program2
LEFT SEMI JOIN person
  ON graduate_program2.id = person.graduate_program

id,degree,department,school
0,Masters,School of Information,UC Berkeley
1,Ph.D.,EECS,UC Berkeley
0,Masters,Duplicated Row,Duplicated School


Left Anti join

In [0]:
# Left anti join: the opposite filter to a semi join. Only graduate_program2
# rows with no matching person are returned.
join_type = "left_anti"
graduate_program2.join(person, on=join_expression, how=join_type).show()

+---+-------+----------+-----------+
| id| degree|department|     school|
+---+-------+----------+-----------+
|  2|Masters|      EECS|UC Berkeley|
+---+-------+----------+-----------+



In [0]:
%sql
-- Left anti join: return graduate_program2 rows that have no matching person.
SELECT *
FROM graduate_program2
LEFT ANTI JOIN person
  ON graduate_program2.id = person.graduate_program

id,degree,department,school
2,Masters,EECS,UC Berkeley


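Natural join -> Not recommended: it implicitly joins on every column name the two tables share (here only `id`), so people are matched to programs by person id instead of by `graduate_program`, which gives wrong results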

In [0]:
%sql
-- NATURAL JOIN picks the join columns itself: the names both tables share.
-- person and graduate_program share only `id`, so that is what gets compared.
SELECT *
FROM person
NATURAL JOIN graduate_program

id,name,graduate_program,spark_status,degree,department,school
0,Bill Chambers,0,List(100),Masters,School of Information,UC Berkeley
1,Matei Zaharia,1,"List(500, 250, 100)",Ph.D.,EECS,UC Berkeley
2,Michael Armbrust,1,"List(250, 100)",Masters,EECS,UC Berkeley


Cross join (Cartesian join) -> Pair every row of the left table with every row of the right table

In [0]:
# Cartesian product: each graduate_program2 row paired with each person row.
(
    graduate_program2
    .crossJoin(person)
    .show()
)

+---+-------+--------------------+-----------------+---+----------------+----------------+---------------+
| id| degree|          department|           school| id|            name|graduate_program|   spark_status|
+---+-------+--------------------+-----------------+---+----------------+----------------+---------------+
|  0|Masters|School of Informa...|      UC Berkeley|  0|   Bill Chambers|               0|          [100]|
|  0|Masters|School of Informa...|      UC Berkeley|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  0|Masters|School of Informa...|      UC Berkeley|  2|Michael Armbrust|               1|     [250, 100]|
|  2|Masters|                EECS|      UC Berkeley|  0|   Bill Chambers|               0|          [100]|
|  2|Masters|                EECS|      UC Berkeley|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  2|Masters|                EECS|      UC Berkeley|  2|Michael Armbrust|               1|     [250, 100]|
|  1|  Ph.D.|                EECS|   

In [0]:
%sql
-- Cross join: no join condition, every row pairing is returned.
SELECT *
FROM graduate_program2
CROSS JOIN person

id,degree,department,school,id.1,name,graduate_program,spark_status
0,Masters,School of Information,UC Berkeley,0,Bill Chambers,0,List(100)
0,Masters,School of Information,UC Berkeley,1,Matei Zaharia,1,"List(500, 250, 100)"
0,Masters,School of Information,UC Berkeley,2,Michael Armbrust,1,"List(250, 100)"
2,Masters,EECS,UC Berkeley,0,Bill Chambers,0,List(100)
2,Masters,EECS,UC Berkeley,1,Matei Zaharia,1,"List(500, 250, 100)"
2,Masters,EECS,UC Berkeley,2,Michael Armbrust,1,"List(250, 100)"
1,Ph.D.,EECS,UC Berkeley,0,Bill Chambers,0,List(100)
1,Ph.D.,EECS,UC Berkeley,1,Matei Zaharia,1,"List(500, 250, 100)"
1,Ph.D.,EECS,UC Berkeley,2,Michael Armbrust,1,"List(250, 100)"
0,Masters,Duplicated Row,Duplicated School,0,Bill Chambers,0,List(100)
