 # C8   Joins 

https://datascience-enthusiast.com/Python/big_data_spark_part2
    

In [1]:
# create 3 DF 

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("c8_joins").getOrCreate()

person_data = [(0, "Bill Chambers", 0, [100]),\
               (1, "Matei Zaharia", 1, [500, 250, 100]),\
               (2, "Michael Armbrust", 1, [250, 100])]
person_column = ["id", "name", "graduate_program", "spark_status"]

person = spark.createDataFrame(person_data,person_column)


graduateProgram_data = [(0, "Masters", "School of Information", "UC Berkeley"),\
                        (2, "Masters", "EECS", "UC Berkeley"),\
                        (1, "Ph.D.", "EECS", "UC Berkeley")]
graduateProgram_column = [ "id", "degree", "department", "school"]

graduateProgram = spark.createDataFrame(graduateProgram_data,graduateProgram_column)

sparkStatus = spark.createDataFrame([ (500, "Vice President"), (250, "PMC Member"), (100, "Contributor")] , ["id", "status"])


person.createOrReplaceTempView("person")
graduateProgram.createOrReplaceTempView("graduateProgram")
sparkStatus.createOrReplaceTempView("sparkStatus")

person.show()
graduateProgram.show()
sparkStatus.show()

+---+----------------+----------------+---------------+
| id|            name|graduate_program|   spark_status|
+---+----------------+----------------+---------------+
|  0|   Bill Chambers|               0|          [100]|
|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  2|Michael Armbrust|               1|     [250, 100]|
+---+----------------+----------------+---------------+

+---+-------+--------------------+-----------+
| id| degree|          department|     school|
+---+-------+--------------------+-----------+
|  0|Masters|School of Informa...|UC Berkeley|
|  2|Masters|                EECS|UC Berkeley|
|  1|  Ph.D.|                EECS|UC Berkeley|
+---+-------+--------------------+-----------+

+---+--------------+
| id|        status|
+---+--------------+
|500|Vice President|
|250|    PMC Member|
|100|   Contributor|
+---+--------------+



# Inner JOIN 

In [2]:
sql = """
select * from 
person A , 
graduateProgram B 
where A.graduate_program = B.id 
"""

sql = """
select * from 
person A  
Inner join 
graduateProgram B 
ON A.graduate_program = B.id 
"""
spark.sql(sql).show(5)

from pyspark.sql.functions import col 

person.join(graduateProgram,person.graduate_program == graduateProgram.id ,"inner").show()

+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+

+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+---+----------------+----------------+---------------+---+-------+--------------------+--

# OUTER JOIN

In [3]:
sql = """
select * from 
person A 
FULL OUTER JOIN 
graduateProgram B 
ON A.graduate_program = B.id 
"""
spark.sql(sql).show(5)

person.join(graduateProgram, person.graduate_program == graduateProgram.id , "outer").show()

+----+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+----+----------------+----------------+---------------+---+-------+--------------------+-----------+
|   0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|   1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|   2|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|null|            null|            null|           null|  2|Masters|                EECS|UC Berkeley|
+----+----------------+----------------+---------------+---+-------+--------------------+-----------+

+----+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  id|            name|graduate_program|   spark_status| id| degree|          dep

# LEFT OUTER JOIN

In [4]:
sql = """
select * from 
person A 
LEFT OUTER JOIN 
graduateProgram B 
ON A.graduate_program = B.id 
"""
spark.sql(sql).show(5)

person.join(graduateProgram, person.graduate_program == graduateProgram.id , "left_outer").show()

+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+

+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+---+----------------+----------------+---------------+---+-------+--------------------+--

# Right outer JOin 

In [5]:
sql = """
select * from 
person A 
RIGHT OUTER JOIN 
graduateProgram B 
ON A.graduate_program = B.id 
"""
spark.sql(sql).show(5)

person.join(graduateProgram, person.graduate_program == graduateProgram.id , "right_outer").show()

+----+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+----+----------------+----------------+---------------+---+-------+--------------------+-----------+
|   0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|   1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|   2|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|null|            null|            null|           null|  2|Masters|                EECS|UC Berkeley|
+----+----------------+----------------+---------------+---+-------+--------------------+-----------+

+----+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  id|            name|graduate_program|   spark_status| id| degree|          dep

# left Semi join 

In [6]:
sql = """
select * from 
person A 
LEFT semi JOIN 
graduateProgram B 
ON A.graduate_program = B.id 
"""
spark.sql(sql).show(5)

person.join(graduateProgram, person.graduate_program == graduateProgram.id , "left_semi").show()

+---+----------------+----------------+---------------+
| id|            name|graduate_program|   spark_status|
+---+----------------+----------------+---------------+
|  0|   Bill Chambers|               0|          [100]|
|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  2|Michael Armbrust|               1|     [250, 100]|
+---+----------------+----------------+---------------+

+---+----------------+----------------+---------------+
| id|            name|graduate_program|   spark_status|
+---+----------------+----------------+---------------+
|  0|   Bill Chambers|               0|          [100]|
|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  2|Michael Armbrust|               1|     [250, 100]|
+---+----------------+----------------+---------------+



In [7]:
#left Anti JOin 

sql = """
select * from 
person A 
LEFT anti JOIN 
graduateProgram B 
ON A.graduate_program = B.id 
"""
spark.sql(sql).show(5)

+---+----+----------------+------------+
| id|name|graduate_program|spark_status|
+---+----+----------------+------------+
+---+----+----------------+------------+



In [8]:
#CROSS JOIN 

sql = """
select * from 
person A 
CROSS JOIN 
graduateProgram B 
ON A.graduate_program = B.id 
"""
spark.sql(sql).show()

+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+



In [9]:
sql = """select * from person"""
spark.sql(sql).show()
sql = """select * from sparkstatus"""
spark.sql(sql).show()

sql = """
select * from 
PERSON A 
INNER JOIN 
Sparkstatus B 
ON 
array_contains(spark_status,B.id)
"""
spark.sql(sql).show()

+---+----------------+----------------+---------------+
| id|            name|graduate_program|   spark_status|
+---+----------------+----------------+---------------+
|  0|   Bill Chambers|               0|          [100]|
|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  2|Michael Armbrust|               1|     [250, 100]|
+---+----------------+----------------+---------------+

+---+--------------+
| id|        status|
+---+--------------+
|500|Vice President|
|250|    PMC Member|
|100|   Contributor|
+---+--------------+

+---+----------------+----------------+---------------+---+--------------+
| id|            name|graduate_program|   spark_status| id|        status|
+---+----------------+----------------+---------------+---+--------------+
|  0|   Bill Chambers|               0|          [100]|100|   Contributor|
|  1|   Matei Zaharia|               1|[500, 250, 100]|500|Vice President|
|  1|   Matei Zaharia|               1|[500, 250, 100]|250|    PMC Member|
|  1|   M

# Join HInts 
    https://towardsdatascience.com/about-joins-in-spark-3-0-1e0ea083ea86

In [10]:
spark.stop()