# Joins

In [3]:
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

from delta import *
import pandas as pd

# start spark
builder = (pyspark.sql.SparkSession.builder.appName("Spark-Course")
                .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
                .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"))

spark = configure_spark_with_delta_pip(builder).getOrCreate()

# setting log-level to ERROR to decrease verbosity
# log4j log-levels are: OFF, FATAL, ERROR, WARN, INFO, DEBUG, TRACE, ALL
spark.sparkContext.setLogLevel("ERROR")

# so that we can register UDFs in SQL
spark.builder.enableHiveSupport()

spark

In [5]:
def display(spark_df, rows=10):
    return spark_df.limit(rows).toPandas().head(rows)

In [34]:
%load_ext sparksql_magic

In [22]:
langyear = (spark.createDataFrame([Row(1, 'Python', 1991),
                        Row(2, 'R', 1998),
                        Row(3, 'C++', 1985),
                        Row(5, 'Agda', 2007)
                    ],
                    schema = ['id', 'lang', 'year'])
)

display(langyear)

Unnamed: 0,id,lang,year
0,1,Python,1991
1,2,R,1998
2,3,C++,1985
3,5,Agda,2007


In [23]:
langyear.printSchema()

root
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- year: long (nullable = true)



In [24]:
langdevs = (spark.createDataFrame([Row(3, ['Bjarne Stroustrup']),
                                    Row(1, ['Guido van Rossum']),
                                    Row(2, ['Ross Ihaka', 'Robert Gentleman']),
                                    Row(4, ['Graydon Hoare'])],
                                    schema = ['id', 'devs'])
)

display(langdevs)

Unnamed: 0,id,devs
0,3,[Bjarne Stroustrup]
1,1,[Guido van Rossum]
2,2,"[Ross Ihaka, Robert Gentleman]"
3,4,[Graydon Hoare]


In [25]:
langdevs.printSchema()

root
 |-- id: long (nullable = true)
 |-- devs: array (nullable = true)
 |    |-- element: string (containsNull = true)



## Join Types

### Inner

In [27]:
display(
    langyear.join(langdevs, langdevs.id == langyear.id, 'inner')
)

Unnamed: 0,id,lang,year,id.1,devs
0,1,Python,1991,1,[Guido van Rossum]
1,2,R,1998,2,"[Ross Ihaka, Robert Gentleman]"
2,3,C++,1985,3,[Bjarne Stroustrup]


### Outer

In [49]:
display(
    langyear.join(langdevs, langdevs.id == langyear.id, 'outer').drop(langdevs.id)
)

Unnamed: 0,id,lang,year,devs
0,1.0,Python,1991.0,[Guido van Rossum]
1,2.0,R,1998.0,"[Ross Ihaka, Robert Gentleman]"
2,3.0,C++,1985.0,[Bjarne Stroustrup]
3,,,,[Graydon Hoare]
4,5.0,Agda,2007.0,


### Left Outer

In [50]:
display(
    langyear.join(langdevs, langdevs.id == langyear.id, 'left_outer').drop(langdevs.id)
)

Unnamed: 0,id,lang,year,devs
0,1,Python,1991,[Guido van Rossum]
1,2,R,1998,"[Ross Ihaka, Robert Gentleman]"
2,3,C++,1985,[Bjarne Stroustrup]
3,5,Agda,2007,


### Right Outer

In [51]:
display(
    langyear.join(langdevs, langdevs.id == langyear.id, 'right_outer').drop(langdevs.id)
)

Unnamed: 0,id,lang,year,devs
0,1.0,Python,1991.0,[Guido van Rossum]
1,2.0,R,1998.0,"[Ross Ihaka, Robert Gentleman]"
2,3.0,C++,1985.0,[Bjarne Stroustrup]
3,,,,[Graydon Hoare]


### Left Semi

In [31]:
display(
    langyear.join(langdevs, langdevs.id == langyear.id, 'left_semi')
)

Unnamed: 0,id,lang,year
0,1,Python,1991
1,2,R,1998
2,3,C++,1985


### Left Anti

In [32]:
display(
    langyear.join(langdevs, langdevs.id == langyear.id, 'left_anti')
)

Unnamed: 0,id,lang,year
0,5,Agda,2007


### Natural

In [35]:
langyear.createOrReplaceTempView('langyear')
langdevs.createOrReplaceTempView('langdevs')

In [38]:
%%sparksql
select * from langyear natural join langdevs -- natural inner join

0,1,2,3
id,lang,year,devs
1,Python,1991,['Guido van Rossum']
2,R,1998,"['Ross Ihaka', 'Robert Gentleman']"
3,C++,1985,['Bjarne Stroustrup']


In [39]:
%%sparksql
select * from langyear natural left join langdevs

0,1,2,3
id,lang,year,devs
1,Python,1991,['Guido van Rossum']
2,R,1998,"['Ross Ihaka', 'Robert Gentleman']"
3,C++,1985,['Bjarne Stroustrup']
5,Agda,2007,


### Cartesian

In [46]:
display(
    langyear.crossJoin(langdevs)
, 100)

Unnamed: 0,id,lang,year,id.1,devs
0,1,Python,1991,3,[Bjarne Stroustrup]
1,1,Python,1991,1,[Guido van Rossum]
2,1,Python,1991,2,"[Ross Ihaka, Robert Gentleman]"
3,1,Python,1991,4,[Graydon Hoare]
4,2,R,1998,3,[Bjarne Stroustrup]
5,2,R,1998,1,[Guido van Rossum]
6,2,R,1998,2,"[Ross Ihaka, Robert Gentleman]"
7,2,R,1998,4,[Graydon Hoare]
8,3,C++,1985,3,[Bjarne Stroustrup]
9,3,C++,1985,1,[Guido van Rossum]


## Broadcast Join

Used to join a big table to a small table.

Small table should be small enough to entirely fit in a worker.

In [67]:
# spark might automatically do a broadcast join
# in this case, does not do broadcast join even when the tables are small
langdevs.join(langyear, langdevs.id == langyear.id, 'inner').explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [id#52L], [id#46L], Inner
   :- Sort [id#52L ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(id#52L, 200), ENSURE_REQUIREMENTS, [id=#2350]
   :     +- Filter isnotnull(id#52L)
   :        +- Scan ExistingRDD[id#52L,devs#53]
   +- Sort [id#46L ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(id#46L, 200), ENSURE_REQUIREMENTS, [id=#2351]
         +- Filter isnotnull(id#46L)
            +- Scan ExistingRDD[id#46L,lang#47,year#48L]




In [66]:
# give spark a hint to do broadcast join
# spark can ignore the hint
spark.sql('''select /*+ mapjoin(langdevs) */
                *
            from
                langdevs
            inner join
                langyear
            on
                langdevs.id = langyear.id''').explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- BroadcastHashJoin [id#52L], [id#46L], Inner, BuildLeft, false
   :- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [id=#2327]
   :  +- Filter isnotnull(id#52L)
   :     +- Scan ExistingRDD[id#52L,devs#53]
   +- Filter isnotnull(id#46L)
      +- Scan ExistingRDD[id#46L,lang#47,year#48L]




In [68]:
# spark.stop()