## Setup

In [1]:
from pyspark.sql import SparkSession
import os
import sys

In [2]:
# Point both the PySpark worker and driver interpreter settings at the Python
# executable running this kernel. This is done before the session is created.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

# Reuse the active SparkSession if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
spark

Data source: the Kaggle "Retail Data Analytics" dataset, https://www.kaggle.com/manjeetsingh/retaildataset

In [3]:
# Load the sales records from CSV: the first row is used as the header and
# column types are inferred from the data.
# NOTE(review): `Date` is displayed in dd/MM/yyyy form, so it appears to be kept
# as text rather than parsed into a date type - confirm before doing date logic.
sales = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('sales data-set.csv')
)
sales.show(5)

+-----+----+----------+------------+---------+
|Store|Dept|      Date|Weekly_Sales|IsHoliday|
+-----+----+----------+------------+---------+
|    1|   1|05/02/2010|     24924.5|    false|
|    1|   1|12/02/2010|    46039.49|     true|
|    1|   1|19/02/2010|    41595.55|    false|
|    1|   1|26/02/2010|    19403.54|    false|
|    1|   1|05/03/2010|     21827.9|    false|
+-----+----+----------+------------+---------+
only showing top 5 rows



In [4]:
# Load the per-store attributes from CSV: the first row is used as the header
# and column types are inferred from the data.
stores = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('stores data-set.csv')
)
stores.show(5)

+-----+----+------+
|Store|Type|  Size|
+-----+----+------+
|    1|   A|151315|
|    2|   A|202307|
|    3|   B| 37392|
|    4|   A|205863|
|    5|   B| 34875|
+-----+----+------+
only showing top 5 rows



## Broadcast join

- https://mungingdata.com/apache-spark/broadcast-joins/
- https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.Broadcast.html
- https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.join.html

In [5]:
from pyspark.sql.functions import broadcast

In [6]:
# Inner-join every sales row with the attributes of its store, matching on the
# shared "Store" column. `stores` is wrapped in broadcast() to mark it as the
# side to broadcast for the join.
joined = sales.join(broadcast(stores), on="Store", how="inner")
joined.show(5)

+-----+----+----------+------------+---------+----+------+
|Store|Dept|      Date|Weekly_Sales|IsHoliday|Type|  Size|
+-----+----+----------+------------+---------+----+------+
|    1|   1|05/02/2010|     24924.5|    false|   A|151315|
|    1|   1|12/02/2010|    46039.49|     true|   A|151315|
|    1|   1|19/02/2010|    41595.55|    false|   A|151315|
|    1|   1|26/02/2010|    19403.54|    false|   A|151315|
|    1|   1|05/03/2010|     21827.9|    false|   A|151315|
+-----+----+----------+------------+---------+----+------+
only showing top 5 rows



In [7]:
# Print the physical plan of the join so the effect of the broadcast() hint can
# be checked; if the hint was honored the plan is expected to contain a
# BroadcastHashJoin node.
joined.explain()

In [8]:
# Optional, left disabled: uncomment to print the physical plan of `stores` on
# its own, e.g. to compare it with the plan of the joined DataFrame.
#stores.explain()

In [9]:
# Optional, left disabled: uncomment to print the physical plan of `sales` on
# its own, e.g. to compare it with the plan of the joined DataFrame.
#sales.explain()