In [2]:
from pyspark.sql import SparkSession

In [6]:
# Build (or reuse) the Spark session for this notebook: local mode on all
# available cores, with 10g of driver memory.
session_builder = SparkSession.builder.appName("bucketing example").master("local[*]")
session_builder = session_builder.config("spark.driver.memory", "10g")
spark = session_builder.getOrCreate()

# Only warnings and errors are logged, which keeps the explain() output readable.
sc = spark.sparkContext
sc.setLogLevel("WARN")

# A threshold of -1 turns automatic broadcast joins off, so the joins in this
# notebook are planned as sort-merge joins.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [7]:
# Echo the session object so the notebook renders it as this cell's output.
spark

In [4]:
# Load the raw orders CSV: the first row is treated as the header and the
# column types are inferred from the data.
orders_data = spark.read.csv(
    path="data/orders.csv",
    header=True,
    inferSchema=True,
)

# Show the inferred column names and types.
orders_data.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- total_amount: integer (nullable = true)



In [5]:
# Load the raw products CSV: the first row is treated as the header and the
# column types are inferred from the data.
products_data = spark.read.csv(
    path="data/products.csv",
    header=True,
    inferSchema=True,
)

# Show the inferred column names and types.
products_data.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- stock: integer (nullable = true)



### Bucketing for joins

In [6]:
# Baseline: inner join of the CSV-backed dataframes, without bucketing.
# The join condition is an explicit equality between the two product_id columns.
enriched_orders = orders_data.join(
    products_data,
    orders_data["product_id"] == products_data["product_id"],
    "inner",
)


In [7]:
# Print the physical plan Spark chose for the non-bucketed join.
enriched_orders.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [product_id#18], [product_id#46], Inner
   :- Sort [product_id#18 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(product_id#18, 200), ENSURE_REQUIREMENTS, [plan_id=60]
   :     +- Filter isnotnull(product_id#18)
   :        +- FileScan csv [order_id#17,product_id#18,customer_id#19,quantity#20,order_date#21,total_amount#22] Batched: false, DataFilters: [isnotnull(product_id#18)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/mnt/c/Users/prana/Downloads/spark books/spark-learning/data/orde..., PartitionFilters: [], PushedFilters: [IsNotNull(product_id)], ReadSchema: struct<order_id:int,product_id:int,customer_id:int,quantity:int,order_date:timestamp,total_amount...
   +- Sort [product_id#46 ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(product_id#46, 200), ENSURE_REQUIREMENTS, [plan_id=61]
         +- Filter isnotnull(product_id#46)
            +- FileScan csv [product_id#46,

In [8]:
# Save both dataframes as tables bucketed on the join key.

# Both tables get the same bucket column (product_id) and the same bucket
# count (4). Products are written first, then orders; "overwrite" replaces
# any existing table with the same name.
for source_df, table_name in (
    (products_data, "products_data"),
    (orders_data, "orders_data"),
):
    bucketed_writer = source_df.write.bucketBy(4, "product_id").mode("overwrite")
    bucketed_writer.saveAsTable(table_name)

                                                                                

In [9]:
# Load the bucketed tables back from the catalog (products first, then orders).
products_data_buck, orders_data_buck = (
    spark.table(table_name) for table_name in ("products_data", "orders_data")
)


In [10]:
# Same inner join as the baseline, this time between the bucketed tables.
# The join key is given by column name here.
enriched_orders_buck = orders_data_buck.join(products_data_buck, "product_id", "inner")

# Print the physical plan for the bucketed join.
enriched_orders_buck.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [product_id#121, order_id#120, customer_id#122, quantity#123, order_date#124, total_amount#125, product_name#109, category#110, brand#111, price#112, stock#113]
   +- SortMergeJoin [product_id#121], [product_id#108], Inner
      :- Sort [product_id#121 ASC NULLS FIRST], false, 0
      :  +- Filter isnotnull(product_id#121)
      :     +- FileScan parquet spark_catalog.default.orders_data[order_id#120,product_id#121,customer_id#122,quantity#123,order_date#124,total_amount#125] Batched: true, Bucketed: true, DataFilters: [isnotnull(product_id#121)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/mnt/c/Users/prana/Downloads/spark%20books/spark-learning/spark-w..., PartitionFilters: [], PushedFilters: [IsNotNull(product_id)], ReadSchema: struct<order_id:int,product_id:int,customer_id:int,quantity:int,order_date:timestamp,total_amount..., SelectedBucketsCount: 4 out of 4
      +- Sort [product_id#108 ASC NULLS F

In [11]:
# Bucketed join again, restricted to a single product (product_id == 6).
# Note: this rebinds enriched_orders_buck from the previous cell.
bucketed_join = orders_data_buck.join(products_data_buck, "product_id", "inner")
enriched_orders_buck = bucketed_join.filter(orders_data_buck["product_id"] == 6)

# Print the physical plan for the filtered bucketed join.
enriched_orders_buck.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [product_id#121, order_id#120, customer_id#122, quantity#123, order_date#124, total_amount#125, product_name#109, category#110, brand#111, price#112, stock#113]
   +- SortMergeJoin [product_id#121], [product_id#108], Inner
      :- Sort [product_id#121 ASC NULLS FIRST], false, 0
      :  +- Filter (isnotnull(product_id#121) AND (product_id#121 = 6))
      :     +- FileScan parquet spark_catalog.default.orders_data[order_id#120,product_id#121,customer_id#122,quantity#123,order_date#124,total_amount#125] Batched: true, Bucketed: true, DataFilters: [isnotnull(product_id#121), (product_id#121 = 6)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/mnt/c/Users/prana/Downloads/spark%20books/spark-learning/spark-w..., PartitionFilters: [], PushedFilters: [IsNotNull(product_id), EqualTo(product_id,6)], ReadSchema: struct<order_id:int,product_id:int,customer_id:int,quantity:int,order_date:timestamp,total_amount..., Se

### Bucketing for aggregations

In [12]:
# NOTE: this import rebinds the name `sum` to Spark's column aggregate, hiding
# Python's built-in sum() from here on. The aggregation cells below rely on it.
from pyspark.sql.functions import sum

# Aggregating without Buckets

# Without filter: total sales amount per product, from the CSV-backed dataframe.
orders_by_product = orders_data.groupBy("product_id")
orders_data_agg_enriched = orders_by_product.agg(
    sum("total_amount").alias("total_sales_amount")
)

# Print the physical plan for the non-bucketed aggregation.
orders_data_agg_enriched.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[product_id#18], functions=[sum(total_amount#22)])
   +- Exchange hashpartitioning(product_id#18, 200), ENSURE_REQUIREMENTS, [plan_id=186]
      +- HashAggregate(keys=[product_id#18], functions=[partial_sum(total_amount#22)])
         +- FileScan csv [product_id#18,total_amount#22] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/mnt/c/Users/prana/Downloads/spark books/spark-learning/data/orde..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<product_id:int,total_amount:int>




In [13]:
# With filter: the same per-product total on the CSV-backed dataframe,
# restricted to product_id == 6. The filter is written after the aggregation.
# Note: this rebinds orders_data_agg_enriched from the previous cell.
sales_per_product = orders_data.groupBy("product_id").agg(
    sum("total_amount").alias("total_sales_amount")
)
orders_data_agg_enriched = sales_per_product.filter(orders_data["product_id"] == 6)

# Print the physical plan for the filtered, non-bucketed aggregation.
orders_data_agg_enriched.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[product_id#18], functions=[sum(total_amount#22)])
   +- Exchange hashpartitioning(product_id#18, 200), ENSURE_REQUIREMENTS, [plan_id=205]
      +- HashAggregate(keys=[product_id#18], functions=[partial_sum(total_amount#22)])
         +- Filter (isnotnull(product_id#18) AND (product_id#18 = 6))
            +- FileScan csv [product_id#18,total_amount#22] Batched: false, DataFilters: [isnotnull(product_id#18), (product_id#18 = 6)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/mnt/c/Users/prana/Downloads/spark books/spark-learning/data/orde..., PartitionFilters: [], PushedFilters: [IsNotNull(product_id), EqualTo(product_id,6)], ReadSchema: struct<product_id:int,total_amount:int>




In [14]:
# Aggregating with Buckets

# Without filter: total sales amount per product, read from the bucketed table.
# Note: this rebinds orders_data_agg_enriched from the previous cell.
bucketed_orders_by_product = orders_data_buck.groupBy("product_id")
orders_data_agg_enriched = bucketed_orders_by_product.agg(
    sum("total_amount").alias("total_sales_amount")
)

# Print the physical plan for the bucketed aggregation.
orders_data_agg_enriched.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[product_id#121], functions=[sum(total_amount#125)])
   +- HashAggregate(keys=[product_id#121], functions=[partial_sum(total_amount#125)])
      +- FileScan parquet spark_catalog.default.orders_data[product_id#121,total_amount#125] Batched: true, Bucketed: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/mnt/c/Users/prana/Downloads/spark%20books/spark-learning/spark-w..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<product_id:int,total_amount:int>, SelectedBucketsCount: 4 out of 4




In [15]:
# With filter: the same per-product total on the bucketed table, restricted to
# product_id == 6. The filter is written after the aggregation.
# Note: this rebinds orders_data_agg_enriched from the previous cell.
bucketed_sales_per_product = orders_data_buck.groupBy("product_id").agg(
    sum("total_amount").alias("total_sales_amount")
)
orders_data_agg_enriched = bucketed_sales_per_product.filter(
    orders_data_buck["product_id"] == 6
)

# Print the physical plan for the filtered, bucketed aggregation.
orders_data_agg_enriched.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[product_id#121], functions=[sum(total_amount#125)])
   +- HashAggregate(keys=[product_id#121], functions=[partial_sum(total_amount#125)])
      +- Filter (isnotnull(product_id#121) AND (product_id#121 = 6))
         +- FileScan parquet spark_catalog.default.orders_data[product_id#121,total_amount#125] Batched: true, Bucketed: true, DataFilters: [isnotnull(product_id#121), (product_id#121 = 6)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/mnt/c/Users/prana/Downloads/spark%20books/spark-learning/spark-w..., PartitionFilters: [], PushedFilters: [IsNotNull(product_id), EqualTo(product_id,6)], ReadSchema: struct<product_id:int,total_amount:int>, SelectedBucketsCount: 1 out of 4




In [16]:
# Dropping the tables

# DROP TABLE IF EXISTS does nothing when the table is already gone, so this
# cleanup cell can be re-run safely without a try/except. Any other failure
# now surfaces as a real error instead of being swallowed by a bare `except`
# and reported as "Tables do not exist".
# spark.sql() runs DDL statements eagerly, so no .show() is needed; it would
# only print an empty result frame.
spark.sql("DROP TABLE IF EXISTS orders_data")
spark.sql("DROP TABLE IF EXISTS products_data")

print("Tables are dropped")

++
||
++
++

++
||
++
++

Tables are dropped


## How to determine bucket size

#### Ideal bucket size should be around 128 - 200 MB

##### So, to get the ideal number of buckets:

##### total size of the dataset / Ideal Bucket size

##### E.g.: 2000 MB / 200 MB = 10 buckets
