### Partition Tutorial

In [None]:
# Read the movies table into a DataFrame; every later cell builds on `df`.
df = spark.read.table("workspace.default.movies")

# Preview only the first five rows rather than the whole table.
display(df.limit(5))

title,industry,release_year,imdb_rating,studio,budget,revenue,unit,currency,language
Pather Panchali,Bollywood,1955,8.3,Government of West Bengal,70000.0,100000.0,Thousands,INR,Bengali
Doctor Strange in the Multiverse of Madness,Hollywood,2022,7.0,Marvel Studios,200.0,954.8,Millions,USD,English
Thor: The Dark World,Hollywood,2013,6.8,Marvel Studios,165.0,644.8,Millions,USD,English
Thor: Ragnarok,Hollywood,2017,7.9,Marvel Studios,180.0,854.0,Millions,USD,English
Thor: Love and Thunder,Hollywood,2022,6.8,Marvel Studios,250.0,670.0,Millions,USD,English


In [None]:
%sql
-- List each distinct value of the studio column in the movies table.
SELECT DISTINCT studio
FROM workspace.default.movies

studio
Government of West Bengal
Marvel Studios
Castle Rock Entertainment
Warner Bros. Pictures
Columbia Pictures
Universal Pictures
Paramount Pictures
Liberty Films
20th Century Fox
Syncopy


### Partition by Key

In [None]:
# Repartition on a key column: rows are hash-partitioned by `studio` into
# 6 partitions, so rows sharing a studio land in the same partition
# (useful ahead of a groupBy/join on that key).
rep_by_key = df.repartition(6, "studio")

# Print the formatted physical plan to see the shuffle exchange this introduces.
rep_by_key.explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (7)
+- == Initial Plan ==
   ColumnarToRow (6)
   +- PhotonResultStage (5)
      +- PhotonShuffleExchangeSource (4)
         +- PhotonShuffleMapStage (3)
            +- PhotonShuffleExchangeSink (2)
               +- PhotonScan parquet workspace.default.movies (1)


(1) PhotonScan parquet workspace.default.movies
Output [10]: [title#11597, industry#11598, release_year#11599L, imdb_rating#11600, studio#11601, budget#11602, revenue#11603, unit#11604, currency#11605, language#11606]
Location: PreparedDeltaFileIndex [s3://dbstorage-prod-ftgok/uc/79a99d11-bc4e-43f0-a401-b12e20be6025/fba37f23-14f9-4927-9ade-961e9f768757/__unitystorage/catalogs/175a67df-9974-43f3-a33a-f690ccef30f2/tables/bd69b1e4-307f-4ceb-8369-e74cf80f21d7]
ReadSchema: struct<title:string,industry:string,release_year:bigint,imdb_rating:string,studio:string,budget:double,revenue:double,unit:string,currency:string,language:string>

(2) PhotonShuffleExchangeSink
Input [10]: [title#11597, in

### Partition in Round Robin Fashion

In [None]:
# 1) Repartition without a key: rows are spread over 6 partitions
#    via a round-robin shuffle.
rep_rr = df.repartition(numPartitions=6)

# Run an action so the repartitioned plan is actually executed.
rep_rr.count()

37

In [None]:
# Print the formatted physical plan of the key-less (round-robin) repartition.
rep_rr.explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (8)
+- == Initial Plan ==
   ColumnarToRow (7)
   +- PhotonResultStage (6)
      +- PhotonShuffleExchangeSource (5)
         +- PhotonShuffleMapStage (4)
            +- PhotonShuffleExchangeSink (3)
               +- PhotonSort (2)
                  +- PhotonScan parquet workspace.default.movies (1)


(1) PhotonScan parquet workspace.default.movies
Output [10]: [title#11314, industry#11315, release_year#11316L, imdb_rating#11317, studio#11318, budget#11319, revenue#11320, unit#11321, currency#11322, language#11323]
Location: PreparedDeltaFileIndex [s3://dbstorage-prod-ftgok/uc/79a99d11-bc4e-43f0-a401-b12e20be6025/fba37f23-14f9-4927-9ade-961e9f768757/__unitystorage/catalogs/175a67df-9974-43f3-a33a-f690ccef30f2/tables/bd69b1e4-307f-4ceb-8369-e74cf80f21d7]
ReadSchema: struct<title:string,industry:string,release_year:bigint,imdb_rating:string,studio:string,budget:double,revenue:double,unit:string,currency:string,language:string>

(2) PhotonSort
Input [

In [None]:
# Destination directory for the Parquet output of the repartitioned DataFrame.
out_path = "/Volumes/workspace/default/partition_demo/repartition_6"

# Write `rep_rr` as Parquet, replacing anything already at `out_path`.
rep_rr.write.parquet(out_path, mode="overwrite")

Repartitioning by key helps when several per-studio operations run on the same data, such as the aggregation and window ranking below.

In [None]:
from pyspark.sql import Window
# `F` is used below; import it in this cell so it does not depend on
# hidden kernel state when the notebook is run top to bottom.
from pyspark.sql import functions as F

# Shuffle the detailed rows by `studio` into 6 partitions; both results
# below are derived from this key-partitioned DataFrame.
# NOTE(review): `base` is not cached/persisted here, so whether separate
# actions on `agg` and `ranked` actually share a single shuffle is not
# shown by this cell — confirm with explain() before relying on it.
base = df.repartition(6, "studio")

# Per-studio average revenue (revenue explicitly cast to double).
agg    = base.groupBy("studio").agg(F.avg(F.col("revenue").cast("double")))

# Per-studio rank of each movie, highest revenue first (rnk = 1).
# NOTE(review): `revenue` is compared as a raw number; the table also has
# `unit` and `currency` columns, so values in different units/currencies
# are ranked (and averaged above) without normalization — verify intent.
ranked = base.withColumn("rnk", F.row_number().over(Window.partitionBy("studio").orderBy(F.desc("revenue"))))

In [None]:
display(ranked)

title,industry,release_year,imdb_rating,studio,budget,revenue,unit,currency,language,rnk
Taare Zameen Par,Bollywood,2007,8.3,,120.0,1350.0,Millions,INR,Hindi,1
Parasite,Hollywood,2019,8.5,,15.5,263.1,Millions,USD,English,2
Bajirao Mastani,Bollywood,2015,7.2,,1.4,3.5,Billions,INR,Hindi,3
Avengers: Endgame,Hollywood,2019,8.4,Marvel Studios,400.0,2798.0,Millions,USD,English,1
Avengers: Infinity War,Hollywood,2018,8.4,Marvel Studios,400.0,2048.0,Millions,USD,English,2
Doctor Strange in the Multiverse of Madness,Hollywood,2022,7.0,Marvel Studios,200.0,954.8,Millions,USD,English,3
Thor: Ragnarok,Hollywood,2017,7.9,Marvel Studios,180.0,854.0,Millions,USD,English,4
Captain America: The Winter Soldier,Hollywood,2014,7.8,Marvel Studios,177.0,714.4,Millions,USD,English,5
Thor: Love and Thunder,Hollywood,2022,6.8,Marvel Studios,250.0,670.0,Millions,USD,English,6
Thor: The Dark World,Hollywood,2013,6.8,Marvel Studios,165.0,644.8,Millions,USD,English,7


In [None]:
display(agg)

studio,avg(CAST(revenue AS DOUBLE))
Marvel Studios,1131.825
Warner Bros. Pictures,701.8
Syncopy,1006.0
,538.8666666666667
Castle Rock Entertainment,73.3
Columbia Pictures,307.1
Dharma Productions,1155.0
Zee Studios,3409.0
Liberty Films,3.3
Vinod Chopra Films,4181.966666666666
