In [0]:
from pyspark.sql.functions import *
from pyspark.sql import *

In [0]:
# Read the flight CSV: first row is the header, column types are inferred,
# and PERMISSIVE mode is used for parsing.
df = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true", mode="PERMISSIVE")
    .load("/Volumes/workspace/default/rajatlearningdata/PySpark_data/flight_csv_data.csv")
)

# Preview the first 3 rows, then print the inferred schema.
df.show(3)
df.printSchema()

In [0]:
# Total number of rows in the loaded DataFrame.
df.count()

Let's find the number of partitions in our data

In [0]:
# Ask the underlying RDD for its partition count. Some compute types
# (e.g. Databricks Serverless) block RDD APIs, so the failure is caught and
# displayed instead of aborting a "Run All" of the notebook.
try:
    print(df.rdd.getNumPartitions())
except Exception as exc:  # broad on purpose: the exact exception class varies by runtime
    print(f"df.rdd.getNumPartitions() is not available here -> {type(exc).__name__}: {exc}")

- The error above is expected behavior on Databricks Serverless compute; on non-serverless compute the same call works fine.
- On Serverless, RDD APIs are intentionally blocked, even for simple calls like getNumPartitions().
- What Databricks Serverless does:  
❌ Disables all RDD APIs  
❌ No .rdd, .map(), .collect() on RDD  
✅ Allows DataFrame / SQL APIs only

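But to find the number of partitions we can try an indirect approach: tag each row with `spark_partition_id()` and count the distinct IDs. Note that this only counts partitions that contain at least one row.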

In [0]:
from pyspark.sql.functions import spark_partition_id

# Tag every row with the ID of the partition it sits in and count the
# distinct IDs. Partitions without rows cannot appear in this count.
(
    df
    .select(spark_partition_id())
    .distinct()
    .count()
)


Now let's repartition the data into 4 partitions

In [0]:
# Redistribute the rows across 4 partitions.
partitioned_df = df.repartition(4)

# Number of (non-empty) partitions after repartitioning.
(
    partitioned_df
    .select(spark_partition_id())
    .distinct()
    .count()
)


Let's see how much data went to each partition

In [0]:
# Rows per partition: label each row with its partition ID, then group on it.
(
    partitioned_df
    .withColumn("partition_ID", spark_partition_id())
    .groupBy("partition_ID")
    .count()
    .show()
)

Let's repartition on a column: repartition(no_of_partitions, "col_Name")  ---OR--- repartition("col_Name")

In [0]:
# Repartition by column: request 300 partitions keyed on ORIGIN_COUNTRY_NAME,
# so rows sharing a country value end up in the same partition.
column_partition_df1 = df.repartition(
    300,
    "ORIGIN_COUNTRY_NAME",
)


In [0]:
# Number of partitions that actually hold rows (empty ones are not visible
# to spark_partition_id()).
(
    column_partition_df1
    .select(spark_partition_id())
    .distinct()
    .count()
)

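- Above, the count shows only 100 partitions instead of 300. The most likely reason is the counting method rather than Serverless compute: `spark_partition_id()` is evaluated per row, so empty partitions never appear in the distinct count, and rows with the same ORIGIN_COUNTRY_NAME always hash to the same partition.
- On non-serverless compute, `df.rdd.getNumPartitions()` would report all 300 partitions, with some partitions empty (because we have only 256 records).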

Let's see how many records are in each partition

In [0]:
# Rows per populated partition after the column-based repartition.
(
    column_partition_df1
    .withColumn("partition_ID", spark_partition_id())
    .groupBy("partition_ID")
    .count()
    .show()
)

### Coalesce()

In [0]:
# Create 8 partitions.
df2 = df.repartition(8)

# A notebook cell only echoes its last expression, so the count must be
# printed explicitly; otherwise it is computed and silently discarded.
print(df2.select(spark_partition_id()).distinct().count())

# Data distribution in each partition.
df2.withColumn("partition_id", spark_partition_id()).groupBy("partition_id").count().show()


With coalesce(), the data distribution across partitions might not be equal

In [0]:
# Now coalesce the 8 partitions of df2 down to only 3 partitions.
df3 = df2.coalesce(3)

# A notebook cell only echoes its last expression, so the count must be
# printed explicitly; otherwise it is computed and silently discarded.
print(df3.select(spark_partition_id()).distinct().count())

# Data distribution in each partition.
df3.withColumn("partition_id", spark_partition_id()).groupBy("partition_id").count().show()