In [1]:
import os

# Set SPARK_HOME and JAVA_HOME environment variables
os.environ['SPARK_HOME'] = '/usr/local/Cellar/apache-spark/3.5.1/libexec'
os.environ['JAVA_HOME'] = '/usr/local/opt/openjdk/libexec/openjdk.jdk/Contents/Home'

In [2]:
# Partial Bucket Sort

from pyspark.sql.functions import col, udf, row_number
from pyspark.sql.types import IntegerType
from pyspark.sql import SparkSession
from pyspark.sql.window import Window

# Create a SparkSession
spark = SparkSession.builder \
    .appName("Sample Spark Program") \
    .getOrCreate()

# Data with two columns
data = [(29, 'A'), (25, 'B'), (3, 'C'), (49, 'A'), (9, 'B'), (37, 'C'), (21, 'A'), (43, 'B')]
df = spark.createDataFrame(data, ["value", "category"])

# Number of buckets - this can be parameterized or set based on your requirement
num_buckets = 5

# Calculate the maximum value from the DataFrame
max_value = df.agg({"value": "max"}).collect()[0][0]

# Example function to assign buckets
def assign_bucket(value, num_buckets, max_value):
    size = max_value / num_buckets
    return int(value / size)

# Register UDF with the computed max_value
bucket_udf = udf(lambda x: assign_bucket(x, num_buckets, max_value), IntegerType())

# Adding a bucket column
df = df.withColumn("bucket", bucket_udf(col("value")))

# Repartition based on the bucket column
df = df.repartition("bucket")

# Sort within each partition
# df = df.sortWithinPartitions("value")

# Define window specification
window_spec = Window.partitionBy("bucket").orderBy("value")

# Apply a window function like row_number to add a sequential row number within each partition
df = df.withColumn("row_number", row_number().over(window_spec))

# Collecting results to show
sorted_df = df.drop("bucket").collect()

# Print the results
for row in sorted_df:
    print(row)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/01 15:43:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/01 15:43:54 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.

Row(value=3, category='C', row_number=1)
Row(value=9, category='B', row_number=2)
Row(value=21, category='A', row_number=1)
Row(value=25, category='B', row_number=2)
Row(value=29, category='A', row_number=3)
Row(value=37, category='C', row_number=1)
Row(value=43, category='B', row_number=1)
Row(value=49, category='A', row_number=1)


                                                                                

24/08/01 15:44:08 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
