- **Name:** 19_pyspark_udfs
- **Author:** Shamas Imran
- **Description:** Creating and applying PySpark UDFs
- **Date:** 19-Aug-2025
<!--
REVISION HISTORY
Version          Date        Author           Description
01           19-Aug-2025   Shamas Imran       Defined user-defined functions in PySpark  
                                              Registered UDFs with Spark SQL  
                                              Applied UDFs in DataFrame transformations  
-->

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, IntegerType
import pandas as pd

# getOrCreate() returns the active session if one exists; otherwise it builds
# a new one with the application name "PySparkUDFs".
spark = (
    SparkSession.builder
    .appName("PySparkUDFs")
    .getOrCreate()
)

In [0]:
# Example function: categorize score
def categorize_score(score):
    """Map a numeric score to a category label.

    Args:
        score: The score to classify, or ``None`` for a missing value
            (Spark passes SQL NULL to a Python UDF as ``None``).

    Returns:
        ``"Excellent"`` for scores of 90 or more, ``"Good"`` for scores of
        75 or more, ``"Average"`` for anything lower, and ``None`` when
        ``score`` is ``None``.
    """
    # Comparing None with an int raises TypeError in Python 3, which would
    # fail the whole Spark task; propagate the missing value instead.
    if score is None:
        return None
    if score >= 90:
        return "Excellent"
    if score >= 75:
        return "Good"
    return "Average"

# Wrap the plain Python function as a Spark UDF that returns strings.
# categorize_udf is an ordinary Python variable holding the UDF object.
categorize_udf = F.udf(f=categorize_score, returnType=StringType())

# Three-row demo DataFrame with columns "id" and "score".
df = spark.createDataFrame(
    data=[(1, 95), (2, 80), (3, 60)],
    schema=["id", "score"],
)

# Add a "category" column computed by the UDF and print the result.
df.withColumn(colName="category", col=categorize_udf(F.col("score"))).show()


In [0]:
# ---------------------------------------------------------
# Using UDFs in Spark SQL
# ---------------------------------------------------------

# Expose the DataFrame to SQL under the temporary view name "scores".
df.createOrReplaceTempView("scores")

# Make the Python function callable from SQL as categorize_sql(...).
spark.udf.register(
    name="categorize_sql",
    f=categorize_score,
    returnType=StringType(),
)

spark.sql("""
    SELECT id, score, categorize_sql(score) AS category
    FROM scores
""").show()


In [0]:
# ---------------------------------------------------------
# Regular UDFs are slow because:
#   - Each row moves from JVM -> Python -> JVM (serialization overhead)
#   - They are opaque to Spark's Catalyst optimizer (no predicate pushdown, no pruning)
# Best Practice: Always try Spark built-in functions first (F.when, F.concat, F.coalesce)
# ---------------------------------------------------------

# Same three-way classification written with native column expressions,
# so no Python function is involved.
df.withColumn(
    "category_builtin",
    F.when(F.col("score") >= 90, F.lit("Excellent"))
    .when(F.col("score") >= 75, F.lit("Good"))
    .otherwise(F.lit("Average")),
).show()


In [0]:
# ---------------------------------------------------------
# Pandas UDFs (a.k.a. Vectorized UDFs)
# - Introduced to overcome performance issues of regular UDFs
# - Work on batches (vectorized) instead of row-by-row
# - Use Apache Arrow for efficient data exchange
# ---------------------------------------------------------

from pyspark.sql.functions import pandas_udf

# Define Pandas UDF
@pandas_udf(StringType())
def categorize_pandas_udf(score: pd.Series) -> pd.Series:
    """Classify a batch of scores into category labels.

    Args:
        score: One batch of the input column as a pandas Series.

    Returns:
        A Series with the same index holding ``"Excellent"`` (score >= 90),
        ``"Good"`` (score >= 75) or ``"Average"`` (anything lower). Missing
        scores stay missing instead of being labelled ``"Average"``.
    """
    # Whole-Series comparisons keep the work vectorized; Series.apply with a
    # Python lambda would fall back to one Python call per element.
    labels = pd.Series("Average", index=score.index, dtype=object)
    labels = labels.mask(score >= 75, "Good").mask(score >= 90, "Excellent")
    # Comparisons against a missing value are False, so without this step a
    # missing score would end up as "Average".
    return labels.where(score.notna())

# Add a "category_pandas" column computed by the Pandas UDF and print it.
df.withColumn(
    colName="category_pandas",
    col=categorize_pandas_udf(F.col("score")),
).show()


In [0]:
# ---------------------------------------------------------
# Best Practices:
#   - Prefer built-in Spark SQL functions (optimized in Catalyst)
#   - Use regular UDFs only if no built-in function exists
#   - For heavy computation, prefer Pandas UDFs (vectorized)
#   - Test performance; explain() shows the extra Python evaluation step a UDF adds to the plan
# ---------------------------------------------------------

# Print the plan for the built-in expression version...
df.withColumn(
    "category_builtin",
    F.when(F.col("score") >= 90, F.lit("Excellent"))
    .when(F.col("score") >= 75, F.lit("Good"))
    .otherwise(F.lit("Average")),
).explain()

# ...and the plan for the Pandas UDF version, for side-by-side comparison.
df.withColumn(
    colName="category_pandas",
    col=categorize_pandas_udf(F.col("score")),
).explain()
