In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Build (or reuse) the SparkSession for this notebook under the app name "FunctionsCh05".
spark = SparkSession.builder.appName("FunctionsCh05").getOrCreate()

# User-Defined Functions
Spark allows engineers to build their own functions, known as User-Defined Functions (UDFs).

## Spark SQL UDFs

In [3]:
### First create a plain Python function
def cubed(s):
    """Return ``s`` multiplied by itself three times (``s * s * s``)."""
    squared = s * s
    return squared * s

In [4]:
### Register the function as a SQL-callable UDF named "cubed" that returns a LongType.
### The registration only lasts for this Spark session.
spark.udf.register(name="cubed", f=cubed, returnType=LongType())

<function __main__.cubed(s)>

In [5]:
### Create a temp view "udf_test" with ids 1 through 8 (the end bound 9 is exclusive).
spark.range(start=1, end=9).createOrReplaceTempView("udf_test")

In [6]:
### The registered UDF can now be called like any SQL function inside a SELECT.
spark.sql(
    "SELECT id, cubed(id) AS id_cubed FROM udf_test"
).show()

+---+--------+
| id|id_cubed|
+---+--------+
|  1|       1|
|  2|       8|
|  3|      27|
|  4|      64|
|  5|     125|
|  6|     216|
|  7|     343|
|  8|     512|
+---+--------+



## Pandas UDFs
Downside of regular PySpark UDFs: they are expensive/slow because they process one row at a time. Pandas UDFs (vectorized UDFs) resolve this by using Apache Arrow to transfer data and pandas to operate on whole batches at once.

In [7]:
import pandas as pd

### Requirement: pip install pyarrow

In [8]:
### Again define our function, this time operating on a whole pandas Series at once.
### NOTE: this rebinds the name `cubed` that the earlier scalar version also uses.
def cubed(a: pd.Series) -> pd.Series:
    """Return the element-wise cube of ``a`` (``a * a * a``)."""
    a_squared = a * a
    return a_squared * a

In [9]:
### Wrap the cubed function as a pandas UDF that returns a LongType column.
cubed_udf = pandas_udf(f=cubed, returnType=LongType())

#### Here we can create a series with pandas and execute our function with that local pandas data

In [10]:
### Build a small local pandas Series
x = pd.Series([1, 2, 3])

### Call the plain Python function directly on the local data (no Spark involved)
print(cubed(x))

0     1
1     8
2    27
dtype: int64


#### Now let's execute this function with a Spark DataFrame.

In [11]:
### Create a Spark DataFrame with ids 1 through 3 (the end bound 4 is exclusive)
df = spark.range(start=1, end=4)

In [12]:
### Apply the pandas UDF to the "id" column through Spark and show both columns
df.select(
    "id",
    cubed_udf(col("id")),
).show()

+---+---------+
| id|cubed(id)|
+---+---------+
|  1|        1|
|  2|        8|
|  3|       27|
+---+---------+



The Spark UI (http://localhost:4040/jobs/) shows that a Spark job was actually created and executed to run this calculation.

# Higher-Order Functions
Higher-order functions take anonymous lambda functions as arguments.

In [13]:
### Sample data for the examples below: two rows, each holding one array of
### integer Celsius readings in a single "celsius" column.
schema = StructType([
    StructField("celsius", ArrayType(IntegerType())),
])

# Each row is a one-element list whose only value is the array for "celsius".
t_list = (
    [[35, 36, 32, 30, 40, 42, 38]],
    [[31, 32, 34, 55, 56]],
)
t_c = spark.createDataFrame(t_list, schema)
t_c.createOrReplaceTempView("tC")

In [14]:
### Display the sample rows; long arrays are truncated in the printed table.
t_c.show()

+--------------------+
|             celsius|
+--------------------+
|[35, 36, 32, 30, ...|
|[31, 32, 34, 55, 56]|
+--------------------+



### transform()
Produces an array by applying a function to each element of the input array

In [15]:
### Calculate Fahrenheit from Celsius for an array of temperatures.
### The lambda is applied to every element of `celsius`; `div` is integer
### division, so each converted value is a whole number of degrees.
spark.sql("""SELECT celsius,
                transform(celsius, t -> ((t * 9) div 5) + 32) as fahrenheit
             FROM tC""").show()

+--------------------+--------------------+
|             celsius|          fahrenheit|
+--------------------+--------------------+
|[35, 36, 32, 30, ...|[95, 96, 89, 86, ...|
|[31, 32, 34, 55, 56]|[87, 89, 93, 131,...|
+--------------------+--------------------+



### filter()
Produces an array consisting of only the elements of the input array for which the Boolean function is true

In [16]:
### Filter temperatures > 38C for array of temperatures.
### Only elements strictly greater than 38 are kept in the `high` array.
spark.sql("""SELECT celsius,
                filter(celsius, t -> t > 38) as high
             FROM tC""").show()

+--------------------+--------+
|             celsius|    high|
+--------------------+--------+
|[35, 36, 32, 30, ...|[40, 42]|
|[31, 32, 34, 55, 56]|[55, 56]|
+--------------------+--------+



### exists()
Returns true if the Boolean function holds for any element in the input array

In [17]:
### Is there a temperature of 38C in the array of temperatures?
### `threshold` is true when at least one element of `celsius` equals 38.
spark.sql("""SELECT celsius,
                exists(celsius, t -> t = 38) as threshold
             FROM tC""").show()

+--------------------+---------+
|             celsius|threshold|
+--------------------+---------+
|[35, 36, 32, 30, ...|     true|
|[31, 32, 34, 55, 56]|    false|
+--------------------+---------+



### aggregate()
Reduces the elements of the array into a single value. Originally the reduce() function.

In [25]:
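### NOTE(review): functools is not referenced in any visible cell, and the SQL aggregate() call below does not use it; confirm whether this import is still needed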
import functools

In [27]:
### Calculate the average temperature per row and convert it to Fahrenheit.
### aggregate(array, start, merge, finish):
###   - start is the initial accumulator value (0)
###   - merge receives (accumulator, element), so the lambda parameters are named in that order
###   - finish turns the final sum into the average and converts it to Fahrenheit
### All arithmetic uses `div` (integer division), so the average is truncated
### to a whole number before the conversion.
spark.sql("""SELECT celsius,
                aggregate(
                celsius,
                0,
                (acc, t) -> acc + t,
                acc -> (acc div size(celsius) * 9 div 5) + 32
                ) as avgFahrenheit
            FROM tC""").show()

+--------------------+-------------+
|             celsius|avgFahrenheit|
+--------------------+-------------+
|[35, 36, 32, 30, ...|           96|
|[31, 32, 34, 55, 56]|          105|
+--------------------+-------------+



# Common DataFrames and Spark SQL Operations

In [28]:
###pg 144
### Stop the SparkSession created at the top of the notebook.
spark.stop()