In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
spark = (SparkSession
        .builder
        .appName("FunctionsCh05")
        .getOrCreate())

# User-Defined Functions
Spark allows for engineers to build own functions aka User-Defined Functions (UDFs)

## Spark SQL UDFs

In [3]:
### First create a function
def cubed(s):
    return s * s * s    

In [4]:
### Then we'll register this UDF with the session. Will persist only for this session.
spark.udf.register("cubed", cubed, LongType())

<function __main__.cubed(s)>

In [5]:
### Create temp view with range 1-9
spark.range(1,9).createOrReplaceTempView("udf_test")

In [6]:
### Now can query and even run UDF within a select statement.
spark.sql("SELECT id, cubed(id) AS id_cubed FROM udf_test").show()

+---+--------+
| id|id_cubed|
+---+--------+
|  1|       1|
|  2|       8|
|  3|      27|
|  4|      64|
|  5|     125|
|  6|     216|
|  7|     343|
|  8|     512|
+---+--------+



## Pandas UDFs
Downside to PySpark UDFs: Very expensive/slow b/c they run one row at a time. Resolved by introducing Pandas UDFs that run on Apache Arrow.

In [7]:
import pandas as pd

### Requirement: pip install pyarrow

In [8]:
### Again define our function
def cubed(a: pd.Series) -> pd.Series:
    return a * a * a

In [9]:
### Create our pandas DF with our cubed function.
cubed_udf = pandas_udf(cubed, returnType=LongType())

#### Here we can create a series with pandas and execute our function with that local pandas data

In [10]:
### Create the series
x = pd.Series([1,2,3])

### Execute our function
print(cubed(x))

0     1
1     8
2    27
dtype: int64


#### Now let's execute this function with a Spark DataFrame.

In [11]:
### Create the series
df = spark.range(1,4)

In [12]:
### Execute with Spark
df.select("id", cubed_udf(col("id"))).show()

+---+---------+
| id|cubed(id)|
+---+---------+
|  1|        1|
|  2|        8|
|  3|       27|
+---+---------+



Can see with the Spark UI http://localhost:4040/jobs/ that we actually created and executed a spark job to run this calc.

# Higher-Order Functions
Take anonymous lambda functions as arguments

In [13]:
### Create sample data set so we can run some examples
schema = StructType([StructField("celsius", ArrayType(IntegerType()))])

t_list = [[35, 36, 32, 30, 40, 42, 38]], [[31, 32, 34, 55, 56]]
t_c = spark.createDataFrame(t_list, schema)
t_c.createOrReplaceTempView("tC")

In [14]:
t_c.show()

+--------------------+
|             celsius|
+--------------------+
|[35, 36, 32, 30, ...|
|[31, 32, 34, 55, 56]|
+--------------------+



### transform()
Produces an array by applying a function to each element of the input array

In [15]:
### Calculate Fahrenheit from Celsius for an array of temperatures
spark.sql("""SELECT celsius,
                transform(celsius, t -> ((t * 9) div 5) + 32) as fahrenheit
             FROM tC""").show()

+--------------------+--------------------+
|             celsius|          fahrenheit|
+--------------------+--------------------+
|[35, 36, 32, 30, ...|[95, 96, 89, 86, ...|
|[31, 32, 34, 55, 56]|[87, 89, 93, 131,...|
+--------------------+--------------------+



### filter()
Produces an array consisting of only the elements of the input array for which the Boolean function is true

In [16]:
### Filter temperatures > 38C for array of temperatures
spark.sql("""SELECT celsius,
                filter(celsius, t -> t > 38) as high
             FROM tC""").show()

+--------------------+--------+
|             celsius|    high|
+--------------------+--------+
|[35, 36, 32, 30, ...|[40, 42]|
|[31, 32, 34, 55, 56]|[55, 56]|
+--------------------+--------+



### exists()
Returns true if the Boolean function holds for any element in the input array

In [17]:
### Is there a temperature of 38C in the array of temperatures?
spark.sql("""SELECT celsius,
                exists(celsius, t -> t = 38) as threshold
             FROM tC""").show()

+--------------------+---------+
|             celsius|threshold|
+--------------------+---------+
|[35, 36, 32, 30, ...|     true|
|[31, 32, 34, 55, 56]|    false|
+--------------------+---------+



## aggregate()
Reduces the elements of the array into a single value. Originally the reduce() function.

In [18]:
### requirement
import functools

In [19]:
### Calculate average temperature and convert to F
spark.sql("""SELECT celsius,
                aggregate(
                celsius,
                0,
                (t, acc) -> t + acc,
                acc -> (acc div size(celsius) * 9 div 5) + 32
                ) as avgFahrenheit
            FROM tC""").show()

+--------------------+-------------+
|             celsius|avgFahrenheit|
+--------------------+-------------+
|[35, 36, 32, 30, ...|           96|
|[31, 32, 34, 55, 56]|          105|
+--------------------+-------------+



# Common DataFrames and Spark SQL Operations

In [24]:
tripdelaysFilePath = "C:/Users/sean.cornillie/Education/LearningSparkV2/Spark_Dev/datasets/departuredelays.csv"
airportsnaFilePath = "C:/Users/sean.cornillie/Education/LearningSparkV2/Spark_Dev/datasets/airport-codes-na.txt"

In [25]:
airportsna = (spark.read
                  .format("csv")
                  .options(header="true", inferSchema="true", sep="\t")
                  .load(airportsnaFilePath))

airportsna.createOrReplaceTempView("airports_na")

In [27]:
departureDelays = (spark.read
                      .format("csv")
                      .options(header="true")
                      .load(tripdelaysFilePath))

departureDalays = (departureDelays
                      .withColumn("delay", expr("CAST(delay as INT) as delay"))
                      .withColumn("distance", expr("CAST(distance as INT) as distance")))

departureDelays.createOrReplaceTempView("departureDelays")

In [28]:
foo = (departureDelays
          .filter(expr("""origin == 'SEA'
                          and destination == 'SFO'
                          and date like '01010%'
                          and delay > 0""")))

foo.createOrReplaceTempView("foo")

In [29]:
spark.sql("SELECT * FROM airports_na LIMIT 10").show()

+-----------+-----+-------+----+
|       City|State|Country|IATA|
+-----------+-----+-------+----+
| Abbotsford|   BC| Canada| YXX|
|   Aberdeen|   SD|    USA| ABR|
|    Abilene|   TX|    USA| ABI|
|      Akron|   OH|    USA| CAK|
|    Alamosa|   CO|    USA| ALS|
|     Albany|   GA|    USA| ABY|
|     Albany|   NY|    USA| ALB|
|Albuquerque|   NM|    USA| ABQ|
| Alexandria|   LA|    USA| AEX|
|  Allentown|   PA|    USA| ABE|
+-----------+-----+-------+----+



In [30]:
spark.sql("SELECT * FROM departureDelays LIMIT 10").show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
|01031245|   -4|     602|   ABE|        ATL|
|01030605|    0|     602|   ABE|        ATL|
|01041243|   10|     602|   ABE|        ATL|
|01040605|   28|     602|   ABE|        ATL|
|01051245|   88|     602|   ABE|        ATL|
|01050605|    9|     602|   ABE|        ATL|
+--------+-----+--------+------+-----------+



In [31]:
spark.sql("SELECT * FROM foo").show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
+--------+-----+--------+------+-----------+



### Unions
Can accomplish with both dataframe or sql syntax

In [32]:
bar = departureDelays.union(foo)
bar.createOrReplaceTempView("bar")

In [33]:
bar.filter(expr("""origin == 'SEA'
                    and destination == 'SFO'
                    and date LIKE '01010%'
                    and delay > 0""")).show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
+--------+-----+--------+------+-----------+



In [40]:
spark.sql("""SELECT *
             FROM bar
             WHERE 1=1
                 and origin = 'SEA'
                 and destination = 'SFO'
                 and date LIKE '01010%'
                 and delay > 0""").show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
+--------+-----+--------+------+-----------+



### Joins
Again can accomplish with both df & sql syntax

In [43]:
foo.join(airportsna, airportsna.IATA == foo.origin
        ).select("City", "State", "date", "delay", "distance", "destination").show()

+-------+-----+--------+-----+--------+-----------+
|   City|State|    date|delay|distance|destination|
+-------+-----+--------+-----+--------+-----------+
|Seattle|   WA|01010710|   31|     590|        SFO|
|Seattle|   WA|01010955|  104|     590|        SFO|
|Seattle|   WA|01010730|    5|     590|        SFO|
+-------+-----+--------+-----+--------+-----------+



In [41]:
spark.sql("""
SELECT a.City, a.State, f.date, f.delay, f.distance, f.destination
FROM foo f
JOIN airports_na a
ON a.IATA = f.origin
""").show()

+-------+-----+--------+-----+--------+-----------+
|   City|State|    date|delay|distance|destination|
+-------+-----+--------+-----+--------+-----------+
|Seattle|   WA|01010710|   31|     590|        SFO|
|Seattle|   WA|01010955|  104|     590|        SFO|
|Seattle|   WA|01010730|    5|     590|        SFO|
+-------+-----+--------+-----+--------+-----------+



### Window Functions

In [50]:
spark.sql("""SELECT date
                ,delay
                ,distance
                , origin
                ,destination
                ,rank
             FROM (
                 SELECT date
                     ,delay
                     ,distance
                     ,origin
                     ,destination
                     ,dense_rank() OVER (
                                     PARTITION BY origin
                                     ORDER BY delay DESC) as rank
                 FROM bar
                 WHERE 1=1
                     and delay > 0) t
             WHERE 1=1
                 and rank = 1
                 """).show(10)

+--------+-----+--------+------+-----------+----+
|    date|delay|distance|origin|destination|rank|
+--------+-----+--------+------+-----------+----+
|01261305|    9|     311|   BTM|        SLC|   1|
|02011305|    9|     311|   BTM|        SLC|   1|
|02062025|   99|     710|   BUR|        PDX|   1|
|01051715|   98|    1064|   CAK|        DEN|   1|
|01081337|   93|     139|   CDV|        ANC|   1|
|01071710|   97|      48|   CEC|        ACV|   1|
|01041225|   98|     397|   CHO|        ATL|   1|
|02070921|   98|     602|   CID|        DEN|   1|
|01070840|   99|     420|   CMH|        JFK|   1|
|01161525|   99|     416|   CMH|        LGA|   1|
+--------+-----+--------+------+-----------+----+
only showing top 10 rows



### Modifications to DataFrames
Remember that DFs are immutable, but we can create a new DF with changes that we need.

#### Adding Columns
Add a column designating on-time/delay status to foo table

In [52]:
foo2 = (foo.withColumn(
            "status",
            expr("CASE WHEN delay <= 10 THEN 'On-time' ELSE 'Delayed' END")
        ))

foo2.show()

+--------+-----+--------+------+-----------+-------+
|    date|delay|distance|origin|destination| status|
+--------+-----+--------+------+-----------+-------+
|01010710|   31|     590|   SEA|        SFO|Delayed|
|01010955|  104|     590|   SEA|        SFO|Delayed|
|01010730|    5|     590|   SEA|        SFO|On-time|
+--------+-----+--------+------+-----------+-------+



#### Dropping Columns
No longer need delay length column, let's drop it.

In [53]:
foo3 = foo2.drop("delay")
foo3.show()

+--------+--------+------+-----------+-------+
|    date|distance|origin|destination| status|
+--------+--------+------+-----------+-------+
|01010710|     590|   SEA|        SFO|Delayed|
|01010955|     590|   SEA|        SFO|Delayed|
|01010730|     590|   SEA|        SFO|On-time|
+--------+--------+------+-----------+-------+



#### Renaming Columns

In [54]:
foo4 = foo3.withColumnRenamed("status", "flight_status")
foo4.show()

+--------+--------+------+-----------+-------------+
|    date|distance|origin|destination|flight_status|
+--------+--------+------+-----------+-------------+
|01010710|     590|   SEA|        SFO|      Delayed|
|01010955|     590|   SEA|        SFO|      Delayed|
|01010730|     590|   SEA|        SFO|      On-time|
+--------+--------+------+-----------+-------------+



In [55]:
spark.stop()