# PYTHON

PANDAS UDF

In [7]:
import pandas as pd

In [8]:
from pyspark.sql.functions import col, pandas_udf

In [9]:
from pyspark.sql.types import LongType

In [10]:
def cubed(a: pd.Series) -> pd.Series:
    """Return the element-wise cube of a pandas Series.

    Args:
        a: Series of numeric values.

    Returns:
        A Series of the same length where each element is multiplied by
        itself twice (``v * v * v``).
    """
    squared = a * a
    return squared * a

In [11]:
# Wrap the pandas-level `cubed` function as a pandas UDF whose result column is LongType.
cubed_udf = pandas_udf(f=cubed, returnType=LongType())

In [12]:
# Small local pandas Series used to try `cubed` without Spark.
x = pd.Series(data=[1, 2, 3])

In [14]:
# Call the plain pandas function directly on the local Series and print the result.
print(cubed(x))

0     1
1     8
2    27
dtype: int64


In [15]:
# Single-column ("id") Spark DataFrame holding 1, 2, 3 (the end bound is exclusive).
df = spark.range(start=1, end=4)

In [16]:
# Apply the pandas UDF to the "id" column and show it next to the original values.
(
    df
    .select("id", cubed_udf(col("id")))
    .show()
)

+---+---------+
| id|cubed(id)|
+---+---------+
|  1|        1|
|  2|        8|
|  3|       27|
+---+---------+



HIGHER-ORDER FUNCTIONS

In [17]:
from pyspark.sql.types import *

In [18]:
# One-column schema: "celsius" is an array of integers.
schema = StructType(
    [
        StructField("celsius", ArrayType(IntegerType())),
    ]
)

In [19]:
# Two rows; each row is a one-element list whose single value is that row's
# array of readings. The outer container is a tuple of rows.
t_list = (
    [[35, 36, 32, 30, 40, 42, 38]],
    [[31, 32, 34, 55, 56]],
)

In [20]:
# Build the Spark DataFrame of readings from the local rows and the explicit schema.
t_c = spark.createDataFrame(data=t_list, schema=schema)

In [21]:
# Register the DataFrame as the temporary view "tC" so the SQL queries below can
# refer to it by name (replaces any existing view with the same name).
t_c.createOrReplaceTempView("tC")

In [24]:
# Display up to 10 rows without truncating the array column.
t_c.show(n=10, truncate=False)

+----------------------------+
|celsius                     |
+----------------------------+
|[35, 36, 32, 30, 40, 42, 38]|
|[31, 32, 34, 55, 56]        |
+----------------------------+



In [26]:
# transform(): convert every Celsius reading to Fahrenheit. `div` is integer
# division, so the converted values are truncated (e.g. 32 -> 89).
spark.sql(
    """SELECT celsius, transform(celsius, t -> ((t * 9) div 5) + 32) as fahrenheit FROM tC"""
).show(n=10, truncate=False)

+----------------------------+-------------------------------+
|celsius                     |fahrenheit                     |
+----------------------------+-------------------------------+
|[35, 36, 32, 30, 40, 42, 38]|[95, 96, 89, 86, 104, 107, 100]|
|[31, 32, 34, 55, 56]        |[87, 89, 93, 131, 132]         |
+----------------------------+-------------------------------+



In [27]:
# filter(): keep only the readings strictly greater than 38 in each array.
spark.sql(
    """SELECT celsius, filter(celsius, t -> t > 38) as high FROM tC"""
).show(n=10, truncate=False)

+----------------------------+--------+
|celsius                     |high    |
+----------------------------+--------+
|[35, 36, 32, 30, 40, 42, 38]|[40, 42]|
|[31, 32, 34, 55, 56]        |[55, 56]|
+----------------------------+--------+



In [29]:
# exists(): true when at least one reading in the array equals 38.
spark.sql(
    """SELECT celsius, exists(celsius, t -> t = 38) as threshold FROM tC"""
).show(n=10, truncate=False)

+----------------------------+---------+
|celsius                     |threshold|
+----------------------------+---------+
|[35, 36, 32, 30, 40, 42, 38]|true     |
|[31, 32, 34, 55, 56]        |false    |
+----------------------------+---------+



In [31]:
# NOTE(review): this cell is disabled. In Spark SQL, `reduce` is only available as
# an alias of `aggregate` in newer releases (3.4+), which is presumably why the
# query was commented out -- confirm the Spark version, or use `aggregate` when
# re-enabling it.
# Also note that the merge lambda receives the accumulator first and the element
# second, so the names in `(t, acc)` are swapped (harmless here because addition
# is commutative).
# spark.sql("""SELECT celsius, reduce(celsius, 0, (t, acc) -> t + acc, acc -> (acc div size(celsius) * 9 div 5) + 32) as avgFahrenheit FROM tC""").show(10, False)