# User Defined Functions

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Get the active Spark session, or create one whose application name is "udf".
spark = (
    SparkSession.builder
    .appName("udf")
    .getOrCreate()
)

In [32]:
# Sample orders, one tuple per row: (name, datetime, price, currency).
transactions = [
    ("찹쌀탕수육+짜장2", "2021-11-07 13:20:00", 22000, "KRW"),
    ("등심탕수육+크립새우+짜장면", "2021-10-24 11:19:00", 21500, "KRW"),
    ("월남 쌈 2인 세트", "2021-07-25 11:12:40", 42000, "KRW"),
    ("콩국수+열무비빔국수", "2021-07-10 08:20:00", 21250, "KRW"),
    ("장어소금+고추장구이", "2021-07-01 05:36:00", 68700, "KRW"),
    ("족발", "2020-08-19 19:04:00", 32000, "KRW"),
]

# Only column names are given here; note that `datetime` is held as a string.
schema = ["name", "datetime", "price", "currency"]
df = spark.createDataFrame(transactions, schema)

In [33]:
# Expose the DataFrame to Spark SQL under the view name "transactions".
df.createOrReplaceTempView(name="transactions")

In [42]:
# Display every row of the temp view to confirm it was registered.
(
    spark
    .sql("SELECT * FROM transactions")
    .show()
)

+--------------------------+-------------------+-----+--------+
|                      name|           datetime|price|currency|
+--------------------------+-------------------+-----+--------+
|          찹쌀탕수육+짜장2|2021-11-07 13:20:00|22000|     KRW|
|등심탕수육+크립새우+짜장면|2021-10-24 11:19:00|21500|     KRW|
|          월남 쌈 2인 세트|2021-07-25 11:12:40|42000|     KRW|
|       콩국수+열무비빔국수|2021-07-10 08:20:00|21250|     KRW|
|       장어소금+고추장구이|2021-07-01 05:36:00|68700|     KRW|
|                      족발|2020-08-19 19:04:00|32000|     KRW|
+--------------------------+-------------------+-----+--------+



# Basics

In [51]:
from pyspark.sql.functions import udf
from pyspark.sql.types import LongType 

# Method 1: define a plain Python function, then register it with an explicit return type.
def squared1(s):
    """Return ``s`` multiplied by itself.

    Args:
        s: The value to square, or None.

    Returns:
        ``s * s``, or None when ``s`` is None. When this runs as a Spark UDF
        a SQL NULL arrives as None; without the guard ``None * None`` raises
        TypeError and the whole query fails.
    """
    if s is None:
        return None
    return s * s

# Make squared1 callable from SQL; the result column is declared as a long.
spark.udf.register("squared1", squared1, returnType=LongType())

# Method 2: declare the return type up front with the @udf decorator.
@udf("long")
def squared2(s):
    """Return ``s`` multiplied by itself.

    Args:
        s: The value to square, or None.

    Returns:
        ``s * s``, or None when ``s`` is None. A SQL NULL arrives as None;
        without the guard ``None * None`` raises TypeError and the whole
        query fails.
    """
    if s is None:
        return None
    return s * s

# squared2 is already wrapped by @udf, so no return type is passed here.
spark.udf.register(name="squared2", f=squared2)

<function __main__.squared2(s)>

# Advanced 

In [37]:
def read_number(n):
    """Spell out an integer with Sino-Korean numerals.

    The number is split into groups of four decimal digits. Inside a group
    each non-zero digit is followed by its place unit (십, 백, 천), and each
    non-empty group is followed by its group unit (만, 억, 조, 경). The digit
    "일" is always written out, e.g. 21250 -> "이만일천이백오십".

    Args:
        n: The integer to read, or None.

    Returns:
        The reading as a string; "영" for zero; None when ``n`` is None
        (a SQL NULL when used as a Spark UDF). Negative input yields an
        empty string.

    Raises:
        ValueError: If ``n`` is 10**20 or larger, beyond the largest group
            unit handled here.
    """
    if n is None:
        return None
    if n == 0:
        return "영"

    digits = "일이삼사오육칠팔구"
    place_units = ["", "십", "백", "천"]
    group_units = ["", "만", "억", "조", "경"]

    if n >= 10 ** (4 * len(group_units)):
        raise ValueError("read_number supports values below 10**20, got %r" % (n,))

    groups = []
    group_index = 0
    while n > 0:
        n, group = divmod(n, 10000)
        pieces = []
        place = 0
        while group > 0:
            group, digit = divmod(group, 10)
            if digit > 0:
                pieces.append(digits[digit - 1] + place_units[place])
            place += 1
        # An all-zero group contributes nothing, not even its group unit
        # (100000000 reads "일억", not "일억만").
        if pieces:
            groups.append("".join(reversed(pieces)) + group_units[group_index])
        group_index += 1

    # Groups were collected least-significant first.
    return "".join(reversed(groups))


# Quick check of read_number on two of the sample prices.
for sample_price in (21250, 68700):
    print(read_number(sample_price))

이만일천이백오십
육만팔천칠백


In [52]:
# Call both registered squaring UDFs from SQL; the two columns should match.
(
    spark
    .sql("SELECT name, squared1(price), squared2(price) from transactions")
    .show()
)

+--------------------------+---------------+---------------+
|                      name|squared1(price)|squared2(price)|
+--------------------------+---------------+---------------+
|          찹쌀탕수육+짜장2|      484000000|      484000000|
|등심탕수육+크립새우+짜장면|      462250000|      462250000|
|          월남 쌈 2인 세트|     1764000000|     1764000000|
|       콩국수+열무비빔국수|      451562500|      451562500|
|       장어소금+고추장구이|     4719690000|     4719690000|
|                      족발|     1024000000|     1024000000|
+--------------------------+---------------+---------------+



In [39]:
# Register read_number for SQL use without specifying a return type.
spark.udf.register(name="read_number", f=read_number)

<function __main__.read_number(n)>

In [44]:
# When no return type is given, register() uses string. Pass returnType
# explicitly whenever the UDF should return anything other than a string.
from pyspark.sql.types import StringType

spark.udf.register("read_number", read_number, returnType=StringType())

<function __main__.read_number(n)>

In [45]:
# Show each price spelled out in Korean next to the item name.
(
    spark
    .sql("SELECT name, read_number(price) from transactions")
    .show()
)

+--------------------------+------------------+
|                      name|read_number(price)|
+--------------------------+------------------+
|          찹쌀탕수육+짜장2|          이만이천|
|등심탕수육+크립새우+짜장면|      이만일천오백|
|          월남 쌈 2인 세트|          사만이천|
|       콩국수+열무비빔국수|  이만일천이백오십|
|       장어소금+고추장구이|      육만팔천칠백|
|                      족발|          삼만이천|
+--------------------------+------------------+



In [59]:
def get_weekday(date):
    """Return the weekday name (e.g. "Sunday") for a date.

    Args:
        date: An object with a ``weekday()`` method, such as
            ``datetime.date``, or None.

    Returns:
        The matching entry of ``calendar.day_name``, or None when ``date``
        is None. In SQL, TO_DATE yields NULL for strings it cannot parse,
        and NULL arrives here as None; without the guard ``None.weekday()``
        raises AttributeError and the whole query fails.
    """
    import calendar

    if date is None:
        return None
    return calendar.day_name[date.weekday()]

# Register get_weekday for SQL use without specifying a return type.
spark.udf.register(name="get_weekday", f=get_weekday)

# TO_DATE converts the string column to a date before it is passed to the UDF.
query = """
SELECT
    datetime, 
    get_weekday(TO_DATE(datetime)) AS day_of_week
FROM
    transactions
"""

weekday_rows = spark.sql(query)
weekday_rows.show()

+-------------------+-----------+
|           datetime|day_of_week|
+-------------------+-----------+
|2021-11-07 13:20:00|     Sunday|
|2021-10-24 11:19:00|     Sunday|
|2021-07-25 11:12:40|     Sunday|
|2021-07-10 08:20:00|   Saturday|
|2021-07-01 05:36:00|   Thursday|
|2020-08-19 19:04:00|  Wednesday|
+-------------------+-----------+

