In [2]:
# A User Defined Function (UDF) is a custom function that can be called from Spark SQL once it has been registered.
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("udf")
    .getOrCreate()
)

22/10/04 21:15:29 WARN Utils: Your hostname, Moon-2.local resolves to a loopback address: 127.0.0.1; using 192.168.0.4 instead (on interface en0)
22/10/04 21:15:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/04 21:15:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Column names, listed in the same order as the fields of each transaction tuple.
schema = ["name", "datetime", "price", "currency"]

# Sample transactions: (name, datetime as text, price, currency).
transactions = [
    ('찹쌀탕수육+짜장2', '2021-11-07 13:20:00', 22000, 'KRW'),
    ('등심탕수육+크립새우+짜장면', '2021-10-24 11:19:00', 21500, 'KRW'),
    ('월남 쌈 2인 세트', '2021-07-25 11:12:40', 42000, 'KRW'),
    ('콩국수+열무비빔국수', '2021-07-10 08:20:00', 21250, 'KRW'),
    ('장어소금+고추장구이', '2021-07-01 05:36:00', 68700, 'KRW'),
    ('족발', '2020-08-19 19:04:00', 32000, 'KRW'),
]

In [5]:
# Build the DataFrame from the sample rows, naming the columns after `schema`.
df = spark.createDataFrame(transactions, schema)

In [6]:
# Expose the DataFrame as the temp view "transactions" so it can be queried through spark.sql.
df.createOrReplaceTempView("transactions")

In [7]:
# Show every row of the temp view to confirm the sample data is queryable.
spark.sql("select * from transactions").show()

                                                                                

+--------------------------+-------------------+-----+--------+
|                      name|           datetime|price|currency|
+--------------------------+-------------------+-----+--------+
|          찹쌀탕수육+짜장2|2021-11-07 13:20:00|22000|     KRW|
|등심탕수육+크립새우+짜장면|2021-10-24 11:19:00|21500|     KRW|
|          월남 쌈 2인 세트|2021-07-25 11:12:40|42000|     KRW|
|       콩국수+열무비빔국수|2021-07-10 08:20:00|21250|     KRW|
|       장어소금+고추장구이|2021-07-01 05:36:00|68700|     KRW|
|                      족발|2020-08-19 19:04:00|32000|     KRW|
+--------------------------+-------------------+-----+--------+



## UDF

### 첫번째 방법

In [8]:
def squared(n):
    """Return ``n`` multiplied by itself.

    A plain Python function; it only becomes callable from SQL after it is
    registered with ``spark.udf.register``.

    Args:
        n: Number to square, or None (a SQL NULL reaches the UDF as None).

    Returns:
        ``n * n``, or None when ``n`` is None.
    """
    # Propagate NULL instead of failing the whole query with a TypeError.
    if n is None:
        return None
    return n * n
# UDFs are defined on the Spark session through spark.udf.register.
# Arguments: (name to use in SQL, Python function to call).
spark.udf.register("squared", squared)


<function __main__.squared(n)>

In [10]:
# Build the result once and reuse it for both the rows and the schema.
squared_prices = spark.sql("select price, squared(price) from transactions")
squared_prices.show()
# The UDF is applied to every row, but the squared column comes back as string:
# when no return type is given at registration, the result type defaults to string.
squared_prices.printSchema()

+-----+--------------+
|price|squared(price)|
+-----+--------------+
|22000|     484000000|
|21500|     462250000|
|42000|    1764000000|
|21250|     451562500|
|68700|    4719690000|
|32000|    1024000000|
+-----+--------------+

root
 |-- price: long (nullable = true)
 |-- squared(price): string (nullable = true)



In [11]:
from pyspark.sql.types import LongType 

# Register again under the same SQL name, this time with an explicit return type.
# Arguments: (name to use in SQL, Python function to call, return type).
spark.udf.register("squared", squared, LongType())


22/10/04 21:21:00 WARN SimpleFunctionRegistry: The function squared replaced a previously registered function.


<function __main__.squared(n)>

In [12]:
# Build the result once and reuse it for both the rows and the schema.
squared_prices = spark.sql("select price, squared(price) from transactions")
squared_prices.show()
# Same values as before, but the squared column is now typed as long.
squared_prices.printSchema()

+-----+--------------+
|price|squared(price)|
+-----+--------------+
|22000|     484000000|
|21500|     462250000|
|42000|    1764000000|
|21250|     451562500|
|68700|    4719690000|
|32000|    1024000000|
+-----+--------------+

root
 |-- price: long (nullable = true)
 |-- squared(price): long (nullable = true)



### 두번째 방법

In [13]:
# 두번째 방법 
from pyspark.sql.functions import udf 

@udf("long")
def squared(n):
    """Return ``n`` squared; wrapped as a Spark UDF with a ``long`` return type.

    Args:
        n: Number to square, or None (a SQL NULL reaches the UDF as None).

    Returns:
        ``n * n``, or None when ``n`` is None.
    """
    # Propagate NULL instead of failing the whole query with a TypeError.
    if n is None:
        return None
    return n * n


# The @udf decorator only builds a UDF object for the DataFrame API; it does not
# add anything to the SQL function registry. Register it explicitly so that
# squared(...) inside spark.sql resolves to this definition rather than to an
# earlier registration left over in the session.
spark.udf.register("squared", squared)

In [14]:
# Build the result once, then print both its rows and its schema.
squared_prices = spark.sql("select price, squared(price) from transactions")
squared_prices.show()
squared_prices.printSchema()

+-----+--------------+
|price|squared(price)|
+-----+--------------+
|22000|     484000000|
|21500|     462250000|
|42000|    1764000000|
|21250|     451562500|
|68700|    4719690000|
|32000|    1024000000|
+-----+--------------+

root
 |-- price: long (nullable = true)
 |-- squared(price): long (nullable = true)



In [18]:
def read_number(n):
    """Spell out a positive integer in Sino-Korean numerals.

    Digits are read in groups of four (일/십/백/천) and each non-empty group is
    followed by its large unit (만, 억, 조, 경). This keeps values of 100000 and
    above working; a single flat unit list runs out of entries at the sixth
    digit. A leading 일 is always written out (e.g. 일천, 일만).

    Args:
        n: Integer to read, or None (a SQL NULL reaches the UDF as None).

    Returns:
        The Korean reading as a string, "" when ``n`` is zero or negative,
        or None when ``n`` is None.

    Raises:
        ValueError: If ``n`` is 10**20 or larger (no unit name is defined).
    """
    if n is None:
        return None

    digit_units = ["", "십", "백", "천"]
    group_units = ["", "만", "억", "조", "경"]
    nums = '일이삼사오육칠팔구'

    groups = []
    group_index = 0
    while n > 0:
        # Peel off the lowest four digits; they all share one large unit.
        n, chunk = divmod(n, 10000)
        if chunk > 0:
            if group_index >= len(group_units):
                raise ValueError("number too large to read: needs a unit above 경")
            parts = []
            position = 0
            while chunk > 0:
                chunk, r = divmod(chunk, 10)
                if r > 0:
                    # nums starts at 일, hence r - 1; zero digits are not read.
                    parts.append(nums[r - 1] + digit_units[position])
                position += 1
            groups.append("".join(reversed(parts)) + group_units[group_index])
        group_index += 1
    return "".join(reversed(groups))
# Spot-check the function in plain Python on two sample prices.
print(read_number(68700))
print(read_number(21250))

육만팔천칠백
이만일천이백오십


In [19]:
# Register the function under the SQL name "read_number" so queries can call it.
spark.udf.register("read_number", read_number)

<function __main__.read_number(n)>

In [20]:
# Show each price next to its Korean reading produced by the registered UDF.
spark.sql("select price, read_number(price) from transactions").show()

+-----+------------------+
|price|read_number(price)|
+-----+------------------+
|22000|          이만이천|
|21500|      이만일천오백|
|42000|          사만이천|
|21250|  이만일천이백오십|
|68700|      육만팔천칠백|
|32000|          삼만이천|
+-----+------------------+



In [22]:
def get_weekday(date):
    """Return the weekday name (e.g. "Sunday") for a date.

    Args:
        date: Object with a ``weekday()`` method where Monday is 0, such as
            ``datetime.date``; or None (a SQL NULL reaches the UDF as None).

    Returns:
        The matching entry of ``calendar.day_name``, or None when ``date``
        is None.
    """
    import calendar

    # Propagate NULL instead of failing the query with an AttributeError.
    if date is None:
        return None
    return calendar.day_name[date.weekday()]
# Register the function under the SQL name "get_weekday" so queries can call it.
spark.udf.register("get_weekday", get_weekday)

<function __main__.get_weekday(date)>

In [25]:
# Convert the datetime text to a date with TO_DATE, then pass it to the
# registered get_weekday UDF to get the weekday name.
query="""
select 
    datetime,
    get_weekday(TO_DATE(datetime)) as day_of_week
from 
    transactions
"""
spark.sql(query).show()
# Oh... this is really neat.

+-------------------+-----------+
|           datetime|day_of_week|
+-------------------+-----------+
|2021-11-07 13:20:00|     Sunday|
|2021-10-24 11:19:00|     Sunday|
|2021-07-25 11:12:40|     Sunday|
|2021-07-10 08:20:00|   Saturday|
|2021-07-01 05:36:00|   Thursday|
|2020-08-19 19:04:00|  Wednesday|
+-------------------+-----------+

