In [13]:
# Spark Session
from pyspark.sql import SparkSession

# Name the application and point the builder at the standalone master.
session_builder = (
    SparkSession.builder
    .appName("User Defined Functions")
    .master("spark://17e348267994:7077")
)

# Executor sizing options, applied to the builder one at a time.
for conf_key, conf_value in (
    ("spark.executor.cores", 2),
    ("spark.cores.max", 6),
    ("spark.executor.memory", "512M"),
):
    session_builder = session_builder.config(conf_key, conf_value)

# Reuse a running session if one exists, otherwise start a new one.
spark = session_builder.getOrCreate()

spark

In [15]:
# Read employee data

# Every column is declared as a string; nothing is type-inferred on read.
emp_schema = (
    "employee_id string, department_id string, name string, "
    "age string, gender string, salary string, hire_date string"
)

# The first line of the file is a header row, so it is not loaded as data.
emp = spark.read.csv("/data/output/3/emp.csv", schema=emp_schema, header=True)

# Show how many partitions the loaded DataFrame has.
emp.rdd.getNumPartitions()

2

In [19]:
# Create a function to generate 10% of Salary as Bonus

def bonus(salary):
    """Return 10% of ``salary`` as the bonus amount.

    Args:
        salary: A value accepted by ``int()``. The employee data is read with
            an all-string schema, so this is normally a digit string such as
            ``"57000"``. ``None`` (a SQL NULL) is also accepted.

    Returns:
        float | None: ``int(salary) * 0.1``, or ``None`` when ``salary`` is
        ``None``.

    Raises:
        ValueError: If ``salary`` is a string that ``int()`` cannot parse.
    """
    # Spark passes SQL NULLs to Python UDFs as None. int(None) would raise
    # TypeError and fail the whole task, so propagate the null instead.
    if salary is None:
        return None
    return int(salary) * 0.1

In [20]:
# Register as UDF
from pyspark.sql.functions import udf

# Column-API UDF. udf() falls back to a string return type when none is
# given, so declare "double" explicitly to match the SQL registration below.
bonus_udf = udf(bonus, "double")

# SQL-callable UDF, invoked by name as bonus_sql_udf(...) in expr()/SQL text.
spark.udf.register("bonus_sql_udf", bonus, "double")


<function __main__.bonus(salary)>

In [24]:
# Create new column as bonus using UDF
from pyspark.sql.functions import expr

# The UDF is referenced by its SQL-registered name inside the expression.
bonus_via_udf = expr("bonus_sql_udf(salary)")
emp.withColumn("bonus", bonus_via_udf).show()

+-----------+-------------+-------------+---+------+------+----------+------+
|employee_id|department_id|         name|age|gender|salary| hire_date| bonus|
+-----------+-------------+-------------+---+------+------+----------+------+
|        017|          105|  George Wang| 34|  Male| 57000|2016-03-15|5700.0|
|        018|          104|    Nancy Liu| 29|Female| 50000|2017-06-01|5000.0|
|        019|          103|  Steven Chen| 36|  Male| 62000|2015-08-01|6200.0|
|        020|          102|    Grace Kim| 32|Female| 53000|2018-11-01|5300.0|
|        007|          101|James Johnson| 42|  Male| 70000|2012-03-15|7000.0|
|        008|          102|     Kate Kim| 29|Female| 51000|2019-10-01|5100.0|
|        009|          103|      Tom Tan| 33|  Male| 58000|2016-06-01|5800.0|
|        010|          104|     Lisa Lee| 27|Female| 47000|2018-08-01|4700.0|
|        015|          106|  Michael Lee| 37|  Male| 63000|2014-09-30|6300.0|
|        016|          107|  Kelly Zhang| 30|Female| 49000|2018-

In [25]:
# Create new column as bonus without UDF

# Same 10% bonus, expressed as a native Spark SQL expression (no Python UDF).
bonus_native = expr("salary * 0.1")
emp.withColumn("bonus", bonus_native).show()

+-----------+-------------+-------------+---+------+------+----------+------+
|employee_id|department_id|         name|age|gender|salary| hire_date| bonus|
+-----------+-------------+-------------+---+------+------+----------+------+
|        017|          105|  George Wang| 34|  Male| 57000|2016-03-15|5700.0|
|        018|          104|    Nancy Liu| 29|Female| 50000|2017-06-01|5000.0|
|        019|          103|  Steven Chen| 36|  Male| 62000|2015-08-01|6200.0|
|        020|          102|    Grace Kim| 32|Female| 53000|2018-11-01|5300.0|
|        007|          101|James Johnson| 42|  Male| 70000|2012-03-15|7000.0|
|        008|          102|     Kate Kim| 29|Female| 51000|2019-10-01|5100.0|
|        009|          103|      Tom Tan| 33|  Male| 58000|2016-06-01|5800.0|
|        010|          104|     Lisa Lee| 27|Female| 47000|2018-08-01|4700.0|
|        015|          106|  Michael Lee| 37|  Male| 63000|2014-09-30|6300.0|
|        016|          107|  Kelly Zhang| 30|Female| 49000|2018-

In [26]:
# Stop the Spark session built at the top of this notebook, now that the
# UDF and non-UDF bonus examples have both been run.

spark.stop()