In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
spark = SparkSession.builder.appName('UDF').master('local[*]').getOrCreate()

In [3]:
emp_df = spark.read.csv('file:///home/jovyan/work/HR-Dataset/core_dataset.csv',header=True,inferSchema=True)

### User Defined Function

In [71]:
emp_df.withColumn('first_name', split(col('Employee Name'),', ').getItem(0)).select('first_name').show(5)

+----------+
|first_name|
+----------+
|     Brown|
| LaRotonda|
|    Steans|
|    Howard|
|     Singh|
+----------+
only showing top 5 rows



In [12]:
spark.sql('''
select * from (
    select CAST(1000 AS STRING) a
    union 
    select CAST('0999' AS STRING) 
    union 
    select CAST('0200' AS STRING) 
    union 
    select CAST('0099' AS STRING) 
)
order by a
''').show()

+----+
|   a|
+----+
|0099|
|0200|
|0999|
|1000|
+----+



In [5]:
@udf(returnType=StringType())
def pad_zeros(val, n):
    return str(val).rjust(n, '0')

In [6]:
emp_df.withColumn('new_zip', pad_zeros(col('Zip'), lit(7))).select('Zip','new_zip').show(5)

+------+-------+
|   Zip|new_zip|
+------+-------+
|1450.0|01450.0|
|1460.0|01460.0|
|2703.0|02703.0|
|2170.0|02170.0|
|2330.0|02330.0|
+------+-------+
only showing top 5 rows



In [7]:
emp_df.createOrReplaceTempView('emp_tbl')

In [8]:
spark.udf.register('pad_zeroes',pad_zeros)

<pyspark.sql.udf.UserDefinedFunction at 0x7f212ca80ad0>

In [9]:
spark.sql('''
select pad_zeroes(Age,5), pad_zeroes(Age,10) from emp_tbl
''').show()

+------------------+-------------------+
|pad_zeroes(Age, 5)|pad_zeroes(Age, 10)|
+------------------+-------------------+
|             00032|         0000000032|
|             00033|         0000000033|
|             00031|         0000000031|
|             00032|         0000000032|
|             00029|         0000000029|
|             00030|         0000000030|
|             00033|         0000000033|
|             00033|         0000000033|
|             00030|         0000000030|
|             00038|         0000000038|
|             00063|         0000000063|
|             00038|         0000000038|
|             00031|         0000000031|
|             00046|         0000000046|
|             00031|         0000000031|
|             00033|         0000000033|
|             00031|         0000000031|
|             00029|         0000000029|
|             00033|         0000000033|
|             00033|         0000000033|
+------------------+-------------------+
only showing top

In [42]:
# spark.stop()

In [60]:
'99'.rjust(10,'0')

'0000000099'