In [None]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pyspark
from pyspark.sql import SparkSession

In [None]:
# Start (or reuse) the Spark session used by every example below.
spark = (
    SparkSession.builder
    .appName('SparkUDF')
    .getOrCreate()
)

# Sample rows: a sequence number stored as a string and a lower-case full name.
columns = ["Seqno", "Name"]
data = [
    ("1", "sumit borhade"),
    ("2", "sahil kilje"),
    ("3", "jagraj singh"),
    ("4", "prashant dhavale"),
    ("5", "yash jadhav"),
]

df = spark.createDataFrame(data, schema=columns)
df.show(truncate=False)

+-----+----------------+
|Seqno|Name            |
+-----+----------------+
|1    |sumit borhade   |
|2    |sahil kilje     |
|3    |jagraj singh    |
|4    |prashant dhavale|
|5    |yash jadhav     |
+-----+----------------+



In [None]:
def convertCase(str):
  """Capitalize the first letter of every space-separated word.

  Args:
    str: The text to convert, or None. The parameter name shadows the
      builtin; it is kept unchanged so existing callers keep working.

  Returns:
    The text with the first character of each word upper-cased and the
    remaining characters untouched, or None when the input is None.
  """
  # A SQL NULL reaches a Python UDF as None; pass it through instead of
  # failing with AttributeError on .split().
  if str is None:
    return None
  # Joining with " " places separators only between words, so the result
  # carries no trailing space. Splitting on a literal " " keeps runs of
  # consecutive spaces intact (empty pieces stay empty).
  return " ".join(word[:1].upper() + word[1:] for word in str.split(" "))

In [None]:
""" Converting function to UDF """
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, IntegerType, StringType

# Wrap convertCase as a column-level UDF with an explicit string return type.
convertUDF = udf(
    lambda name: convertCase(name),
    returnType=StringType(),
)

In [None]:
""" Converting function to UDF
StringType() is the default return type, so it can be omitted """

# Same wrapper as before, this time leaving the return type to udf()'s default.
convertUDF = udf(lambda name: convertCase(name))

In [None]:
# Import only the helper this notebook uses. A wildcard import of
# pyspark.sql.functions would shadow Python builtins such as sum, min and max
# for every later cell.
from pyspark.sql.functions import col

# Apply the UDF to the Name column and keep the original column name.
(
    df
    .select(col("Seqno"), convertUDF(col("Name")).alias("Name"))
    .show(truncate=False)
)

+-----+-----------------+
|Seqno|Name             |
+-----+-----------------+
|1    |Sumit Borhade    |
|2    |Sahil Kilje      |
|3    |Jagraj Singh     |
|4    |Prashant Dhavale |
|5    |Yash Jadhav      |
+-----+-----------------+



In [None]:
def upperCase(str):
  """Return the text converted to upper case.

  Args:
    str: The text to convert, or None. The parameter name shadows the
      builtin; it is kept unchanged so existing callers keep working.

  Returns:
    The upper-cased text, or None when the input is None.
  """
  # A SQL NULL reaches a Python UDF as None; pass it through instead of
  # failing with AttributeError on .upper().
  if str is None:
    return None
  return str.upper()

In [None]:
# Wrap upperCase as a UDF that returns strings.
upperCaseUDF = udf(lambda name: upperCase(name), returnType=StringType())

# Add the upper-cased name as an extra column next to the original one.
(
    df
    .withColumn("Cureated Name", upperCaseUDF(col("Name")))
    .show(truncate=False)
)

+-----+----------------+----------------+
|Seqno|Name            |Cureated Name   |
+-----+----------------+----------------+
|1    |sumit borhade   |SUMIT BORHADE   |
|2    |sahil kilje     |SAHIL KILJE     |
|3    |jagraj singh    |JAGRAJ SINGH    |
|4    |prashant dhavale|PRASHANT DHAVALE|
|5    |yash jadhav     |YASH JADHAV     |
+-----+----------------+----------------+



In [None]:
""" Using UDF on SQL """

# Register the plain Python function under a name that SQL text can call.
spark.udf.register("convertUDF", convertCase, returnType=StringType())

# Expose the DataFrame to SQL as a temporary view.
df.createOrReplaceTempView("NAME_TABLE")

(
    spark
    .sql("select Seqno, convertUDF(Name) as Name from NAME_TABLE")
    .show(truncate=False)
)

+-----+-----------------+
|Seqno|Name             |
+-----+-----------------+
|1    |Sumit Borhade    |
|2    |Sahil Kilje      |
|3    |Jagraj Singh     |
|4    |Prashant Dhavale |
|5    |Yash Jadhav      |
+-----+-----------------+



In [None]:
# Use the registered UDF in both the projection and the filter.
# NOTE(review): PySpark's UDF notes say boolean expressions are not
# short-circuited around UDFs, so "Name is not null" may not shield
# convertUDF from null input — confirm against the udf documentation.
spark.sql(
    "select Seqno, convertUDF(Name) as Name from NAME_TABLE "
    "where Name is not null and convertUDF(Name) like '%Sahil%'"
).show(truncate=False)

+-----+------------+
|Seqno|Name        |
+-----+------------+
|2    |Sahil Kilje |
+-----+------------+



In [None]:
""" null check """

# Same two-column layout as the first data set; the last row has no name.
columns = ["Seqno", "Name"]
data = [
    ("1", "sumit borhade"),
    ("2", "sahil kilje"),
    ("3", "jagraj singh"),
    ("6", None),
]

In [None]:
# Build the DataFrame that contains a null name and expose it to SQL.
df2 = spark.createDataFrame(data, schema=columns)
df2.createOrReplaceTempView("NAME_TABLE2")
df2.show(truncate=False)

+-----+-------------+
|Seqno|Name         |
+-----+-------------+
|1    |sumit borhade|
|2    |sahil kilje  |
|3    |jagraj singh |
|6    |null         |
+-----+-------------+



In [None]:
# Register a null-tolerant variant for SQL: a None input (SQL NULL) maps to
# an empty string, anything else goes through convertCase. The lambda
# parameter is named `name` so it does not shadow the builtin `str`.
spark.udf.register(
    "_nullsafeUDF",
    lambda name: convertCase(name) if name is not None else "",
    StringType(),
)

<function __main__.<lambda>(str)>

In [None]:
# Run the null-tolerant UDF over every row, including the one with a null name.
(
    spark
    .sql("select _nullsafeUDF(Name) from NAME_TABLE2")
    .show(truncate=False)
)

+------------------+
|_nullsafeUDF(Name)|
+------------------+
|Sumit Borhade     |
|Sahil Kilje       |
|Jagraj Singh      |
|                  |
+------------------+



In [None]:
# Filter on the null-tolerant UDF's output; the two literals are joined
# exactly as written, including the space at the start of the second one.
spark.sql(
    "select Seqno, _nullsafeUDF(Name) as Name from NAME_TABLE2 "
    " where Name is not null and _nullsafeUDF(Name) like '%Tra%'"
).show(truncate=False)

+-----+----+
|Seqno|Name|
+-----+----+
+-----+----+

