In [20]:
# Imports: standard library first, then third-party packages.
from datetime import date, datetime

import pandas as pd # type: ignore
from pyspark.sql import Row, SparkSession
# udf, pandas_udf and col are needed for the UDF definitions further down.
from pyspark.sql.functions import col, pandas_udf, udf
from pyspark.sql.types import IntegerType, StringType

# Create the Spark session, or attach to one that is already running.
spark = SparkSession.builder.getOrCreate()

# Build a three-row demo DataFrame from Row objects; Spark infers the column
# types (bigint, double, string, date, timestamp) from the Python values.
df = spark.createDataFrame(
    [
        Row(a=a, b=b, c=c, d=d, e=e)
        for a, b, c, d, e in (
            (1, 2.0, "string1", date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
            (2, 3.0, "string2", date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
            (4, 5.0, "string3", date(2000, 3, 1), datetime(2000, 1, 3, 12, 0)),
        )
    ]
)
# Bare last expression so the notebook echoes the DataFrame's schema summary.
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [12]:
df.show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  4|5.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+



In [15]:
# UDF definition
@pandas_udf('long')
def pandas_plus_one(series: pd.Series) -> pd.Series:
    """Return a new Series in which every element of ``series`` is increased by one.

    Registered as a pandas UDF whose declared return type is ``long``.
    """
    incremented = series.add(1)
    return incremented


# Replace column "a" with its incremented value and print the result.
# withColumn already names the output column "a"; no DataFrame-level alias is
# needed (DataFrame.alias names the whole DataFrame, not a column).
df.withColumn("a", pandas_plus_one(df.a)).show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  2|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  3|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  5|5.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+



In [16]:


def _prefix_string_token(text):
    """Replace every "string" in ``text`` with "Udf_string".

    Returns ``None`` unchanged so that NULL values in the column do not raise
    ``AttributeError`` inside the UDF.
    """
    if text is None:
        return None
    return text.replace("string", "Udf_string")


# NOTE: despite its name, this UDF rewrites "string" -> "Udf_string"; the
# original identifier is kept so existing references keep working.
replace_hello_with_bonjour = udf(_prefix_string_token, StringType())

# NOTE: `df` is reassigned here, so re-running this cell applies the
# replacement again to the already-prefixed values.
df = df.withColumn("c", replace_hello_with_bonjour(df.c))

df.show()

+---+---+-----------+----------+-------------------+
|  a|  b|          c|         d|                  e|
+---+---+-----------+----------+-------------------+
|  1|2.0|Udf_string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|Udf_string2|2000-02-01|2000-01-02 12:00:00|
|  4|5.0|Udf_string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-----------+----------+-------------------+



In [19]:

# Get a SparkSession, requesting the application name "UDF Example"
# (getOrCreate returns the existing session if one is already running).
spark = (
    SparkSession.builder
    .appName("UDF Example")
    .getOrCreate()
)

# One-column DataFrame holding the integers 1, 2 and 3.
data = [(number,) for number in (1, 2, 3)]
df = spark.createDataFrame(data, schema=["column_name"])

# Define and register the UDF
def double_value(value):
    """Return ``value`` multiplied by two.

    ``None`` is passed through unchanged, so a NULL cell handed to this
    function by a UDF yields NULL instead of raising ``TypeError``.
    """
    if value is None:
        return None
    return value * 2

# Wrap the Python function as a UDF with an integer return type, and also
# register it with Spark SQL under the name "double_value".
double_value_udf = udf(double_value, returnType=IntegerType())
spark.udf.register("double_value", double_value_udf)

# Apply the UDF through the DataFrame API (the SQL registration above is not
# exercised here) and name the output column "doubled_value".
result_df = df.select(
    double_value_udf(df["column_name"]).alias("doubled_value")
)

# Print the resulting single-column DataFrame.
result_df.show()

+-------------+
|doubled_value|
+-------------+
|            2|
|            4|
|            6|
+-------------+

