In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Obtain the Spark session used by every cell below; getOrCreate() hands back
# an already-active session instead of building a second one.
spark = (
    SparkSession.builder
    .appName("Spark SQL Functions")
    .getOrCreate()
)


In [6]:
from datetime import datetime

# Sample rows: (name, free text, date as string, timestamp as string).
# The first text value is padded with two spaces on each side.
data = [
    ("alice", "  Hello World  ", "2024-04-01", "2024-04-01 12:30:00"),
    ("bob", "Spark SQL Functions", "2023-12-25", "2023-12-25 09:15:00"),
    ("charlie", "Data Processing", "2025-01-01", "2025-01-01 23:59:59"),
]
columns = ["name", "text", "date_str", "timestamp_str"]

# Build the frame and parse the two string columns into typed `date` and
# `timestamp` columns; the original string columns are kept alongside them.
df = (
    spark.createDataFrame(data, schema=columns)
    .withColumn("date", to_date(col("date_str"), "yyyy-MM-dd"))
    .withColumn(
        "timestamp",
        to_timestamp(col("timestamp_str"), "yyyy-MM-dd HH:mm:ss"),
    )
)

df.show(truncate=False)


+-------+-------------------+----------+-------------------+----------+-------------------+
|name   |text               |date_str  |timestamp_str      |date      |timestamp          |
+-------+-------------------+----------+-------------------+----------+-------------------+
|alice  |  Hello World      |2024-04-01|2024-04-01 12:30:00|2024-04-01|2024-04-01 12:30:00|
|bob    |Spark SQL Functions|2023-12-25|2023-12-25 09:15:00|2023-12-25|2023-12-25 09:15:00|
|charlie|Data Processing    |2025-01-01|2025-01-01 23:59:59|2025-01-01|2025-01-01 23:59:59|
+-------+-------------------+----------+-------------------+----------+-------------------+



In [7]:
# Showcase of string functions, each applied to the `text` column and exposed
# under its own alias next to the untouched `name` and `text` columns.
df_string = df.select(
    "name",
    "text",
    # Length and whitespace stripping (both sides, left only, right only).
    length(col("text")).alias("length"),
    trim(col("text")).alias("trimmed"),
    ltrim(col("text")).alias("ltrimmed"),
    rtrim(col("text")).alias("rtrimmed"),
    # Case conversion.
    lower(col("text")).alias("lowercase"),
    upper(col("text")).alias("uppercase"),
    # Joining with a separator.
    concat_ws(" - ", "name", "text").alias("concatenated"),
    # Substring search; a result of 0 means the substring was not found.
    # Note the argument order: instr(column, substring) but
    # locate(substring, column).
    instr(col("text"), "SQL").alias("position_SQL"),
    locate("Data", col("text")).alias("position_Data"),
    # Slicing (start position 1, length 5), replacing spaces, reversing.
    substring(col("text"), 1, 5).alias("substring_1_5"),
    regexp_replace(col("text"), " ", "_").alias("replace_spaces"),
    reverse(col("text")).alias("reversed"),
)

df_string.show(truncate=False)


+-------+-------------------+------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------------+------------+-------------+-------------+-------------------+-------------------+
|name   |text               |length|trimmed            |ltrimmed           |rtrimmed           |lowercase          |uppercase          |concatenated             |position_SQL|position_Data|substring_1_5|replace_spaces     |reversed           |
+-------+-------------------+------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------------+------------+-------------+-------------+-------------------+-------------------+
|alice  |  Hello World      |15    |Hello World        |Hello World        |  Hello World      |  hello world      |  HELLO WORLD      |alice -   Hello World    |0           |0            |  Hel        |__Hello_World__    |  dlroW olleH      |
|bob    |Spark SQL Funct

In [8]:
# Showcase of date/time functions on the parsed `date` and `timestamp` columns.
# current_date() / current_timestamp() are evaluated when the cell runs, so the
# `current_date`, `days_diff` and `now` columns change from run to run.
df_date = df.select(
    col("name"),
    col("date"),
    col("timestamp"),
    # Calendar components of the date.
    year(col("date")).alias("year"),
    month(col("date")).alias("month"),
    dayofmonth(col("date")).alias("day"),
    dayofweek(col("date")).alias("weekday"),
    dayofyear(col("date")).alias("day_of_year"),
    weekofyear(col("date")).alias("week"),
    # Today's date and the number of days from `date` up to today
    # (datediff takes the end date first, then the start date).
    current_date().alias("current_date"),
    datediff(current_date(), col("date")).alias("days_diff"),
    # Date arithmetic.
    add_months(col("date"), 2).alias("plus_2_months"),
    date_add(col("date"), 7).alias("plus_7_days"),
    date_sub(col("date"), 5).alias("minus_5_days"),
    # Time-of-day components of the timestamp.
    hour(col("timestamp")).alias("hour"),
    minute(col("timestamp")).alias("minute"),
    second(col("timestamp")).alias("second"),
    current_timestamp().alias("now"),
)

df_date.show(truncate=False)


+-------+----------+-------------------+----+-----+---+-------+-----------+----+------------+---------+-------------+-----------+------------+----+------+------+--------------------------+
|name   |date      |timestamp          |year|month|day|weekday|day_of_year|week|current_date|days_diff|plus_2_months|plus_7_days|minus_5_days|hour|minute|second|now                       |
+-------+----------+-------------------+----+-----+---+-------+-----------+----+------------+---------+-------------+-----------+------------+----+------+------+--------------------------+
|alice  |2024-04-01|2024-04-01 12:30:00|2024|4    |1  |2      |92         |14  |2025-04-14  |378      |2024-06-01   |2024-04-08 |2024-03-27  |12  |30    |0     |2025-04-14 08:30:57.500074|
|bob    |2023-12-25|2023-12-25 09:15:00|2023|12   |25 |2      |359        |52  |2025-04-14  |476      |2024-02-25   |2024-01-01 |2023-12-20  |9   |15    |0     |2025-04-14 08:30:57.500074|
|charlie|2025-01-01|2025-01-01 23:59:59|2025|1    |1  |