# Date and Time Functions

In [1]:
from pyspark.sql.functions import to_date

In [3]:
from pyspark.sql import SparkSession

# Reuse the active SparkSession when one exists; otherwise build a new one
# with the application name "DummyDataFrame".
spark = (
    SparkSession.builder
    .appName("DummyDataFrame")
    .getOrCreate()
)

In [4]:
# Sample rows for the demo: one single-field tuple per record, each holding a
# date written as a yyyy-MM-dd string.
dummy_data = [
    (date_text,)
    for date_text in ("2023-01-01", "2023-01-15", "2023-02-28", "2023-03-05")
]

# Build the DataFrame. Only the column name is supplied here, so the column
# type is inferred from the Python values (plain strings).
df = spark.createDataFrame(dummy_data, schema=["date_str"])

In [5]:
# Convert the string column to a date. The pattern matches the yyyy-MM-dd
# layout of the input, so every row parses.
df = df.withColumn(
    "date_parsed1",
    to_date(df["date_str"], "yyyy-MM-dd"),
)

In [6]:
# Print the DataFrame, which now includes the date_parsed1 column.
df.show()

+----------+------------+
|  date_str|date_parsed1|
+----------+------------+
|2023-01-01|  2023-01-01|
|2023-01-15|  2023-01-15|
|2023-02-28|  2023-02-28|
|2023-03-05|  2023-03-05|
+----------+------------+



In [8]:
# Try a dd-MMM-yyyy pattern against the same strings. The input is written as
# yyyy-MM-dd, so nothing matches and the new column is NULL on every row
# (see the output below).
df = df.withColumn(
    "date_parsed2",
    to_date(df["date_str"], "dd-MMM-yyyy"),
)
df.show()

+----------+------------+------------+
|  date_str|date_parsed1|date_parsed2|
+----------+------------+------------+
|2023-01-01|  2023-01-01|        NULL|
|2023-01-15|  2023-01-15|        NULL|
|2023-02-28|  2023-02-28|        NULL|
|2023-03-05|  2023-03-05|        NULL|
+----------+------------+------------+



In [9]:
# Try an MM/dd/yyyy pattern. The input uses dashes and a year-first layout,
# so this pattern also fails to match and yields NULL on every row.
df = df.withColumn(
    "date_parsed3",
    to_date(df["date_str"], "MM/dd/yyyy"),
)
df.show()

+----------+------------+------------+------------+
|  date_str|date_parsed1|date_parsed2|date_parsed3|
+----------+------------+------------+------------+
|2023-01-01|  2023-01-01|        NULL|        NULL|
|2023-01-15|  2023-01-15|        NULL|        NULL|
|2023-02-28|  2023-02-28|        NULL|        NULL|
|2023-03-05|  2023-03-05|        NULL|        NULL|
+----------+------------+------------+------------+



In [11]:
# Try a yyyy.MM.dd pattern. The field order is right but the separator is a
# dot rather than a dash, so the result is again NULL on every row.
df = df.withColumn(
    "date_parsed4",
    to_date(df["date_str"], "yyyy.MM.dd"),
)
df.show()

+----------+------------+------------+------------+------------+
|  date_str|date_parsed1|date_parsed2|date_parsed3|date_parsed4|
+----------+------------+------------+------------+------------+
|2023-01-01|  2023-01-01|        NULL|        NULL|        NULL|
|2023-01-15|  2023-01-15|        NULL|        NULL|        NULL|
|2023-02-28|  2023-02-28|        NULL|        NULL|        NULL|
|2023-03-05|  2023-03-05|        NULL|        NULL|        NULL|
+----------+------------+------------+------------+------------+



# String to Timestamp Conversion

In [12]:
from pyspark.sql.functions import to_timestamp

In [15]:
# Parse the same strings as timestamps instead of dates. The pattern carries
# no time-of-day fields, and the output shows each value at 00:00:00.
df = df.withColumn(
    "timestamp_parsed1",
    to_timestamp(df["date_str"], "yyyy-MM-dd"),
)
df.show()

+----------+------------+------------+------------+------------+-------------------+
|  date_str|date_parsed1|date_parsed2|date_parsed3|date_parsed4|  timestamp_parsed1|
+----------+------------+------------+------------+------------+-------------------+
|2023-01-01|  2023-01-01|        NULL|        NULL|        NULL|2023-01-01 00:00:00|
|2023-01-15|  2023-01-15|        NULL|        NULL|        NULL|2023-01-15 00:00:00|
|2023-02-28|  2023-02-28|        NULL|        NULL|        NULL|2023-02-28 00:00:00|
|2023-03-05|  2023-03-05|        NULL|        NULL|        NULL|2023-03-05 00:00:00|
+----------+------------+------------+------------+------------+-------------------+



In [17]:
# NOTE(review): this uses the same input column and the same pattern as
# timestamp_parsed1, so the two columns hold identical values. If a different
# pattern was meant to be demonstrated here, confirm and adjust.
df = df.withColumn(
    "timestamp_parsed2",
    to_timestamp(df["date_str"], "yyyy-MM-dd"),
)
df.show()

+----------+------------+------------+------------+------------+-------------------+-------------------+
|  date_str|date_parsed1|date_parsed2|date_parsed3|date_parsed4|  timestamp_parsed1|  timestamp_parsed2|
+----------+------------+------------+------------+------------+-------------------+-------------------+
|2023-01-01|  2023-01-01|        NULL|        NULL|        NULL|2023-01-01 00:00:00|2023-01-01 00:00:00|
|2023-01-15|  2023-01-15|        NULL|        NULL|        NULL|2023-01-15 00:00:00|2023-01-15 00:00:00|
|2023-02-28|  2023-02-28|        NULL|        NULL|        NULL|2023-02-28 00:00:00|2023-02-28 00:00:00|
|2023-03-05|  2023-03-05|        NULL|        NULL|        NULL|2023-03-05 00:00:00|2023-03-05 00:00:00|
+----------+------------+------------+------------+------------+-------------------+-------------------+



# Date Functions

In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (current_date, date_add, date_sub, datediff, add_months, trunc, date_format, year, month, dayofmonth, next_day, last_day)

In [19]:
# Add the date on which the query runs; the value is the same on every row
# and changes whenever the notebook is re-executed on a different day.
df = df.withColumn(
    "current_date",
    current_date(),
)
df.show()

+----------+------------+------------+------------+------------+-------------------+-------------------+------------+
|  date_str|date_parsed1|date_parsed2|date_parsed3|date_parsed4|  timestamp_parsed1|  timestamp_parsed2|current_date|
+----------+------------+------------+------------+------------+-------------------+-------------------+------------+
|2023-01-01|  2023-01-01|        NULL|        NULL|        NULL|2023-01-01 00:00:00|2023-01-01 00:00:00|  2025-12-01|
|2023-01-15|  2023-01-15|        NULL|        NULL|        NULL|2023-01-15 00:00:00|2023-01-15 00:00:00|  2025-12-01|
|2023-02-28|  2023-02-28|        NULL|        NULL|        NULL|2023-02-28 00:00:00|2023-02-28 00:00:00|  2025-12-01|
|2023-03-05|  2023-03-05|        NULL|        NULL|        NULL|2023-03-05 00:00:00|2023-03-05 00:00:00|  2025-12-01|
+----------+------------+------------+------------+------------+-------------------+-------------------+------------+



In [21]:
# Shift each parsed date forward by 10 days; month boundaries roll over
# (e.g. 2023-02-28 becomes 2023-03-10).
df = df.withColumn(
    "date_plus_10",
    date_add(df["date_parsed1"], 10),
)
df.show()

+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+
|  date_str|date_parsed1|date_parsed2|date_parsed3|date_parsed4|  timestamp_parsed1|  timestamp_parsed2|current_date|date_plus_10|
+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+
|2023-01-01|  2023-01-01|        NULL|        NULL|        NULL|2023-01-01 00:00:00|2023-01-01 00:00:00|  2025-12-01|  2023-01-11|
|2023-01-15|  2023-01-15|        NULL|        NULL|        NULL|2023-01-15 00:00:00|2023-01-15 00:00:00|  2025-12-01|  2023-01-25|
|2023-02-28|  2023-02-28|        NULL|        NULL|        NULL|2023-02-28 00:00:00|2023-02-28 00:00:00|  2025-12-01|  2023-03-10|
|2023-03-05|  2023-03-05|        NULL|        NULL|        NULL|2023-03-05 00:00:00|2023-03-05 00:00:00|  2025-12-01|  2023-03-15|
+----------+------------+------------+------------+------------+-------------------

In [23]:
# Shift each parsed date back by 5 days; year boundaries roll over
# (e.g. 2023-01-01 becomes 2022-12-27).
df = df.withColumn(
    "date_minus_5",
    date_sub(df["date_parsed1"], 5),
)
df.show()

+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+
|  date_str|date_parsed1|date_parsed2|date_parsed3|date_parsed4|  timestamp_parsed1|  timestamp_parsed2|current_date|date_plus_10|date_minus_5|
+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+
|2023-01-01|  2023-01-01|        NULL|        NULL|        NULL|2023-01-01 00:00:00|2023-01-01 00:00:00|  2025-12-01|  2023-01-11|  2022-12-27|
|2023-01-15|  2023-01-15|        NULL|        NULL|        NULL|2023-01-15 00:00:00|2023-01-15 00:00:00|  2025-12-01|  2023-01-25|  2023-01-10|
|2023-02-28|  2023-02-28|        NULL|        NULL|        NULL|2023-02-28 00:00:00|2023-02-28 00:00:00|  2025-12-01|  2023-03-10|  2023-02-23|
|2023-03-05|  2023-03-05|        NULL|        NULL|        NULL|2023-03-05 00:00:00|2023-03-05 00:00:00|  2025-12-01|  2023-03-15|  2023

In [25]:
# Number of days from date_parsed1 up to today. datediff takes the end date
# first and the start date second, so dates in the past give positive counts.
df = df.withColumn(
    "days_diff",
    datediff(current_date(), df["date_parsed1"]),
)
df.show()

+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+
|  date_str|date_parsed1|date_parsed2|date_parsed3|date_parsed4|  timestamp_parsed1|  timestamp_parsed2|current_date|date_plus_10|date_minus_5|days_diff|
+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+
|2023-01-01|  2023-01-01|        NULL|        NULL|        NULL|2023-01-01 00:00:00|2023-01-01 00:00:00|  2025-12-01|  2023-01-11|  2022-12-27|     1065|
|2023-01-15|  2023-01-15|        NULL|        NULL|        NULL|2023-01-15 00:00:00|2023-01-15 00:00:00|  2025-12-01|  2023-01-25|  2023-01-10|     1051|
|2023-02-28|  2023-02-28|        NULL|        NULL|        NULL|2023-02-28 00:00:00|2023-02-28 00:00:00|  2025-12-01|  2023-03-10|  2023-02-23|     1007|
|2023-03-05|  2023-03-05|        NULL|        NULL|        NULL|2023-03-05 0

In [27]:
# Move each parsed date two calendar months forward
# (e.g. 2023-01-15 becomes 2023-03-15).
df = df.withColumn(
    "add_months",
    add_months(df["date_parsed1"], 2),
)
df.show()

+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+----------+
|  date_str|date_parsed1|date_parsed2|date_parsed3|date_parsed4|  timestamp_parsed1|  timestamp_parsed2|current_date|date_plus_10|date_minus_5|days_diff|add_months|
+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+----------+
|2023-01-01|  2023-01-01|        NULL|        NULL|        NULL|2023-01-01 00:00:00|2023-01-01 00:00:00|  2025-12-01|  2023-01-11|  2022-12-27|     1065|2023-03-01|
|2023-01-15|  2023-01-15|        NULL|        NULL|        NULL|2023-01-15 00:00:00|2023-01-15 00:00:00|  2025-12-01|  2023-01-25|  2023-01-10|     1051|2023-03-15|
|2023-02-28|  2023-02-28|        NULL|        NULL|        NULL|2023-02-28 00:00:00|2023-02-28 00:00:00|  2025-12-01|  2023-03-10|  2023-02-23|     1007|2023-04-28|
|2023-03-0

In [29]:
# Extract the year component of the parsed date as an integer column.
df = df.withColumn(
    "year",
    year(df["date_parsed1"]),
)
df.show()

+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+----------+----+
|  date_str|date_parsed1|date_parsed2|date_parsed3|date_parsed4|  timestamp_parsed1|  timestamp_parsed2|current_date|date_plus_10|date_minus_5|days_diff|add_months|year|
+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+----------+----+
|2023-01-01|  2023-01-01|        NULL|        NULL|        NULL|2023-01-01 00:00:00|2023-01-01 00:00:00|  2025-12-01|  2023-01-11|  2022-12-27|     1065|2023-03-01|2023|
|2023-01-15|  2023-01-15|        NULL|        NULL|        NULL|2023-01-15 00:00:00|2023-01-15 00:00:00|  2025-12-01|  2023-01-25|  2023-01-10|     1051|2023-03-15|2023|
|2023-02-28|  2023-02-28|        NULL|        NULL|        NULL|2023-02-28 00:00:00|2023-02-28 00:00:00|  2025-12-01|  2023-03-10|  2023-02-23|     10

In [31]:
# Extract the month number of the parsed date (January is 1).
df = df.withColumn(
    "month",
    month(df["date_parsed1"]),
)
df.show()

+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+----------+----+-----+
|  date_str|date_parsed1|date_parsed2|date_parsed3|date_parsed4|  timestamp_parsed1|  timestamp_parsed2|current_date|date_plus_10|date_minus_5|days_diff|add_months|year|month|
+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+----------+----+-----+
|2023-01-01|  2023-01-01|        NULL|        NULL|        NULL|2023-01-01 00:00:00|2023-01-01 00:00:00|  2025-12-01|  2023-01-11|  2022-12-27|     1065|2023-03-01|2023|    1|
|2023-01-15|  2023-01-15|        NULL|        NULL|        NULL|2023-01-15 00:00:00|2023-01-15 00:00:00|  2025-12-01|  2023-01-25|  2023-01-10|     1051|2023-03-15|2023|    1|
|2023-02-28|  2023-02-28|        NULL|        NULL|        NULL|2023-02-28 00:00:00|2023-02-28 00:00:00|  2025-12-01|  2

In [33]:
# Extract the day-of-month number of the parsed date.
df = df.withColumn(
    "day",
    dayofmonth(df["date_parsed1"]),
)
df.show()

+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+----------+----+-----+---+
|  date_str|date_parsed1|date_parsed2|date_parsed3|date_parsed4|  timestamp_parsed1|  timestamp_parsed2|current_date|date_plus_10|date_minus_5|days_diff|add_months|year|month|day|
+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+----------+----+-----+---+
|2023-01-01|  2023-01-01|        NULL|        NULL|        NULL|2023-01-01 00:00:00|2023-01-01 00:00:00|  2025-12-01|  2023-01-11|  2022-12-27|     1065|2023-03-01|2023|    1|  1|
|2023-01-15|  2023-01-15|        NULL|        NULL|        NULL|2023-01-15 00:00:00|2023-01-15 00:00:00|  2025-12-01|  2023-01-25|  2023-01-10|     1051|2023-03-15|2023|    1| 15|
|2023-02-28|  2023-02-28|        NULL|        NULL|        NULL|2023-02-28 00:00:00|2023-02-28 00:00

In [35]:
from pyspark.sql.functions import dayofweek

# Day of the week as an integer. Spark numbers the days 1 = Sunday through
# 7 = Saturday, which is why 2023-01-01 (a Sunday) shows up as 1.
df = df.withColumn(
    "day_of_week",
    dayofweek(df["date_parsed1"]),
)
df.show()

+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+----------+----+-----+---+-----------+
|  date_str|date_parsed1|date_parsed2|date_parsed3|date_parsed4|  timestamp_parsed1|  timestamp_parsed2|current_date|date_plus_10|date_minus_5|days_diff|add_months|year|month|day|day_of_week|
+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+----------+----+-----+---+-----------+
|2023-01-01|  2023-01-01|        NULL|        NULL|        NULL|2023-01-01 00:00:00|2023-01-01 00:00:00|  2025-12-01|  2023-01-11|  2022-12-27|     1065|2023-03-01|2023|    1|  1|          1|
|2023-01-15|  2023-01-15|        NULL|        NULL|        NULL|2023-01-15 00:00:00|2023-01-15 00:00:00|  2025-12-01|  2023-01-25|  2023-01-10|     1051|2023-03-15|2023|    1| 15|          1|
|2023-02-28|  2023-02-28|        NULL|  

In [37]:
from pyspark.sql.functions import weekofyear

# Week number of the parsed date. Weeks follow ISO 8601 numbering (Monday
# start, week 1 has more than 3 days), so 2023-01-01 still belongs to week 52
# of the previous year.
df = df.withColumn(
    "week_of_year",
    weekofyear(df["date_parsed1"]),
)
df.show()

+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+----------+----+-----+---+-----------+------------+
|  date_str|date_parsed1|date_parsed2|date_parsed3|date_parsed4|  timestamp_parsed1|  timestamp_parsed2|current_date|date_plus_10|date_minus_5|days_diff|add_months|year|month|day|day_of_week|week_of_year|
+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+----------+----+-----+---+-----------+------------+
|2023-01-01|  2023-01-01|        NULL|        NULL|        NULL|2023-01-01 00:00:00|2023-01-01 00:00:00|  2025-12-01|  2023-01-11|  2022-12-27|     1065|2023-03-01|2023|    1|  1|          1|          52|
|2023-01-15|  2023-01-15|        NULL|        NULL|        NULL|2023-01-15 00:00:00|2023-01-15 00:00:00|  2025-12-01|  2023-01-25|  2023-01-10|     1051|2023-03-15|2023|    1| 15| 

In [39]:
# Truncate each parsed date to month precision ("MM"), i.e. snap it to the
# first day of its month.
df = df.withColumn(
    "trunc_month",
    trunc(df["date_parsed1"], "MM"),
)
df.show()

+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+----------+----+-----+---+-----------+------------+-----------+
|  date_str|date_parsed1|date_parsed2|date_parsed3|date_parsed4|  timestamp_parsed1|  timestamp_parsed2|current_date|date_plus_10|date_minus_5|days_diff|add_months|year|month|day|day_of_week|week_of_year|trunc_month|
+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+----------+----+-----+---+-----------+------------+-----------+
|2023-01-01|  2023-01-01|        NULL|        NULL|        NULL|2023-01-01 00:00:00|2023-01-01 00:00:00|  2025-12-01|  2023-01-11|  2022-12-27|     1065|2023-03-01|2023|    1|  1|          1|          52| 2023-01-01|
|2023-01-15|  2023-01-15|        NULL|        NULL|        NULL|2023-01-15 00:00:00|2023-01-15 00:00:00|  2025-12-01|  2023-01-25|  

In [41]:
# First Monday that falls after the parsed date
# (e.g. Sunday 2023-01-01 gives 2023-01-02).
df = df.withColumn(
    "next_monday",
    next_day(df["date_parsed1"], "Monday"),
)
df.show()

+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+----------+----+-----+---+-----------+------------+-----------+-----------+
|  date_str|date_parsed1|date_parsed2|date_parsed3|date_parsed4|  timestamp_parsed1|  timestamp_parsed2|current_date|date_plus_10|date_minus_5|days_diff|add_months|year|month|day|day_of_week|week_of_year|trunc_month|next_monday|
+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+----------+----+-----+---+-----------+------------+-----------+-----------+
|2023-01-01|  2023-01-01|        NULL|        NULL|        NULL|2023-01-01 00:00:00|2023-01-01 00:00:00|  2025-12-01|  2023-01-11|  2022-12-27|     1065|2023-03-01|2023|    1|  1|          1|          52| 2023-01-01| 2023-01-02|
|2023-01-15|  2023-01-15|        NULL|        NULL|        NULL|2023-01-15 00:00:00|

In [43]:
# Last calendar day of the month that contains the parsed date
# (e.g. 2023-01-01 gives 2023-01-31).
df = df.withColumn(
    "last_day_month",
    last_day(df["date_parsed1"]),
)
df.show()

+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+----------+----+-----+---+-----------+------------+-----------+-----------+--------------+
|  date_str|date_parsed1|date_parsed2|date_parsed3|date_parsed4|  timestamp_parsed1|  timestamp_parsed2|current_date|date_plus_10|date_minus_5|days_diff|add_months|year|month|day|day_of_week|week_of_year|trunc_month|next_monday|last_day_month|
+----------+------------+------------+------------+------------+-------------------+-------------------+------------+------------+------------+---------+----------+----+-----+---+-----------+------------+-----------+-----------+--------------+
|2023-01-01|  2023-01-01|        NULL|        NULL|        NULL|2023-01-01 00:00:00|2023-01-01 00:00:00|  2025-12-01|  2023-01-11|  2022-12-27|     1065|2023-03-01|2023|    1|  1|          1|          52| 2023-01-01| 2023-01-02|    2023-01-31|
|2023-01-15|  2023-01-15