current_date



In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_date

# Obtain (or reuse) the Spark session used by this example.
spark = (
    SparkSession.builder
    .appName("Current Date Example")
    .getOrCreate()
)

# One-row frame whose single column, "current_date", holds current_date().
current_date_df = (
    spark.range(1)
    .select(current_date().alias("current_date"))
)

# Print the one-row result.
current_date_df.show()



+------------+
|current_date|
+------------+
|  2024-10-30|
+------------+



In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date

# Obtain (or reuse) the Spark session used by this example.
spark = (
    SparkSession.builder
    .appName("to_date Example")
    .getOrCreate()
)

# Three single-field rows, each carrying a date written as a string.
data = [(text,) for text in ("2024-04-18", "2023-12-25", "2022-09-10")]

# Build the frame, then append a "date" column parsed from "date_string".
df = (
    spark.createDataFrame(data, ["date_string"])
    .withColumn("date", to_date("date_string"))
)

# Print both columns side by side.
df.show()



+-----------+----------+
|date_string|      date|
+-----------+----------+
| 2024-04-18|2024-04-18|
| 2023-12-25|2023-12-25|
| 2022-09-10|2022-09-10|
+-----------+----------+



In [0]:
# In PySpark, the date_format() function formats a date or timestamp column according to a specified format string. Here's how you can use it:

In [0]:
from pyspark.sql import SparkSession
# to_date is imported here too, so this cell runs on its own instead of
# relying on an earlier cell having imported it.
from pyspark.sql.functions import date_format, to_date

# Create (or reuse) the SparkSession for this example
spark = SparkSession.builder \
    .appName("date_format Example") \
    .getOrCreate()

# Sample rows: one string field per row, each holding a date
data = [("2024-04-18",),
        ("2023-12-25",),
        ("2022-09-10",)]

df = spark.createDataFrame(data, ["date_string"])

# Convert the date string column to a date type
df = df.withColumn("date", to_date(df["date_string"]))

# Render the date column as text using the "yyyy/MM/dd" pattern
df = df.withColumn("formatted_date", date_format(df["date"], "yyyy/MM/dd"))

# Show the original string, the parsed date and the formatted text
df.show()


+-----------+----------+--------------+
|date_string|      date|formatted_date|
+-----------+----------+--------------+
| 2024-04-18|2024-04-18|    2024/04/18|
| 2023-12-25|2023-12-25|    2023/12/25|
| 2022-09-10|2022-09-10|    2022/09/10|
+-----------+----------+--------------+



The datediff() function computes the difference between two dates. It returns the number of days from the second argument (the start date) to the first argument (the end date).

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import datediff, to_date

# Obtain (or reuse) the Spark session used by this example.
spark = (
    SparkSession.builder
    .appName("datediff Example")
    .getOrCreate()
)

# Each row pairs two date strings.
data = [
    ("2024-04-18", "2024-04-10"),
    ("2023-12-25", "2023-11-25"),
    ("2022-09-10", "2022-09-01"),
]

# Parse both string columns into dates, then append the day difference
# between "date1" and "date2" as "date_difference".
df = (
    spark.createDataFrame(data, ["date1_string", "date2_string"])
    .withColumn("date1", to_date("date1_string"))
    .withColumn("date2", to_date("date2_string"))
    .withColumn("date_difference", datediff("date1", "date2"))
)

# Print all five columns.
df.show()



+------------+------------+----------+----------+---------------+
|date1_string|date2_string|     date1|     date2|date_difference|
+------------+------------+----------+----------+---------------+
|  2024-04-18|  2024-04-10|2024-04-18|2024-04-10|              8|
|  2023-12-25|  2023-11-25|2023-12-25|2023-11-25|             30|
|  2022-09-10|  2022-09-01|2022-09-10|2022-09-01|              9|
+------------+------------+----------+----------+---------------+



months_between() in pyspark
The months_between() function is used to calculate the difference in months between two dates. It returns a float value representing the number of months between the two dates. Optionally, the roundOff argument (True by default) controls whether the result is rounded to 8 decimal places.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import months_between, to_date

# Obtain (or reuse) the Spark session used by this example.
spark = SparkSession.builder.appName("months_between Example").getOrCreate()

# Each row pairs two date strings.
data = [
    ("2024-04-18", "2024-01-10"),
    ("2023-12-25", "2023-11-25"),
    ("2022-09-10", "2022-08-01"),
]

df = spark.createDataFrame(data, ["date1_string", "date2_string"])

# Add parsed date versions of both string columns.
df = df.select(
    "*",
    to_date(df["date1_string"]).alias("date1"),
    to_date(df["date2_string"]).alias("date2"),
)

# Append the month difference between "date1" and "date2".
df = df.select(
    "*",
    months_between(df["date1"], df["date2"]).alias("months_difference"),
)

# Print all five columns.
df.show()



+------------+------------+----------+----------+-----------------+
|date1_string|date2_string|     date1|     date2|months_difference|
+------------+------------+----------+----------+-----------------+
|  2024-04-18|  2024-01-10|2024-04-18|2024-01-10|       3.25806452|
|  2023-12-25|  2023-11-25|2023-12-25|2023-11-25|              1.0|
|  2022-09-10|  2022-08-01|2022-09-10|2022-08-01|       1.29032258|
+------------+------------+----------+----------+-----------------+



date_add() in pyspark

The date_add() function is used to add a specified number of days to a date column; passing a negative number subtracts days instead. It returns a new date by adding the specified number of days to the input date.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import date_add, to_date

# Obtain (or reuse) the Spark session used by this example.
spark = (
    SparkSession.builder
    .appName("date_add Example")
    .getOrCreate()
)

# Three single-field rows, each carrying a date written as a string.
data = [(text,) for text in ("2024-04-18", "2023-12-25", "2022-09-10")]

# Parse the strings into a "date" column, then append "date_added",
# which is that date shifted forward by 5 days.
df = (
    spark.createDataFrame(data, ["date_string"])
    .withColumn("date", to_date("date_string"))
    .withColumn("date_added", date_add("date", 5))
)

# Print the three columns.
df.show()




+-----------+----------+----------+
|date_string|      date|date_added|
+-----------+----------+----------+
| 2024-04-18|2024-04-18|2024-04-23|
| 2023-12-25|2023-12-25|2023-12-30|
| 2022-09-10|2022-09-10|2022-09-15|
+-----------+----------+----------+



The month() function is used to extract the month component from a date or timestamp column. It returns an integer representing the month (1 for January, 2 for February, and so on).

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import month, to_date

# Obtain (or reuse) the Spark session used by this example.
spark = SparkSession.builder.appName("month Example").getOrCreate()

# Three single-field rows, each carrying a date written as a string.
data = [
    ("2024-04-18",),
    ("2023-12-25",),
    ("2022-09-10",),
]

df = spark.createDataFrame(data, ["date_string"])

# Keep every existing column and add the parsed "date" column.
df = df.select("*", to_date(df["date_string"]).alias("date"))

# Keep every existing column and add the month number taken from "date".
df = df.select("*", month(df["date"]).alias("month"))

# Print the three columns.
df.show()



+-----------+----------+-----+
|date_string|      date|month|
+-----------+----------+-----+
| 2024-04-18|2024-04-18|    4|
| 2023-12-25|2023-12-25|   12|
| 2022-09-10|2022-09-10|    9|
+-----------+----------+-----+



year() in pyspark

The year() function is used to extract the year component from a date or timestamp column. It returns an integer representing the year.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import year, to_date

# Obtain (or reuse) the Spark session used by this example.
spark = (
    SparkSession.builder
    .appName("year Example")
    .getOrCreate()
)

# Three single-field rows, each carrying a date written as a string.
data = [(text,) for text in ("2024-04-18", "2023-12-25", "2022-09-10")]

# Parse the strings into a "date" column, then append the year taken
# from that column as "year".
df = (
    spark.createDataFrame(data, ["date_string"])
    .withColumn("date", to_date("date_string"))
    .withColumn("year", year("date"))
)

# Print the three columns.
df.show()



+-----------+----------+----+
|date_string|      date|year|
+-----------+----------+----+
| 2024-04-18|2024-04-18|2024|
| 2023-12-25|2023-12-25|2023|
| 2022-09-10|2022-09-10|2022|
+-----------+----------+----+



The current_timestamp() function is used to retrieve the current timestamp. It returns the current timestamp as a timestamp type.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp

# Obtain (or reuse) the Spark session used by this example.
spark = (
    SparkSession.builder
    .appName("current_timestamp Example")
    .getOrCreate()
)

# One-row frame whose single column, "current_timestamp", holds
# current_timestamp().
df = (
    spark.range(1)
    .select(current_timestamp().alias("current_timestamp"))
)

# Print without truncating the cell contents.
df.show(truncate=False)



+-----------------------+
|current_timestamp      |
+-----------------------+
|2024-10-30 10:11:00.222|
+-----------------------+



The to_timestamp() function is used to convert a string column or an expression representing a timestamp string into a timestamp type. 

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp

# Obtain (or reuse) the Spark session used by this example.
spark = SparkSession.builder.appName("to_timestamp Example").getOrCreate()

# Three single-field rows, each carrying a timestamp written as a string.
data = [
    ("2024-04-18 12:34:56",),
    ("2023-12-25 08:30:45",),
    ("2022-09-10 15:20:10",),
]

# Build the frame, then append a "timestamp" column parsed from
# "timestamp_string".
df = (
    spark.createDataFrame(data, ["timestamp_string"])
    .withColumn("timestamp", to_timestamp("timestamp_string"))
)

# Print without truncating the cell contents.
df.show(truncate=False)



+-------------------+-------------------+
|timestamp_string   |timestamp          |
+-------------------+-------------------+
|2024-04-18 12:34:56|2024-04-18 12:34:56|
|2023-12-25 08:30:45|2023-12-25 08:30:45|
|2022-09-10 15:20:10|2022-09-10 15:20:10|
+-------------------+-------------------+



In PySpark, the hour() function is used to extract the hour component from a timestamp column. It returns an integer representing the hour of the day (0 to 23). 

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import hour, to_timestamp

# Obtain (or reuse) the Spark session used by this example.
spark = (
    SparkSession.builder
    .appName("hour Example")
    .getOrCreate()
)

# Three single-field rows, each carrying a timestamp written as a string.
data = [
    (text,)
    for text in (
        "2024-04-18 12:34:56",
        "2023-12-25 08:30:45",
        "2022-09-10 15:20:10",
    )
]

df = spark.createDataFrame(data, ["timestamp_string"])

# Keep every existing column and add the parsed "timestamp" column.
df = df.select("*", to_timestamp(df["timestamp_string"]).alias("timestamp"))

# Keep every existing column and add the hour taken from "timestamp".
df = df.select("*", hour(df["timestamp"]).alias("hour"))

# Print without truncating the cell contents.
df.show(truncate=False)


+-------------------+-------------------+----+
|timestamp_string   |timestamp          |hour|
+-------------------+-------------------+----+
|2024-04-18 12:34:56|2024-04-18 12:34:56|12  |
|2023-12-25 08:30:45|2023-12-25 08:30:45|8   |
|2022-09-10 15:20:10|2022-09-10 15:20:10|15  |
+-------------------+-------------------+----+



In PySpark, the minute() function is used to extract the minute component from a timestamp column. It returns an integer representing the minute of the hour (0 to 59). 

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import minute, to_timestamp

# Obtain (or reuse) the Spark session used by this example.
spark = SparkSession.builder.appName("minute Example").getOrCreate()

# Three single-field rows, each carrying a timestamp written as a string.
data = [
    ("2024-04-18 12:34:56",),
    ("2023-12-25 08:30:45",),
    ("2022-09-10 15:20:10",),
]

# Parse the strings into a "timestamp" column, then append the minute
# taken from that column as "minute".
df = (
    spark.createDataFrame(data, ["timestamp_string"])
    .withColumn("timestamp", to_timestamp("timestamp_string"))
    .withColumn("minute", minute("timestamp"))
)

# Print without truncating the cell contents.
df.show(truncate=False)




+-------------------+-------------------+------+
|timestamp_string   |timestamp          |minute|
+-------------------+-------------------+------+
|2024-04-18 12:34:56|2024-04-18 12:34:56|34    |
|2023-12-25 08:30:45|2023-12-25 08:30:45|30    |
|2022-09-10 15:20:10|2022-09-10 15:20:10|20    |
+-------------------+-------------------+------+



In PySpark, there isn't a built-in function called seconds(); the built-in function that extracts the seconds component from a timestamp column is named second() (singular).

You can use the second() function along with the to_timestamp() function to extract the seconds component from a timestamp column.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import second, to_timestamp

# Obtain (or reuse) the Spark session used by this example.
spark = (
    SparkSession.builder
    .appName("seconds Example")
    .getOrCreate()
)

# Three single-field rows, each carrying a timestamp written as a string.
data = [
    (text,)
    for text in (
        "2024-04-18 12:34:56",
        "2023-12-25 08:30:45",
        "2022-09-10 15:20:10",
    )
]

df = spark.createDataFrame(data, ["timestamp_string"])

# Keep every existing column and add the parsed "timestamp" column.
df = df.select("*", to_timestamp(df["timestamp_string"]).alias("timestamp"))

# Keep every existing column and add the seconds taken from "timestamp".
df = df.select("*", second(df["timestamp"]).alias("seconds"))

# Print without truncating the cell contents.
df.show(truncate=False)




+-------------------+-------------------+-------+
|timestamp_string   |timestamp          |seconds|
+-------------------+-------------------+-------+
|2024-04-18 12:34:56|2024-04-18 12:34:56|56     |
|2023-12-25 08:30:45|2023-12-25 08:30:45|45     |
|2022-09-10 15:20:10|2022-09-10 15:20:10|10     |
+-------------------+-------------------+-------+

