In [None]:
import findspark
findspark.init()
findspark.find()

import pyspark
from pyspark.sql import SparkSession,Window
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [None]:
# Local Spark session (3 worker threads) shared by every cell below.
spark = (
    SparkSession.builder
    .master("local[3]")
    .appName("Data and Time Functions")
    .getOrCreate()
)

In [None]:
#-------------------------------------------------------------------------

# Sample employee records: (id, name, dept, salary, date).
# The "date" values are plain strings; several of them have hour/minute/second
# fields that are not zero-padded (e.g. "5:12:3").
emp = [
    (1, "AAA", "dept1", 1000, "2019-02-01 15:12:13"),
    (2, "BBB", "dept1", 1100, "2018-04-01 5:12:3"),
    (3, "CCC", "dept1", 3000, "2017-06-05 1:2:13"),
    (4, "DDD", "dept1", 1500, "2019-08-10 10:52:53"),
    (5, "EEE", "dept2", 8000, "2016-01-11 5:52:43"),
    (6, "FFF", "dept2", 7200, "2015-04-14 19:32:33"),
    (7, "GGG", "dept3", 7100, "2019-02-21 15:42:43"),
    (8, "HHH", "dept3", 3700, "2016-09-25 15:32:33"),
    (9, "III", "dept3", 4500, "2017-10-15 15:22:23"),
    (10, "JJJ", "dept5", 3400, "2018-12-17 15:14:17"),
]

# Base DataFrame that the remaining cells read from.
empdf = spark.createDataFrame(
    data=emp,
    schema=["id", "name", "dept", "salary", "date"],
)
empdf.printSchema()
empdf.show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# add_months()
# Keep every employee column and append the date shifted forward by one month.

df = empdf.select("*", add_months("date", 1).alias("next_month"))
df.printSchema()
df.show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# current_date()
# Show the employee id next to the date on which the query is evaluated.

df = empdf.select("id", current_date().alias("current_date"))
df.printSchema()
df.show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# current_timestamp()
# Show the employee id next to the timestamp at which the query is evaluated.

df = empdf.select("id", current_timestamp().alias("current_timestamp"))
df.printSchema()
df.show(truncate=False)  # do not truncate the timestamp values

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# date_add()
# Add 5 days to each date.

df = empdf.select("date", date_add("date", 5).alias("next_date"))
df.printSchema()
df.show(truncate=False)

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# date_format()
# This function converts the date to a specified format.
# Two output patterns are rendered side by side for comparison.

df = empdf.select(
    "date",
    date_format("date", "dd/MM/yyyy").alias("formatted_date"),
    date_format("date", "yyyy-MM-dd").alias("another_formatted_date"),
)
df.printSchema()
df.show(truncate=False)
#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# date_sub()
# Subtract 5 days from each date.
#
# Build from `empdf` (like every other cell) rather than from whatever `df`
# the previously executed cell left behind, so this cell gives the same
# result regardless of execution order and after a kernel restart.

df = empdf.select("date").withColumn("previous_date", date_sub("date", 5))
df.printSchema()
df.show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# date_trunc()
#
# This function returns a timestamp truncated to a specified unit.
# It could be year, month, day, hour, minute, second, week or quarter.
# 1. yyyy or yy or year
# 2. mm or month or mon
# 3. day or dd
# 4. hour
# 5. minute
# 6. second
# 7. week
# 8. quarter
#
# Demonstrated below, in order:
#   "yyyy" -> truncate to year  ("yy" or "year" may be used instead)
#   "mm"   -> truncate to month ("month" or "mon" may be used instead)
#   "day"  -> truncate to day   ("dd" may be used instead)

for trunc_unit in ("yyyy", "mm", "day"):
    df = empdf.select("date", date_trunc(trunc_unit, "date").alias("new_timestamp"))
    df.printSchema()
    df.show()

    print("=====================================================")

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# datediff()
# This function returns the difference between dates in terms of days.
# Here: number of days from each stored date up to the current date.

df = (
    empdf
    .select("date", current_date().alias("current_date"))
    .withColumn("date_diff", datediff("current_date", "date"))
)
df.printSchema()
df.show(truncate=False)

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# dayofmonth()
# This function returns the day of month.

df = empdf.select("date", dayofmonth("date").alias("day_of_month"))
df.printSchema()
df.show(truncate=False)

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# dayofweek()
# This function returns the day of the week as an integer.
# It will consider Sunday as 1st and Saturday as 7th.

df = empdf.select("date").withColumn("day_of_week", dayofweek("date"))
df.printSchema()
df.show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# dayofyear()
# This function returns day of the year as an integer.

df = empdf.select("date", dayofyear("date").alias("day_of_year"))
df.printSchema()
df.show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# from_utc_timestamp()

"""
This function converts UTC timestamps to timestamps of any specified timezone.
By default, it assumes the date is a UTC timestamp
"""

# Use the region-based zone ID for Indian Standard Time. Three-letter
# abbreviations such as "IST" are ambiguous, and the Spark documentation
# recommends "area/city" IDs instead of short names.
df = empdf.select("date").withColumn("ist_timestamp", from_utc_timestamp("date", "Asia/Kolkata"))
df.printSchema()
df.show(truncate=False)

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# unix_timestamp()

"""
This function converts timestamp strings of the given format to Unix timestamps(in seconds).
Default format: "yyyy-MM-dd HH:mm:ss"
"""

# Several sample rows have non-zero-padded time fields (e.g. "2018-04-01 5:12:3").
# The two-letter fields of "HH:mm:ss" do not match those rows under Spark 3's
# datetime parser, so single-letter fields are used here: "H:m:s" accepts both
# one- and two-digit values.
df = empdf.select("date").withColumn("unix_timestamp", unix_timestamp("date", "yyyy-MM-dd H:m:s"))
df.printSchema()
df.show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# from_unixtime()

"""
This function converts the number of seconds from Unix epoch(1970-01-01 00:00:00 UTC)
to a given string format.
"""

# Pass an explicit pattern with single-letter time fields: the default
# "yyyy-MM-dd HH:mm:ss" does not match the non-zero-padded sample rows
# (e.g. "2018-04-01 5:12:3") under Spark 3's datetime parser.
df = empdf.select("date", unix_timestamp("date", "yyyy-MM-dd H:m:s").alias("unix_timestamp"))
# Convert the epoch seconds back to a timestamp string (default output format).
df = df.withColumn("from_unix_time", from_unixtime("unix_timestamp"))
df.printSchema()
df.show(truncate=False)

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# hour()
# This function returns hour part of the date.

empdf.select("date").withColumn("hour", hour("date")).show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# last_day()
# This function returns the last date of the month for a given date.

empdf.select("date").withColumn("last_date_of_month", last_day("date")).show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# minute()

"""This function returns minute part of date"""

# Name the derived column "minute"; aliasing it "date" would produce two
# columns with the same name in the output.
empdf.select("date", minute("date").alias("minute")).show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# month()
# This function returns month part of date.

empdf.select("date").withColumn("month", month("date")).show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# months_between()
# This function returns the difference between dates in terms of months.
# If first_date > second_date then the result is positive, else negative.

df = (
    empdf
    .select("date", current_date().alias("current_date"))
    .withColumn("months_between", months_between("current_date", "date"))
)
df.printSchema()
df.show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# next_day()
# This function returns the next day based on the dayOfWeek specified in the next argument.
# For e.g. for 1st Feb 2019 (Friday) if we ask for next_day as Sunday, it will return 3rd Feb 2019.

df = empdf.select("date", next_day("date", "sun").alias("next_day"))
df.printSchema()
df.show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# quarter()
# This function returns the quarter of given date as integer.

empdf.select("date").withColumn("quarter_of_date", quarter("date")).show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# second()
# This function returns second part of the date.

empdf.select("date").withColumn("second_part_of_date", second("date")).show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# to_date()

"""This function converts string or timestamp to date"""

# Single-letter time fields ("H:m:s") accept both one- and two-digit values.
# The sample data contains non-zero-padded times (e.g. "2018-04-01 5:12:3"),
# which the two-letter "HH:mm:ss" pattern does not match under Spark 3's
# datetime parser.
df = empdf.select("date").withColumn("to_date", to_date("date", format='yyyy-MM-dd H:m:s'))
df.printSchema()
df.show()

#-------------------------------------------------------------------------


# Same conversion for a string in a non-default (day-first) layout; this
# value is fully zero-padded, so the two-letter pattern fields match it.
df1 = spark.createDataFrame([('15/02/2019 10:30:00',)], ['date'])
df2 = df1.withColumn("new_date", to_date("date", 'dd/MM/yyyy HH:mm:ss'))
df2.printSchema()
df2.show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# to_timestamp()
# This function converts String to Timestamp.
# Default format: yyyy-MM-dd HH:mm:ss

# Single-row frame holding a day-first timestamp string.
df1 = spark.createDataFrame(data=[('15/02/2019 10:30:00',)], schema=['date'])
df2 = df1.select("date", to_timestamp("date", 'dd/MM/yyyy HH:mm:ss').alias("new_date"))
df2.printSchema()
df2.show(n=2)

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# to_utc_timestamp

"""This function converts given timestamp to UTC timestamp"""

# Interpret "date" as Indian Standard Time and convert it to UTC.
# The region-based ID "Asia/Kolkata" is used because three-letter
# abbreviations such as "IST" are ambiguous and the Spark documentation
# recommends "area/city" IDs instead of short names.
df = empdf.select("date").withColumn("utc_timestamp", to_utc_timestamp("date", "Asia/Kolkata"))
df.printSchema()
df.show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# weekofyear()
# This function returns week of year for the given date.

empdf.select("date").withColumn("week_number", weekofyear("date")).show()

#-------------------------------------------------------------------------

In [None]:
#-------------------------------------------------------------------------

# year()
# This function returns year part of the date.

empdf.select("date").withColumn("year", year("date")).show()

#-------------------------------------------------------------------------