In [1]:
from pyspark.sql import SparkSession

import getpass
# Current OS user; reused below for the warehouse path and the application name.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Python - Processing Column Data')
    .master('yarn')
    .getOrCreate()
)

In [2]:
from pyspark.sql.functions import *

In [3]:
# One-row, one-column DataFrame used only as a carrier for trying out functions
l = [("X", )]
df = spark.createDataFrame(l, schema="dummy STRING")

In [4]:
# Display the single-row dummy DataFrame
df.show()

+-----+
|dummy|
+-----+
|    X|
+-----+



In [5]:
# current_date() yields today's date, displayed in yyyy-MM-dd form
df.select(current_date()).show() #yyyy-MM-dd

+--------------+
|current_date()|
+--------------+
|    2021-06-22|
+--------------+



In [6]:
# current_timestamp() yields the current timestamp; truncate=False keeps the full value visible
df.select(current_timestamp()).show(truncate=False) #yyyy-MM-dd HH:mm:ss.SSS

+-----------------------+
|current_timestamp()    |
+-----------------------+
|2021-06-22 11:32:28.059|
+-----------------------+



In [7]:
# Parse a compact yyyyMMdd string literal into a date
df.select(
    to_date(lit('20210228'), 'yyyyMMdd').alias('to_date')
).show()

+----------+
|   to_date|
+----------+
|2021-02-28|
+----------+



In [8]:
# Parse a 'yyyyMMdd HHmm' string literal into a timestamp
df.select(
    to_timestamp(lit('20210228 1725'), 'yyyyMMdd HHmm').alias('to_timestamp')
).show()

+-------------------+
|       to_timestamp|
+-------------------+
|2021-02-28 17:25:00|
+-------------------+



##### Convert non-standard dates and timestamps to standard dates and timestamps

* `yyyy-MM-dd` is the standard date format

* `yyyy-MM-dd HH:mm:ss.SSS` is the standard timestamp format

In [9]:
# Sample rows: (date as a yyyyMMdd integer, timestamp text in dd-MMM-yyyy HH:mm:ss.SSS form)
datetimes = [
    (20140228, "28-Feb-2014 10:00:00.123"),
    (20160229, "20-Feb-2016 08:08:08.999"),
    (20171031, "31-Dec-2017 11:59:59.123"),
    (20191130, "31-Aug-2019 00:00:00.000"),
]

In [10]:
# Explicit DDL schema: the date column is a BIGINT, the time column is a STRING
datetimesDF = spark.createDataFrame(
    datetimes,
    schema="date BIGINT, time STRING",
)

In [11]:
# Show the raw rows without truncating the long time strings
datetimesDF.show(truncate=False)

+--------+------------------------+
|date    |time                    |
+--------+------------------------+
|20140228|28-Feb-2014 10:00:00.123|
|20160229|20-Feb-2016 08:08:08.999|
|20171031|31-Dec-2017 11:59:59.123|
|20191130|31-Aug-2019 00:00:00.000|
+--------+------------------------+



In [12]:
# Same yyyyMMdd parsing as the first sample row, applied to a string literal
df.select(
    to_date(lit("20140228"), "yyyyMMdd").alias("to_date")
).show(truncate=False)

+----------+
|to_date   |
+----------+
|2014-02-28|
+----------+



In [13]:
# Full month name with the day pattern 'd'
df.select(
    to_date(lit('March 2, 2021'), 'MMMM d, yyyy').alias('to_date')
).show()

+----------+
|   to_date|
+----------+
|2021-03-02|
+----------+



In [14]:
# Full month name with the two-letter day pattern 'dd'
df.select(
    to_date(lit('March 21, 2021'), 'MMMM dd, yyyy').alias('to_date')
).show()

+----------+
|   to_date|
+----------+
|2021-03-21|
+----------+



In [15]:
# Inspect the column types: `date` is a long, which matters for the to_date() call that follows
datetimesDF.printSchema()

root
 |-- date: long (nullable = true)
 |-- time: string (nullable = true)



In [16]:
# Demonstration of a failure: to_date() expects its input column to be a string
# (or date/timestamp), but `date` is a long, so Spark rejects the expression.
# The exception is caught and printed so that the error stays visible without
# aborting a "Restart & Run All" of the notebook.
from pyspark.sql.utils import AnalysisException

try:
    datetimesDF. \
        withColumn('to_date', to_date(col('date'), 'yyyyMMdd')). \
        withColumn('to_timestamp', to_timestamp(col('time'), 'dd-MMM-yyyy HH:mm:ss.SSS')). \
        show(truncate=False)
except AnalysisException as exc:
    # Notice the error: data type mismatch -- argument 1 requires
    # (string or date or timestamp) type, however `date` is of bigint type.
    print(f"AnalysisException: {exc}")

AnalysisException: "cannot resolve 'unix_timestamp(`date`, 'yyyyMMdd')' due to data type mismatch: argument 1 requires (string or date or timestamp) type, however, '`date`' is of bigint type.;;\n'Project [date#30L, time#31, to_date('date, Some(yyyyMMdd)) AS to_date#59]\n+- LogicalRDD [date#30L, time#31], false\n"

In [17]:
# Cast the long `date` column to string first so that to_date() accepts it
(
    datetimesDF
    .withColumn('to_date', to_date(col('date').cast('string'), 'yyyyMMdd'))
    .withColumn('to_timestamp', to_timestamp(col('time'), 'dd-MMM-yyyy HH:mm:ss.SSS'))
    .show(truncate=False)
)

+--------+------------------------+----------+-------------------+
|date    |time                    |to_date   |to_timestamp       |
+--------+------------------------+----------+-------------------+
|20140228|28-Feb-2014 10:00:00.123|2014-02-28|2014-02-28 10:00:00|
|20160229|20-Feb-2016 08:08:08.999|2016-02-29|2016-02-20 08:08:08|
|20171031|31-Dec-2017 11:59:59.123|2017-10-31|2017-12-31 11:59:59|
|20191130|31-Aug-2019 00:00:00.000|2019-11-30|2019-08-31 00:00:00|
+--------+------------------------+----------+-------------------+



In [18]:
# Sample rows already in standard form: (yyyy-MM-dd date text, yyyy-MM-dd HH:mm:ss.SSS timestamp text)
datetimes2 = [
    ("2014-02-28", "2014-02-28 10:00:00.123"),
    ("2016-02-29", "2016-02-29 08:08:08.999"),
    ("2017-10-31", "2017-12-31 11:59:59.123"),
    ("2019-11-30", "2019-08-31 00:00:00.000"),
]

In [19]:
# Both columns are kept as plain strings
datetimesDF2 = spark.createDataFrame(
    datetimes2,
    schema="date STRING, time STRING",
)

In [20]:
# Show the standard-format sample rows in full
datetimesDF2.show(truncate=False)

+----------+-----------------------+
|date      |time                   |
+----------+-----------------------+
|2014-02-28|2014-02-28 10:00:00.123|
|2016-02-29|2016-02-29 08:08:08.999|
|2017-10-31|2017-12-31 11:59:59.123|
|2019-11-30|2019-08-31 00:00:00.000|
+----------+-----------------------+



In [21]:
# Extract year and month (yyyyMM) from both the date and the time column
(
    datetimesDF2
    .withColumn("date_ym", date_format("date", "yyyyMM"))
    .withColumn("time_ym", date_format("time", "yyyyMM"))
    .show(truncate=False)
)

+----------+-----------------------+-------+-------+
|date      |time                   |date_ym|time_ym|
+----------+-----------------------+-------+-------+
|2014-02-28|2014-02-28 10:00:00.123|201402 |201402 |
|2016-02-29|2016-02-29 08:08:08.999|201602 |201602 |
|2017-10-31|2017-12-31 11:59:59.123|201710 |201712 |
|2019-11-30|2019-08-31 00:00:00.000|201911 |201908 |
+----------+-----------------------+-------+-------+



In [22]:
# Format as yyyyMMdd and cast the resulting text to int.
# Note: the columns keep the *_ym names even though the pattern also includes the day.
(
    datetimesDF2
    .withColumn("date_ym", date_format("date", "yyyyMMdd").cast('int'))
    .withColumn("time_ym", date_format("time", "yyyyMMdd").cast('int'))
    .show(truncate=False)
)

+----------+-----------------------+--------+--------+
|date      |time                   |date_ym |time_ym |
+----------+-----------------------+--------+--------+
|2014-02-28|2014-02-28 10:00:00.123|20140228|20140228|
|2016-02-29|2016-02-29 08:08:08.999|20160229|20160229|
|2017-10-31|2017-12-31 11:59:59.123|20171031|20171231|
|2019-11-30|2019-08-31 00:00:00.000|20191130|20190831|
+----------+-----------------------+--------+--------+



In [23]:
# Format both columns down to the second (yyyyMMddHHmmss)
(
    datetimesDF2
    .withColumn("date_dt", date_format("date", "yyyyMMddHHmmss"))
    .withColumn("date_ts", date_format("time", "yyyyMMddHHmmss"))
    .show(truncate=False)
)

+----------+-----------------------+--------------+--------------+
|date      |time                   |date_dt       |date_ts       |
+----------+-----------------------+--------------+--------------+
|2014-02-28|2014-02-28 10:00:00.123|20140228000000|20140228100000|
|2016-02-29|2016-02-29 08:08:08.999|20160229000000|20160229080808|
|2017-10-31|2017-12-31 11:59:59.123|20171031000000|20171231115959|
|2019-11-30|2019-08-31 00:00:00.000|20191130000000|20190831000000|
+----------+-----------------------+--------------+--------------+



##### Get year and day of year using yyyyDDD format.

In [24]:
# yyyyDDD = year followed by the day of the year; the formatted text is cast to int
(
    datetimesDF2
    .withColumn("date_yd", date_format("date", "yyyyDDD").cast('int'))
    .withColumn("time_yd", date_format("time", "yyyyDDD").cast('int'))
    .show(truncate=False)
)

+----------+-----------------------+-------+-------+
|date      |time                   |date_yd|time_yd|
+----------+-----------------------+-------+-------+
|2014-02-28|2014-02-28 10:00:00.123|2014059|2014059|
|2016-02-29|2016-02-29 08:08:08.999|2016060|2016060|
|2017-10-31|2017-12-31 11:59:59.123|2017304|2017365|
|2019-11-30|2019-08-31 00:00:00.000|2019334|2019243|
+----------+-----------------------+-------+-------+



In [25]:
# Render the date in descriptive form: full month name, day, year
(
    datetimesDF2
    .withColumn("date_desc", date_format("date", "MMMM d, yyyy"))
    .show(truncate=False)
)

+----------+-----------------------+-----------------+
|date      |time                   |date_desc        |
+----------+-----------------------+-----------------+
|2014-02-28|2014-02-28 10:00:00.123|February 28, 2014|
|2016-02-29|2016-02-29 08:08:08.999|February 29, 2016|
|2017-10-31|2017-12-31 11:59:59.123|October 31, 2017 |
|2019-11-30|2019-08-31 00:00:00.000|November 30, 2019|
+----------+-----------------------+-----------------+



##### Get the name of the weekday from a date.

In [26]:
# "EE" gives the abbreviated weekday name
(
    datetimesDF2
    .withColumn("day_name_abbr", date_format("date", "EE"))
    .show(truncate=False)
)

+----------+-----------------------+-------------+
|date      |time                   |day_name_abbr|
+----------+-----------------------+-------------+
|2014-02-28|2014-02-28 10:00:00.123|Fri          |
|2016-02-29|2016-02-29 08:08:08.999|Mon          |
|2017-10-31|2017-12-31 11:59:59.123|Tue          |
|2019-11-30|2019-08-31 00:00:00.000|Sat          |
+----------+-----------------------+-------------+



In [27]:
# "EEEE" gives the full weekday name
(
    datetimesDF2
    .withColumn("day_name_full", date_format("date", "EEEE"))
    .show(truncate=False)
)

+----------+-----------------------+-------------+
|date      |time                   |day_name_full|
+----------+-----------------------+-------------+
|2014-02-28|2014-02-28 10:00:00.123|Friday       |
|2016-02-29|2016-02-29 08:08:08.999|Monday       |
|2017-10-31|2017-12-31 11:59:59.123|Tuesday      |
|2019-11-30|2019-08-31 00:00:00.000|Saturday     |
+----------+-----------------------+-------------+



##### Dealing with Unix Timestamp

A Unix timestamp is an integer that counts the seconds elapsed since `January 1st 1970 Midnight UTC`.

That starting point is also known as the `epoch`; the Unix timestamp value is incremented by 1 every second.

We can convert Unix Timestamp to regular date or timestamp and vice versa.

We can use `unix_timestamp` to convert a regular date or timestamp to a Unix timestamp value. For example, `unix_timestamp(lit("2019-11-19 00:00:00"))`.

We can use `from_unixtime` to convert a Unix timestamp to a regular date or timestamp. For example, `from_unixtime(lit(1574101800))`.

In [28]:
# Sample rows: (yyyyMMdd integer id, yyyy-MM-dd date text, yyyy-MM-dd HH:mm:ss.SSS timestamp text)
datetimes3 = [
    (20140228, "2014-02-28", "2014-02-28 10:00:00.123"),
    (20160229, "2016-02-29", "2016-02-29 08:08:08.999"),
    (20171031, "2017-10-31", "2017-12-31 11:59:59.123"),
    (20191130, "2019-11-30", "2019-08-31 00:00:00.000"),
]

In [29]:
# No explicit schema is passed; the column names are assigned afterwards with toDF()
datetimesDF3 = (
    spark
    .createDataFrame(datetimes3)
    .toDF("dateid", "date", "time")
)

In [30]:
# Show the three-column sample rows in full
datetimesDF3.show(truncate=False)

+--------+----------+-----------------------+
|dateid  |date      |time                   |
+--------+----------+-----------------------+
|20140228|2014-02-28|2014-02-28 10:00:00.123|
|20160229|2016-02-29|2016-02-29 08:08:08.999|
|20171031|2017-10-31|2017-12-31 11:59:59.123|
|20191130|2019-11-30|2019-08-31 00:00:00.000|
+--------+----------+-----------------------+



In [31]:
# Convert each date/time representation into Unix epoch seconds.
# Every call passes an explicit pattern that matches the column's text:
#   - dateid is a long, so it is cast to string and parsed as yyyyMMdd
#   - date is parsed as yyyy-MM-dd
#   - time carries milliseconds, so the pattern includes .SSS instead of relying on
#     the default 'yyyy-MM-dd HH:mm:ss' pattern, which only works when the parser
#     leniently ignores the trailing fraction
datetimesDF3. \
    withColumn("unix_date_id", unix_timestamp(col("dateid").cast("string"), "yyyyMMdd")). \
    withColumn("unix_date", unix_timestamp("date", "yyyy-MM-dd")). \
    withColumn("unix_time", unix_timestamp("time", "yyyy-MM-dd HH:mm:ss.SSS")). \
    show()

+--------+----------+--------------------+------------+----------+----------+
|  dateid|      date|                time|unix_date_id| unix_date| unix_time|
+--------+----------+--------------------+------------+----------+----------+
|20140228|2014-02-28|2014-02-28 10:00:...|  1393563600|1393563600|1393599600|
|20160229|2016-02-29|2016-02-29 08:08:...|  1456722000|1456722000|1456751288|
|20171031|2017-10-31|2017-12-31 11:59:...|  1509422400|1509422400|1514739599|
|20191130|2019-11-30|2019-08-31 00:00:...|  1575090000|1575090000|1567224000|
+--------+----------+--------------------+------------+----------+----------+



In [32]:
# Single-element tuples, each holding one Unix timestamp value
unixtimes = [
    (1393561800, ),
    (1456713488, ),
    (1514701799, ),
    (1567189800, ),
]

In [33]:
# One-column DataFrame; the column is named `unixtime` via toDF()
unixtimesDF = (
    spark
    .createDataFrame(unixtimes)
    .toDF("unixtime")
)

In [34]:
# Display the raw Unix timestamp values
unixtimesDF.show()

+----------+
|  unixtime|
+----------+
|1393561800|
|1456713488|
|1514701799|
|1567189800|
+----------+



In [35]:
# Convert Unix timestamps back to text: once with a yyyyMMdd pattern,
# once with from_unixtime's default output format
(
    unixtimesDF
    .withColumn("date", from_unixtime("unixtime", "yyyyMMdd"))
    .withColumn("time", from_unixtime("unixtime"))
    .show()
)

+----------+--------+-------------------+
|  unixtime|    date|               time|
+----------+--------+-------------------+
|1393561800|20140227|2014-02-27 23:30:00|
|1456713488|20160228|2016-02-28 21:38:08|
|1514701799|20171231|2017-12-31 01:29:59|
|1567189800|20190830|2019-08-30 14:30:00|
+----------+--------+-------------------+

