In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every cell below (getOrCreate hands back an
# already-running session when one exists instead of starting a second one).
spark = (
    SparkSession.builder
    .appName('Spark Time and Dates')
    .getOrCreate()
)

In [2]:
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType
import pyspark.sql.functions as F
from pyspark.sql import types as T

In [34]:
# Four date strings, each written in a different textual layout; the parsing
# cells further down try to normalise all of them.
rows = [
    Row(text)
    for text in (
        "2020-01-03",
        "2020 01 10",
        "2020 Jan 10",
        "Sat, 11 Jan 2020",
    )
]

myrdd = spark.sparkContext.parallelize(rows)

# One nullable string column that holds the raw, unparsed text.
schema = T.StructType([T.StructField("date_str", T.StringType(), True)])

df = spark.createDataFrame(myrdd, schema)

In [28]:
# Preview the raw input strings before any parsing is applied.
df.show()

+----------------+
|        date_str|
+----------------+
|      2020-01-03|
|      2020 01 10|
|     2020 Jan 10|
|Sat, 11 Jan 2020|
+----------------+



In [5]:
# Print the column names, types and nullability of the DataFrame.
df.printSchema()

root
 |-- date_str: string (nullable = true)



In [21]:
# Switch Spark's date/time parser to the LEGACY policy before the to_date /
# to_timestamp calls below are evaluated.
# NOTE(review): presumably required because patterns such as "E, dd MMMM yy"
# are not accepted by the default Spark 3 parser — confirm against the Spark
# version in use.
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

In [37]:
# Layouts that `date_str` may be written in, listed in the order they are tried.
# The order is significant: the first format that parses wins.
date_formats = [
    "yyyy-MM-dd",
    "yyyy MMMM dd",
    "E, dd MMMM yy",
    "yyyy MM dd",
]

# Add a `date` column holding the first successful parse of `date_str`.
# This relies on to_date() yielding null when a string does not match the given
# format; coalesce() then returns the first non-null candidate, and null when
# no format matches at all.
df = df.withColumn(
    "date",
    F.coalesce(*[F.to_date(F.col("date_str"), fmt) for fmt in date_formats]),
)

In [38]:
# Display the DataFrame with the parsed `date` column next to the raw string.
df.show()

+----------------+----------+
|        date_str|      date|
+----------------+----------+
|      2020-01-03|2020-01-03|
|      2020 01 10|2020-01-10|
|     2020 Jan 10|2020-01-10|
|Sat, 11 Jan 2020|2020-01-11|
+----------------+----------+



In [39]:
# Layouts that `date_str` may be written in, listed in the order they are tried.
# The order is significant: the first format that parses wins.
timestamp_formats = [
    "yyyy-MM-dd",
    "yyyy MMMM dd",
    "E, dd MMMM yy",
    "yyyy MM dd",
]

# Add a `timestamp` column holding the first successful parse of `date_str`.
# This relies on to_timestamp() yielding null when a string does not match the
# given format; coalesce() then returns the first non-null candidate, and null
# when no format matches at all.
df = df.withColumn(
    "timestamp",
    F.coalesce(
        *[F.to_timestamp(F.col("date_str"), fmt) for fmt in timestamp_formats]
    ),
)

In [None]:
# Display the DataFrame including the newly added `timestamp` column.
df.show()