In [1]:
import java.sql.{Date,Timestamp}

In [2]:
// Display the version string reported by the active `spark` session.
spark.version

2.4.6

## to date

In [3]:
import org.apache.spark.sql.functions.to_date

// Sample purchases; the dates are plain strings at this point.
val df = {
  val purchases = Seq(
    "notebook"    -> "2019-01-01",
    "notebook"    -> "2019-01-10",
    "small_phone" -> "2019-01-15",
    "small_phone" -> "2019-01-30"
  )
  purchases.toDF("device", "purchase_date").sort("device", "purchase_date")
}

// Column names paired with their Spark data types.
df.dtypes

df = [device: string, purchase_date: string]


Array((device,StringType), (purchase_date,StringType))

In [4]:
// Replace the string column with its to_date conversion and list the resulting column types.
df.withColumn("purchase_date", to_date(df("purchase_date"))).dtypes

Array((device,StringType), (purchase_date,DateType))

## to datetime

In [5]:
// Same devices, now with a time-of-day component; values are still plain strings.
val df = {
  val columns = Seq("device", "purchase_time")
  val purchases = Seq(
    ("notebook", "2019-01-01 00:00:00"),
    ("notebook", "2019-01-10 13:00:00"),
    ("small_phone", "2019-01-15 12:00:00"),
    ("small_phone", "2019-01-30 09:30:00")
  )
  purchases.toDF(columns: _*).sort(columns.head, columns.tail: _*)
}

df = [device: string, purchase_time: string]


[device: string, purchase_time: string]

In [6]:
// List the column types before any conversion is applied.
df.dtypes

Array((device,StringType), (purchase_time,StringType))

In [7]:
import org.apache.spark.sql.functions.to_timestamp

In [8]:
// Replace the string column with its to_timestamp conversion and list the resulting column types.
df.withColumn("purchase_time", to_timestamp(df("purchase_time"))).dtypes

Array((device,StringType), (purchase_time,TimestampType))

## to datetime custom format

In [9]:
// Purchase times written as day/month/year hour:minute strings.
// The sort below orders these values as strings, since no conversion has happened yet.
val df = {
  val purchases = Seq(
    "notebook"    -> "27/12/2019 12:00",
    "notebook"    -> "01/12/2019 00:00",
    "small_phone" -> "23/01/2019 12:00",
    "small_phone" -> "27/12/2019 12:00"
  )
  purchases.toDF("device", "purchase_time").sort("device", "purchase_time")
}

// Column names paired with their Spark data types.
df.dtypes

df = [device: string, purchase_time: string]


Array((device,StringType), (purchase_time,StringType))

In [10]:
%%dataframe
df.withColumn("purchase_time", to_timestamp(df("purchase_time")))

device,purchase_time
notebook,
notebook,
small_phone,
small_phone,


In [11]:
%%dataframe
df.withColumn("purchase_time", to_timestamp(df("purchase_time"), "d/M/y H:m"))

device,purchase_time
notebook,2019-12-01 00:00:00.0
notebook,2019-12-27 12:00:00.0
small_phone,2019-01-23 12:00:00.0
small_phone,2019-12-27 12:00:00.0


## timestamp to date

In [12]:
// Build the sample from readable literals, then convert each one to java.sql.Timestamp
// so that the resulting column is a real timestamp column rather than a string column.
val df = {
  val purchases = Seq(
    "notebook"    -> "2019-01-29 12:00:00",
    "notebook"    -> "2019-01-01 00:00:00",
    "small_phone" -> "2019-01-15 23:00:00",
    "small_phone" -> "2019-01-01 09:00:00"
  ).map { case (device, purchasedAt) => (device, Timestamp.valueOf(purchasedAt)) }
  purchases.toDF("device", "purchase_time").sort("device", "purchase_time")
}

// Column names paired with their Spark data types.
df.dtypes

df = [device: string, purchase_time: timestamp]


Array((device,StringType), (purchase_time,TimestampType))

In [13]:
%%dataframe
df.withColumn("purchase_date", to_date(df("purchase_time")))

device,purchase_time,purchase_date
notebook,2019-01-01 00:00:00.0,2019-01-01
notebook,2019-01-29 12:00:00.0,2019-01-29
small_phone,2019-01-01 09:00:00.0,2019-01-01
small_phone,2019-01-15 23:00:00.0,2019-01-15


## date to timestamp with zero hours

In [14]:
import java.sql.Date
import org.apache.spark.sql.functions.to_timestamp

// Build the sample from readable literals, then convert each one to java.sql.Date
// so that the resulting column is a real date column rather than a string column.
val df = {
  val purchases = Seq(
    "notebook"    -> "2019-01-29",
    "notebook"    -> "2019-01-01",
    "small_phone" -> "2019-01-15",
    "small_phone" -> "2019-01-01"
  ).map { case (device, purchasedOn) => (device, Date.valueOf(purchasedOn)) }
  purchases.toDF("device", "purchase_date").sort("device", "purchase_date")
}

df = [device: string, purchase_date: date]


[device: string, purchase_date: date]

In [15]:
%%dataframe
df

device,purchase_date
notebook,2019-01-01
notebook,2019-01-29
small_phone,2019-01-01
small_phone,2019-01-15


In [16]:
%%dataframe
df.withColumn("purchase_time", to_timestamp(df("purchase_date")))

device,purchase_date,purchase_time
notebook,2019-01-01,2019-01-01 00:00:00.0
notebook,2019-01-29,2019-01-29 00:00:00.0
small_phone,2019-01-01,2019-01-01 00:00:00.0
small_phone,2019-01-15,2019-01-15 00:00:00.0


## custom date/timestamp formatting

In [17]:
import java.sql.Timestamp
import org.apache.spark.sql.functions.date_format

// Timestamp-typed sample used for the formatting example.
val df = {
  val columns = Seq("device", "purchase_time")
  val purchases = Seq(
    ("notebook", "2019-01-29 12:00:00"),
    ("notebook", "2019-01-01 00:00:00"),
    ("small_phone", "2019-01-15 23:00:00"),
    ("small_phone", "2019-01-01 09:00:00")
  ).map { case (device, purchasedAt) => (device, Timestamp.valueOf(purchasedAt)) }
  purchases.toDF(columns: _*).sort(columns.head, columns.tail: _*)
}

df = [device: string, purchase_time: timestamp]


[device: string, purchase_time: timestamp]

In [18]:
%%dataframe
df.withColumn("formatted_purchase_time", date_format(df("purchase_time"), "y-MM"))

device,purchase_time,formatted_purchase_time
notebook,2019-01-01 00:00:00.0,2019-01
notebook,2019-01-29 12:00:00.0,2019-01
small_phone,2019-01-01 09:00:00.0,2019-01
small_phone,2019-01-15 23:00:00.0,2019-01


## add / subtract days (date_add, date_sub)

In [19]:
import org.apache.spark.sql.functions.{date_add,date_sub}

// Note: the purchase times here are plain strings, not date or timestamp values.
val df = {
  val purchases = Seq(
    "notebook"    -> "2019-01-29 12:00:00",
    "notebook"    -> "2019-01-01 00:00:00",
    "small_phone" -> "2019-01-15 23:00:00",
    "small_phone" -> "2019-01-01 09:00:00"
  )
  purchases.toDF("device", "purchase_time").sort("device", "purchase_time")
}

df = [device: string, purchase_time: string]


[device: string, purchase_time: string]

In [20]:
%%dataframe
df.withColumn("plus_2_days", date_add(df("purchase_time"), 2))

device,purchase_time,plus_2_days
notebook,2019-01-01 00:00:00,2019-01-03
notebook,2019-01-29 12:00:00,2019-01-31
small_phone,2019-01-01 09:00:00,2019-01-03
small_phone,2019-01-15 23:00:00,2019-01-17


## datediff

In [21]:
import org.apache.spark.sql.functions.datediff

// Note: both date columns here are plain strings, not date values.
val df = {
  val columns = Seq("device", "purchase_date", "arrival_date")
  val orders = Seq(
    ("notebook", "2019-01-29", "2019-02-10"),
    ("notebook", "2019-01-01", "2019-01-15"),
    ("small_phone", "2019-01-15", "2019-01-05"),
    ("small_phone", "2019-01-01", "2019-01-20")
  )
  orders.toDF(columns: _*).sort("device", "purchase_date")
}

df = [device: string, purchase_date: string ... 1 more field]


[device: string, purchase_date: string ... 1 more field]

In [22]:
%%dataframe
df

device,purchase_date,arrival_date
notebook,2019-01-01,2019-01-15
notebook,2019-01-29,2019-02-10
small_phone,2019-01-01,2019-01-20
small_phone,2019-01-15,2019-01-05


In [23]:
%%dataframe
df.withColumn("days_to_arrive", datediff(df("arrival_date"), df("purchase_date")))

device,purchase_date,arrival_date,days_to_arrive
notebook,2019-01-01,2019-01-15,14
notebook,2019-01-29,2019-02-10,12
small_phone,2019-01-01,2019-01-20,19
small_phone,2019-01-15,2019-01-05,-10


## difference in seconds

In [30]:
import org.apache.spark.sql.functions.unix_timestamp

// Pairs of timestamps (kept as strings) with a known gap between purchase and arrival.
val df = Seq(
    ("foo", "2019-01-01 00:00:00", "2019-01-01 01:00:00"), // 1 hour apart
    ("bar", "2019-01-01 00:00:00", "2019-01-02 00:00:00"), // 24 hours apart
    ("baz", "2019-01-01 00:00:00", "2019-01-07 00:00:00")  // 6 days apart (Jan 1 -> Jan 7)
).toDF("col1", "purchase_time", "arrival_time").sort("col1","purchase_time")

df = [col1: string, purchase_time: string ... 1 more field]


lastException: Throwable = null


[col1: string, purchase_time: string ... 1 more field]

In [31]:
%%dataframe
df

col1,purchase_time,arrival_time
bar,2019-01-01 00:00:00,2019-01-02 00:00:00
baz,2019-01-01 00:00:00,2019-01-07 00:00:00
foo,2019-01-01 00:00:00,2019-01-01 01:00:00


In [35]:
%%dataframe
df.withColumn("diff_in_seconds", unix_timestamp(df("arrival_time")) - unix_timestamp(df("purchase_time")))

col1,purchase_time,arrival_time,diff_in_seconds
bar,2019-01-01 00:00:00,2019-01-02 00:00:00,86400
baz,2019-01-01 00:00:00,2019-01-07 00:00:00,518400
foo,2019-01-01 00:00:00,2019-01-01 01:00:00,3600
