Użyj każdej z tych funkcji:
* `unix_timestamp()` 
* `date_format()`
* `to_unix_timestamp()`
* `from_unixtime()`
* `to_date()` 
* `to_timestamp()` 
* `from_utc_timestamp()` 
* `to_utc_timestamp()`

In [0]:
from pyspark.sql.functions import current_date, current_timestamp

# Sample rows: an ISO-like timestamp string, a 13-digit epoch value
# (presumably milliseconds) and a free-form "Mon, yyyy" date string.
kolumny = ["timestamp", "unix", "Date"]
dane = [
    ("2015-03-22T14:13:34", 1646641525847, "May, 2021"),
    ("2015-03-22T15:03:18", 1646641557555, "Mar, 2021"),
    ("2015-03-22T14:38:39", 1646641578622, "Jan, 2021"),
]

# Build the base frame and append the current date and current timestamp
# so that native date/timestamp columns sit next to the raw string/long ones.
dataFrame = (
    spark.createDataFrame(dane, kolumny)
    .withColumn("current_date", current_date())
    .withColumn("current_timestamp", current_timestamp())
)

display(dataFrame)

timestamp,unix,Date,current_date,current_timestamp
2015-03-22T14:13:34,1646641525847,"May, 2021",2025-03-17,2025-03-17T13:07:35.791+0000
2015-03-22T15:03:18,1646641557555,"Mar, 2021",2025-03-17,2025-03-17T13:07:35.791+0000
2015-03-22T14:38:39,1646641578622,"Jan, 2021",2025-03-17,2025-03-17T13:07:35.791+0000


In [0]:

# Print column names, types and nullability of the base DataFrame.
dataFrame.printSchema()

root
 |-- timestamp: string (nullable = true)
 |-- unix: long (nullable = true)
 |-- Date: string (nullable = true)
 |-- current_date: date (nullable = false)
 |-- current_timestamp: timestamp (nullable = false)



## unix_timestamp(..) & cast(..)

Konwersja wartości typu **string** na **timestamp**.

Lokalizacja funkcji 
* `pyspark.sql.functions` in the case of Python
* `org.apache.spark.sql.functions` in the case of Scala & Java

## 1. Zmiana formatu wartości timestamp yyyy-MM-dd'T'HH:mm:ss 
`unix_timestamp(..)`

Dokumentacja API `unix_timestamp(..)`:
> Convert time string with given pattern (see <a href="http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html" target="_blank">SimpleDateFormat</a>) to Unix time stamp (in seconds), return null if fail.

`SimpleDateFormat` is part of the Java API and provides support for parsing and formatting date and time values.

In [0]:
from pyspark.sql.functions import unix_timestamp, col

# Parse the ISO-like string into epoch seconds using its explicit pattern
# (the literal 'T' separator is quoted), then cast to a timestamp column.
epoch_seconds = unix_timestamp(col("timestamp"), "yyyy-MM-dd'T'HH:mm:ss")
dataFrame = dataFrame.withColumn("timestamp_convert", epoch_seconds.cast("timestamp"))

# Print the rows without truncating cell contents.
dataFrame.show(truncate=False)


+-------------------+-------------+---------+------------+-----------------------+-------------------+
|timestamp          |unix         |Date     |current_date|current_timestamp      |timestamp_convert  |
+-------------------+-------------+---------+------------+-----------------------+-------------------+
|2015-03-22T14:13:34|1646641525847|May, 2021|2025-03-17  |2025-03-17 13:07:42.311|2015-03-22 14:13:34|
|2015-03-22T15:03:18|1646641557555|Mar, 2021|2025-03-17  |2025-03-17 13:07:42.311|2015-03-22 15:03:18|
|2015-03-22T14:38:39|1646641578622|Jan, 2021|2025-03-17  |2025-03-17 13:07:42.311|2015-03-22 14:38:39|
+-------------------+-------------+---------+------------+-----------------------+-------------------+



2. Zmień format zgodnie z klasą `SimpleDateFormat`: **yyyy-MM-dd HH:mm:ss**
  * a. Wyświetl schemat i dane, żeby sprawdzić, czy wartości się zmieniły

In [0]:

from pyspark.sql.functions import unix_timestamp, col, date_format
# Render the timestamp as "yyyy-MM-dd HH:mm:ss" text. The source column is a
# string, so this relies on Spark converting it to a timestamp implicitly
# before formatting; the result column is a string.
zmianaFormatu = dataFrame.withColumn(
    "timestamp_formatted",
    date_format(col("timestamp"), "yyyy-MM-dd HH:mm:ss"),
)

# The exercise asks for both the schema and the data: the schema confirms the
# new column's type, the rows confirm that the values were actually reformatted.
zmianaFormatu.printSchema()
zmianaFormatu.show(truncate=False)

root
 |-- timestamp: string (nullable = true)
 |-- unix: long (nullable = true)
 |-- Date: string (nullable = true)
 |-- current_date: date (nullable = false)
 |-- current_timestamp: timestamp (nullable = false)
 |-- timestamp_convert: timestamp (nullable = true)
 |-- timestamp_formatted: string (nullable = true)



In [0]:
# Explicit variant: first parse the string with its real pattern into a
# timestamp, then format that timestamp as "yyyy-MM-dd HH:mm:ss" text.
parsed_ts = unix_timestamp(col("timestamp"), "yyyy-MM-dd'T'HH:mm:ss").cast("timestamp")
tempE = dataFrame.withColumn(
    "timestamp_formatted",
    date_format(parsed_ts, "yyyy-MM-dd HH:mm:ss"),
)

display(tempE)

timestamp,unix,Date,current_date,current_timestamp,timestamp_convert,timestamp_formatted
2015-03-22T14:13:34,1646641525847,"May, 2021",2025-03-17,2025-03-17T13:19:24.130+0000,2015-03-22T14:13:34.000+0000,2015-03-22 14:13:34
2015-03-22T15:03:18,1646641557555,"Mar, 2021",2025-03-17,2025-03-17T13:19:24.130+0000,2015-03-22T15:03:18.000+0000,2015-03-22 15:03:18
2015-03-22T14:38:39,1646641578622,"Jan, 2021",2025-03-17,2025-03-17T13:19:24.130+0000,2015-03-22T14:38:39.000+0000,2015-03-22 14:38:39


## Stwórz nowe kolumny do DataFrame z wartościami year(..), month(..), dayofyear(..)

In [0]:
#date_format
from pyspark.sql.functions import date_format

# New column name -> datetime pattern used to extract that part as text.
# Insertion order determines the order in which the columns are appended.
date_part_patterns = {"year": "yyyy", "month": "MM", "day_of_year": "D"}

yearDate = tempE
for part_name, part_pattern in date_part_patterns.items():
    yearDate = yearDate.withColumn(part_name, date_format(col("timestamp"), part_pattern))

display(yearDate)

timestamp,unix,Date,current_date,current_timestamp,timestamp_convert,timestamp_formatted,year,month,day_of_year
2015-03-22T14:13:34,1646641525847,"May, 2021",2025-03-17,2025-03-17T13:32:50.077+0000,2015-03-22T14:13:34.000+0000,2015-03-22 14:13:34,2015,3,81
2015-03-22T15:03:18,1646641557555,"Mar, 2021",2025-03-17,2025-03-17T13:32:50.077+0000,2015-03-22T15:03:18.000+0000,2015-03-22 15:03:18,2015,3,81
2015-03-22T14:38:39,1646641578622,"Jan, 2021",2025-03-17,2025-03-17T13:32:50.077+0000,2015-03-22T14:38:39.000+0000,2015-03-22 14:38:39,2015,3,81


In [0]:
from pyspark.sql.functions import to_date
# to_date(): derive a date-typed column from the timestamp string.
date_only_col = to_date(col("timestamp"))
toDate = tempE.withColumn("date_only", date_only_col)

display(toDate)

timestamp,unix,Date,current_date,current_timestamp,timestamp_convert,timestamp_formatted,date_only
2015-03-22T14:13:34,1646641525847,"May, 2021",2025-03-17,2025-03-17T13:39:46.346+0000,2015-03-22T14:13:34.000+0000,2015-03-22 14:13:34,2015-03-22
2015-03-22T15:03:18,1646641557555,"Mar, 2021",2025-03-17,2025-03-17T13:39:46.346+0000,2015-03-22T15:03:18.000+0000,2015-03-22 15:03:18,2015-03-22
2015-03-22T14:38:39,1646641578622,"Jan, 2021",2025-03-17,2025-03-17T13:39:46.346+0000,2015-03-22T14:38:39.000+0000,2015-03-22 14:38:39,2015-03-22


In [0]:
#from_unixtime()
from pyspark.sql.functions import from_unixtime

# The "unix" values are divided by 1000 before formatting (they appear to be
# epoch milliseconds, while from_unixtime works with seconds).
epoch_secs = col("unix") / 1000
fromUnix = tempE.withColumn(
    "from_unix",
    from_unixtime(epoch_secs, "yyyy-MM-dd HH:mm:ss"),
)

display(fromUnix)

timestamp,unix,Date,current_date,current_timestamp,timestamp_convert,timestamp_formatted,from_unix
2015-03-22T14:13:34,1646641525847,"May, 2021",2025-03-17,2025-03-17T13:40:08.480+0000,2015-03-22T14:13:34.000+0000,2015-03-22 14:13:34,2022-03-07 08:25:25
2015-03-22T15:03:18,1646641557555,"Mar, 2021",2025-03-17,2025-03-17T13:40:08.480+0000,2015-03-22T15:03:18.000+0000,2015-03-22 15:03:18,2022-03-07 08:25:57
2015-03-22T14:38:39,1646641578622,"Jan, 2021",2025-03-17,2025-03-17T13:40:08.480+0000,2015-03-22T14:38:39.000+0000,2015-03-22 14:38:39,2022-03-07 08:26:18


In [0]:
#to_timestamp()
from pyspark.sql.functions import to_timestamp

# to_timestamp(): parse the string straight into a timestamp column using
# the explicit input pattern (literal 'T' separator quoted).
iso_pattern = "yyyy-MM-dd'T'HH:mm:ss"
toTimestamp = tempE.withColumn(
    "to_timestamp",
    to_timestamp(col("timestamp"), iso_pattern),
)

display(toTimestamp)


timestamp,unix,Date,current_date,current_timestamp,timestamp_convert,timestamp_formatted,to_timestamp
2015-03-22T14:13:34,1646641525847,"May, 2021",2025-03-17,2025-03-17T13:43:52.559+0000,2015-03-22T14:13:34.000+0000,2015-03-22 14:13:34,2015-03-22T14:13:34.000+0000
2015-03-22T15:03:18,1646641557555,"Mar, 2021",2025-03-17,2025-03-17T13:43:52.559+0000,2015-03-22T15:03:18.000+0000,2015-03-22 15:03:18,2015-03-22T15:03:18.000+0000
2015-03-22T14:38:39,1646641578622,"Jan, 2021",2025-03-17,2025-03-17T13:43:52.559+0000,2015-03-22T14:38:39.000+0000,2015-03-22 14:38:39,2015-03-22T14:38:39.000+0000


In [0]:
#to_utc_timestamp()
from pyspark.sql.functions import to_utc_timestamp

# to_utc_timestamp(): interpret the value as wall-clock time in the given
# zone and convert it to UTC. The zone passed here is "UTC" itself.
utc_col = to_utc_timestamp(col("timestamp"), "UTC")
toUtcTimestamp = tempE.withColumn("to_utc_timestamp", utc_col)

display(toUtcTimestamp)



timestamp,unix,Date,current_date,current_timestamp,timestamp_convert,timestamp_formatted,to_utc_timestamp
2015-03-22T14:13:34,1646641525847,"May, 2021",2025-03-17,2025-03-17T13:44:08.369+0000,2015-03-22T14:13:34.000+0000,2015-03-22 14:13:34,2015-03-22T14:13:34.000+0000
2015-03-22T15:03:18,1646641557555,"Mar, 2021",2025-03-17,2025-03-17T13:44:08.369+0000,2015-03-22T15:03:18.000+0000,2015-03-22 15:03:18,2015-03-22T15:03:18.000+0000
2015-03-22T14:38:39,1646641578622,"Jan, 2021",2025-03-17,2025-03-17T13:44:08.369+0000,2015-03-22T14:38:39.000+0000,2015-03-22 14:38:39,2015-03-22T14:38:39.000+0000


In [0]:
#from_utc_timestamp()
from pyspark.sql.functions import from_utc_timestamp
# from_utc_timestamp(): take the UTC column produced in the previous step
# and render it as wall-clock time in the Europe/Warsaw zone.
warsaw_local_col = from_utc_timestamp(col("to_utc_timestamp"), "Europe/Warsaw")
fromUtcTimestamp = toUtcTimestamp.withColumn("from_utc_timestamp", warsaw_local_col)

display(fromUtcTimestamp)

timestamp,unix,Date,current_date,current_timestamp,timestamp_convert,timestamp_formatted,to_utc_timestamp,from_utc_timestamp
2015-03-22T14:13:34,1646641525847,"May, 2021",2025-03-17,2025-03-17T13:46:54.902+0000,2015-03-22T14:13:34.000+0000,2015-03-22 14:13:34,2015-03-22T14:13:34.000+0000,2015-03-22T15:13:34.000+0000
2015-03-22T15:03:18,1646641557555,"Mar, 2021",2025-03-17,2025-03-17T13:46:54.902+0000,2015-03-22T15:03:18.000+0000,2015-03-22 15:03:18,2015-03-22T15:03:18.000+0000,2015-03-22T16:03:18.000+0000
2015-03-22T14:38:39,1646641578622,"Jan, 2021",2025-03-17,2025-03-17T13:46:54.902+0000,2015-03-22T14:38:39.000+0000,2015-03-22 14:38:39,2015-03-22T14:38:39.000+0000,2015-03-22T15:38:39.000+0000


In [0]:
# List the built-in Databricks sample datasets; dbutils.fs.ls is the
# programmatic form of the `%fs ls` magic, rendered as a table via display().
display(dbutils.fs.ls("dbfs:/databricks-datasets/"))


path,name,size,modificationTime
dbfs:/databricks-datasets/COVID/,COVID/,0,0
dbfs:/databricks-datasets/README.md,README.md,976,1532468253000
dbfs:/databricks-datasets/Rdatasets/,Rdatasets/,0,0
dbfs:/databricks-datasets/SPARK_README.md,SPARK_README.md,3359,1455043490000
dbfs:/databricks-datasets/adult/,adult/,0,0
dbfs:/databricks-datasets/airlines/,airlines/,0,0
dbfs:/databricks-datasets/amazon/,amazon/,0,0
dbfs:/databricks-datasets/asa/,asa/,0,0
dbfs:/databricks-datasets/atlas_higgs/,atlas_higgs/,0,0
dbfs:/databricks-datasets/bikeSharing/,bikeSharing/,0,0
