# Date & Time functions

Data source: Grocery Inventory dataset on Kaggle — https://www.kaggle.com/datasets/willianoliveiragibin/grocery-inventory

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session named "myApp" running in local mode
# with the `local[*]` master.
spark = (
    SparkSession.builder
    .appName("myApp")
    .master("local[*]")
    .getOrCreate()
)

In [None]:
#!/bin/bash
! curl -L -o inventory.zip https://www.kaggle.com/api/v1/datasets/download/willianoliveiragibin/grocery-inventory
! unzip inventory.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 50801  100 50801    0     0   120k      0 --:--:-- --:--:-- --:--:--  120k
Archive:  inventory.zip
replace Grocery_Inventory new v1.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: Grocery_Inventory new v1.csv  


In [None]:
# Read the extracted CSV: the first row supplies the column names and Spark
# is asked to infer each column's type from the data.
inv = spark.read.csv(
    path="Grocery_Inventory new v1.csv",
    header=True,
    inferSchema=True,
)
inv.printSchema()

root
 |-- Product_Name: string (nullable = true)
 |-- Catagory: string (nullable = true)
 |-- Supplier_Name: string (nullable = true)
 |-- Warehouse_Location: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Supplier_ID: string (nullable = true)
 |-- Date_Received: string (nullable = true)
 |-- Last_Order_Date: string (nullable = true)
 |-- Expiration_Date: string (nullable = true)
 |-- Stock_Quantity: integer (nullable = true)
 |-- Reorder_Level: integer (nullable = true)
 |-- Reorder_Quantity: integer (nullable = true)
 |-- Unit_Price: string (nullable = true)
 |-- Sales_Volume: integer (nullable = true)
 |-- Inventory_Turnover_Rate: integer (nullable = true)
 |-- percentage: string (nullable = true)



In [None]:
# Preview the first three rows of the raw inventory table.
inv.show(n=3)

+---------------+-------------------+-------------+--------------------+------------+-----------+-----------+-------------+---------------+---------------+--------------+-------------+----------------+----------+------------+-----------------------+----------+
|   Product_Name|           Catagory|Supplier_Name|  Warehouse_Location|      Status| Product_ID|Supplier_ID|Date_Received|Last_Order_Date|Expiration_Date|Stock_Quantity|Reorder_Level|Reorder_Quantity|Unit_Price|Sales_Volume|Inventory_Turnover_Rate|percentage|
+---------------+-------------------+-------------+--------------------+------------+-----------+-----------+-------------+---------------+---------------+--------------+-------------+----------------+----------+------------+-----------------------+----------+
|    Bell Pepper|Fruits & Vegetables|       Eimbee|20 Pennsylvania P...|Discontinued|29-017-6255|43-348-2450|     3/1/2024|       1/6/2025|      1/31/2025|            46|           64|              17|     $4.60|     

In [None]:
# Inspect only the product name, the received date and the unit price.
inv.select(
    "Product_Name",
    "Date_Received",
    "Unit_Price",
).show()

+-----------------+-------------+----------+
|     Product_Name|Date_Received|Unit_Price|
+-----------------+-------------+----------+
|      Bell Pepper|     3/1/2024|     $4.60|
|    Vegetable Oil|     4/1/2024|     $2.00|
|  Parmesan Cheese|     4/1/2024|    $12.00|
|           Carrot|     5/1/2024|     $1.50|
|           Garlic|     5/1/2024|     $7.00|
|            Lemon|     5/1/2024|     $2.40|
|    Coconut Sugar|     5/1/2024|     $5.00|
|        Anchovies|     6/1/2024|    $10.00|
|           Cheese|     6/1/2024|     $9.00|
|           Yogurt|     6/1/2024|     $1.70|
|   Cheddar Cheese|     6/1/2024|     $9.00|
|      Avocado Oil|     7/1/2024|    $10.00|
|           Orange|     8/1/2024|     $2.90|
|Digestive Biscuit|     9/1/2024|     $4.00|
|      Cauliflower|     9/1/2024|     $2.50|
|             Pear|     9/1/2024|     $4.50|
|     Egg (Turkey)|    10/1/2024|     $2.50|
|   Ricotta Cheese|    10/1/2024|     $6.20|
|         Eggplant|    10/1/2024|     $3.00|
|Whole Whe

In [None]:
# Narrow working DataFrame: product name, received date and unit price only.
inv_df = inv.select(["Product_Name", "Date_Received", "Unit_Price"])

In [None]:
# Print the column types of the narrowed DataFrame before any conversion.
inv_df.printSchema()

root
 |-- Product_Name: string (nullable = true)
 |-- Date_Received: string (nullable = true)
 |-- Unit_Price: string (nullable = true)



In [None]:
from pyspark.sql.functions import col, to_date

# Earlier attempts kept for reference (default format, then "MM/dd/yyyy"):
#inv_df.withColumn("Date_Received", to_date(col("Date_Received"))).show(3)
#inv_df.withColumn("Date_Received", to_date(col("Date_Received"),"MM/dd/yyyy")).show(3)
# The raw values look like 3/1/2024 (no zero padding), hence the
# single-letter month and day in the pattern.
(
    inv_df
    .withColumn("Date_Received", to_date(col("Date_Received"), "M/d/yyyy"))
    .show(3)
)

+---------------+-------------+----------+
|   Product_Name|Date_Received|Unit_Price|
+---------------+-------------+----------+
|    Bell Pepper|   2024-03-01|     $4.60|
|  Vegetable Oil|   2024-04-01|     $2.00|
|Parmesan Cheese|   2024-04-01|    $12.00|
+---------------+-------------+----------+
only showing top 3 rows



In [None]:
# Same conversion as a schema check: Date_Received should now be a date column.
(
    inv_df
    .withColumn("Date_Received", to_date(col("Date_Received"), "M/d/yyyy"))
    .printSchema()
)

root
 |-- Product_Name: string (nullable = true)
 |-- Date_Received: date (nullable = true)
 |-- Unit_Price: string (nullable = true)



In [None]:
from pyspark.sql.functions import regexp_replace

# Strip the literal "$" from Unit_Price and cast the remainder to double.
(
    inv_df
    .withColumn(
        "Unit_Price",
        regexp_replace(col("Unit_Price"), r"\$", "").cast("double"),
    )
    .show(3)
)

+---------------+-------------+----------+
|   Product_Name|Date_Received|Unit_Price|
+---------------+-------------+----------+
|    Bell Pepper|     3/1/2024|       4.6|
|  Vegetable Oil|     4/1/2024|       2.0|
|Parmesan Cheese|     4/1/2024|      12.0|
+---------------+-------------+----------+
only showing top 3 rows



In [None]:
# Same price clean-up as a schema check: Unit_Price should now be a double.
(
    inv_df
    .withColumn(
        "Unit_Price",
        regexp_replace(col("Unit_Price"), r"\$", "").cast("double"),
    )
    .printSchema()
)

root
 |-- Product_Name: string (nullable = true)
 |-- Date_Received: string (nullable = true)
 |-- Unit_Price: double (nullable = true)



In [None]:
# Typed working DataFrame: Date_Received parsed with the M/d/yyyy pattern and
# Unit_Price converted to double after removing the "$" sign.
inv_final = (
    inv_df
    .withColumn("Date_Received", to_date(col("Date_Received"), "M/d/yyyy"))
    .withColumn(
        "Unit_Price",
        regexp_replace(col("Unit_Price"), r"\$", "").cast("double"),
    )
)

In [None]:
# Print the column types of inv_final to verify both conversions.
inv_final.printSchema()

root
 |-- Product_Name: string (nullable = true)
 |-- Date_Received: date (nullable = true)
 |-- Unit_Price: double (nullable = true)



In [None]:
from pyspark.sql.functions import current_date

# Add a "Today" column holding the current date.
(
    inv_final
    .withColumn("Today", current_date())
    .show(3)
)

+---------------+-------------+----------+----------+
|   Product_Name|Date_Received|Unit_Price|     Today|
+---------------+-------------+----------+----------+
|    Bell Pepper|   2024-03-01|       4.6|2025-03-22|
|  Vegetable Oil|   2024-04-01|       2.0|2025-03-22|
|Parmesan Cheese|   2024-04-01|      12.0|2025-03-22|
+---------------+-------------+----------+----------+
only showing top 3 rows



In [None]:
from pyspark.sql.functions import current_timestamp, now

# Show now() and current_timestamp() side by side for comparison.
(
    inv_final
    .withColumn("now", now())
    .withColumn("current_timestamp", current_timestamp())
    .show(3)
)

+---------------+-------------+----------+--------------------+--------------------+
|   Product_Name|Date_Received|Unit_Price|                 now|   current_timestamp|
+---------------+-------------+----------+--------------------+--------------------+
|    Bell Pepper|   2024-03-01|       4.6|2025-03-22 23:07:...|2025-03-22 23:07:...|
|  Vegetable Oil|   2024-04-01|       2.0|2025-03-22 23:07:...|2025-03-22 23:07:...|
|Parmesan Cheese|   2024-04-01|      12.0|2025-03-22 23:07:...|2025-03-22 23:07:...|
+---------------+-------------+----------+--------------------+--------------------+
only showing top 3 rows



In [None]:
from pyspark.sql.functions import date_diff

# Number of days between today and the date each product was received.
(
    inv_final
    .withColumn("Difference", date_diff(current_date(), col("Date_Received")))
    .show(3)
)

+---------------+-------------+----------+----------+
|   Product_Name|Date_Received|Unit_Price|Difference|
+---------------+-------------+----------+----------+
|    Bell Pepper|   2024-03-01|       4.6|       386|
|  Vegetable Oil|   2024-04-01|       2.0|       355|
|Parmesan Cheese|   2024-04-01|      12.0|       355|
+---------------+-------------+----------+----------+
only showing top 3 rows



In [None]:
from pyspark.sql.functions import date_format

# Render Date_Received as a dd.MM.yyyy string in a new column.
(
    inv_final
    .withColumn(
        "Date_string_formatted",
        date_format(col("Date_Received"), "dd.MM.yyyy"),
    )
    .show(3)
)

+---------------+-------------+----------+---------------------+
|   Product_Name|Date_Received|Unit_Price|Date_string_formatted|
+---------------+-------------+----------+---------------------+
|    Bell Pepper|   2024-03-01|       4.6|           01.03.2024|
|  Vegetable Oil|   2024-04-01|       2.0|           01.04.2024|
|Parmesan Cheese|   2024-04-01|      12.0|           01.04.2024|
+---------------+-------------+----------+---------------------+
only showing top 3 rows



In [None]:
from pyspark.sql.functions import date_add

# Shift Date_Received forward by 14 days.
(
    inv_final
    .withColumn("Date_plus_14", date_add(col("Date_Received"), 14))
    .show(3)
)


+---------------+-------------+----------+------------+
|   Product_Name|Date_Received|Unit_Price|Date_plus_14|
+---------------+-------------+----------+------------+
|    Bell Pepper|   2024-03-01|       4.6|  2024-03-15|
|  Vegetable Oil|   2024-04-01|       2.0|  2024-04-15|
|Parmesan Cheese|   2024-04-01|      12.0|  2024-04-15|
+---------------+-------------+----------+------------+
only showing top 3 rows



In [None]:
from pyspark.sql.functions import date_part, month, year, day, lit

# Extract year / month / day from Date_Received in two ways: the generic
# date_part(field, source) form and the dedicated year()/month()/day() helpers.
# The new columns are appended in the order listed in the mapping.
inv_final.withColumns(
    {
        "Year": date_part(lit('YEAR'), col('Date_Received')),
        "Month": date_part(lit('MONTH'), col('Date_Received')),
        "Day": date_part(lit('DAY'), col('Date_Received')),
        "Year2": year(col('Date_Received')),
        "Month2": month(col('Date_Received')),
        "Day2": day(col('Date_Received')),
    }
).show(3)

+---------------+-------------+----------+----+-----+---+-----+------+----+
|   Product_Name|Date_Received|Unit_Price|Year|Month|Day|Year2|Month2|Day2|
+---------------+-------------+----------+----+-----+---+-----+------+----+
|    Bell Pepper|   2024-03-01|       4.6|2024|    3|  1| 2024|     3|   1|
|  Vegetable Oil|   2024-04-01|       2.0|2024|    4|  1| 2024|     4|   1|
|Parmesan Cheese|   2024-04-01|      12.0|2024|    4|  1| 2024|     4|   1|
+---------------+-------------+----------+----+-----+---+-----+------+----+
only showing top 3 rows



Reference: PySpark date and time functions — https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html#datetime-functions

In [None]:
from pyspark.sql.functions import col, regexp_replace, to_date, when, lit, concat, trim

# Sample rows of free-form date text paired with a year value.
# NOTE(review): the two 2004 entries carry year 2024 — confirm whether that
# mismatch is intentional sample noise or a typo.
data = [
    ("05-Feb-2020", 2020),
    ("Reported 30-Jan-2020", 2020),
    ("9.07.2019", 2019),
    ("May 2018", 2018),
    ("Sep-2017", 2017),
    ("November 2011", 2011),
    ("19-Jul-2007.b", 2007),
    ("19-Jul-2007.a", 2007),
    ("19-Jul-2004 Reported to have happened \"on the weekend\"", 2024),
    ("2004", 2024),
    ("Early Sep-2000", 2000),
]

# Build a two-column DataFrame from the tuples above.
df = spark.createDataFrame(data, schema=["date", "year"])



In [None]:
# Display the sample rows using show()'s default settings.
df.show()

In [None]:
!pip install dateparser

In [None]:
import dateparser
from pyspark.sql.functions import udf, substring, length, col, regexp_extract
from pyspark.sql.types import DateType

# Helper for the UDF: recognise a date in free-form text
def parse_date(text):
    """Parse a free-form date string into a ``datetime.date``.

    Args:
        text: Raw date text. May be None (e.g. a NULL column value), a
            non-string value, or a blank string.

    Returns:
        The ``.date()`` of the value returned by ``dateparser.parse(text)``,
        or None when ``text`` is not a string, is blank, or dateparser
        returns a falsy result.
    """
    # Guard before calling dateparser: a NULL cell reaches the UDF as None,
    # and a non-string argument would raise inside dateparser.parse and fail
    # the whole Spark job instead of yielding a NULL date for that row.
    if not isinstance(text, str) or not text.strip():
        return None
    result = dateparser.parse(text)
    return result.date() if result else None

# Wrap parse_date as a Spark UDF whose declared return type is DateType
parse_date_udf = udf(parse_date, returnType=DateType())


In [None]:

# Normalise the raw text before parsing: remove the words "Reported " and
# "Early " (case-insensitive), then keep only the text up to and including
# the last four-digit run so trailing notes and suffixes are dropped.
df1 = (
    df
    .withColumn("date", regexp_replace(col("date"), r"(?i)Reported ", ""))
    .withColumn("date", regexp_replace(col("date"), r"(?i)Early ", ""))
    .withColumn("date", regexp_extract(col("date"), r"(.*\d{4})", 1))
)

# Apply the UDF to the cleaned column
df_with_date = df1.withColumn("parsed_date", parse_date_udf(col("date")))
df_with_date.show(truncate=False)


In [None]:

# NOTE(review): `italy_shark` and `italy_count` are not defined in any cell
# visible in this notebook — confirm where they come from, otherwise this
# cell fails on a fresh kernel.
#
# Clean-up steps, in order:
#   1. remove the qualifiers "Reported ", "Early ", "Mid ", "Late "
#      (case-insensitive);
#   2. keep the text from the start up to the first four-digit run;
#   3. drop any text up to and including a space that is immediately
#      followed by four digits.
italy_shark1 = (
    italy_shark
    .withColumn("date", regexp_replace(col("date"), r"(?i)Reported ", ""))
    .withColumn("date", regexp_replace(col("date"), r"(?i)Early ", ""))
    .withColumn("date", regexp_replace(col("date"), r"(?i)Mid ", ""))
    .withColumn("date", regexp_replace(col("date"), r"(?i)Late ", ""))
    .withColumn("date", regexp_extract(col("date"), r"(^.*?\d{4})", 1))
    .withColumn("date", regexp_replace(col("date"), r".* (?=\d{4})", ""))
)

# Apply the UDF to the cleaned column
italy_with_date = italy_shark1.withColumn("parsed_date", parse_date_udf(col("date")))
italy_with_date.show(n=italy_count, truncate=False)
