# Ex-2090 - Date expressions


In [1]:
import requests

! curl -L -o data.zip  https://www.kaggle.com/api/v1/datasets/download/zahidmughal2343/amazon-sales-2025
! unzip data.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  3617  100  3617    0     0   3314      0  0:00:01  0:00:01 --:--:--  3314
Archive:  data.zip
replace amazon_sales_data 2025.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: amazon_sales_data 2025.csv  


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("myApp").master("local[*]").getOrCreate()

In [3]:
# 2. Load the CSV file assuming headers exist and let Spark infer data types
df = spark.read.option("header", "true").option("inferSchema", "true").csv("amazon_sales_data 2025.csv")

In [4]:
# 3. Check the structure of the DataFrame
df.printSchema()

root
 |-- Order ID: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Total Sales: integer (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Customer Location: string (nullable = true)
 |-- Payment Method: string (nullable = true)
 |-- Status: string (nullable = true)



In [5]:
from pyspark.sql.functions import col, to_date, current_date, datediff, dayofweek, month, date_format

# 2. Convert `date` column to proper date format (assuming 'yy-MM-dd' format)
df = df.withColumn("date", to_date(col("date"), "yy-MM-dd")).select("date")
df.show()
df.printSchema()

+----------+
|      date|
+----------+
|2014-03-25|
|2020-03-25|
|2015-02-25|
|2019-02-25|
|2010-03-25|
|2014-03-25|
|2018-03-25|
|2002-03-25|
|2008-03-25|
|2012-03-25|
|2017-02-25|
|2013-03-25|
|2001-03-25|
|2004-03-25|
|2020-02-25|
|2026-02-25|
|2001-04-25|
|2010-02-25|
|2022-03-25|
|2007-03-25|
+----------+
only showing top 20 rows

root
 |-- date: date (nullable = true)



In [6]:
# 3. Add `day_of_week` column with weekday name
df = df.withColumn("day_of_week", date_format(col("date"), "EEEE"))
df.show()

+----------+-----------+
|      date|day_of_week|
+----------+-----------+
|2014-03-25|    Tuesday|
|2020-03-25|  Wednesday|
|2015-02-25|  Wednesday|
|2019-02-25|     Monday|
|2010-03-25|   Thursday|
|2014-03-25|    Tuesday|
|2018-03-25|     Sunday|
|2002-03-25|     Monday|
|2008-03-25|    Tuesday|
|2012-03-25|     Sunday|
|2017-02-25|   Saturday|
|2013-03-25|     Monday|
|2001-03-25|     Sunday|
|2004-03-25|   Thursday|
|2020-02-25|    Tuesday|
|2026-02-25|  Wednesday|
|2001-04-25|  Wednesday|
|2010-02-25|   Thursday|
|2022-03-25|     Friday|
|2007-03-25|     Sunday|
+----------+-----------+
only showing top 20 rows



In [7]:
# 4. Add `today` column with the current date
df = df.withColumn("today", current_date())
df.show()

+----------+-----------+----------+
|      date|day_of_week|     today|
+----------+-----------+----------+
|2014-03-25|    Tuesday|2025-05-03|
|2020-03-25|  Wednesday|2025-05-03|
|2015-02-25|  Wednesday|2025-05-03|
|2019-02-25|     Monday|2025-05-03|
|2010-03-25|   Thursday|2025-05-03|
|2014-03-25|    Tuesday|2025-05-03|
|2018-03-25|     Sunday|2025-05-03|
|2002-03-25|     Monday|2025-05-03|
|2008-03-25|    Tuesday|2025-05-03|
|2012-03-25|     Sunday|2025-05-03|
|2017-02-25|   Saturday|2025-05-03|
|2013-03-25|     Monday|2025-05-03|
|2001-03-25|     Sunday|2025-05-03|
|2004-03-25|   Thursday|2025-05-03|
|2020-02-25|    Tuesday|2025-05-03|
|2026-02-25|  Wednesday|2025-05-03|
|2001-04-25|  Wednesday|2025-05-03|
|2010-02-25|   Thursday|2025-05-03|
|2022-03-25|     Friday|2025-05-03|
|2007-03-25|     Sunday|2025-05-03|
+----------+-----------+----------+
only showing top 20 rows



In [8]:
# 5. Calculate the number of days between `date` and `today`
df = df.withColumn("days_difference", datediff(col("today"), col("date")))
df.show()

+----------+-----------+----------+---------------+
|      date|day_of_week|     today|days_difference|
+----------+-----------+----------+---------------+
|2014-03-25|    Tuesday|2025-05-03|           4057|
|2020-03-25|  Wednesday|2025-05-03|           1865|
|2015-02-25|  Wednesday|2025-05-03|           3720|
|2019-02-25|     Monday|2025-05-03|           2259|
|2010-03-25|   Thursday|2025-05-03|           5518|
|2014-03-25|    Tuesday|2025-05-03|           4057|
|2018-03-25|     Sunday|2025-05-03|           2596|
|2002-03-25|     Monday|2025-05-03|           8440|
|2008-03-25|    Tuesday|2025-05-03|           6248|
|2012-03-25|     Sunday|2025-05-03|           4787|
|2017-02-25|   Saturday|2025-05-03|           2989|
|2013-03-25|     Monday|2025-05-03|           4422|
|2001-03-25|     Sunday|2025-05-03|           8805|
|2004-03-25|   Thursday|2025-05-03|           7709|
|2020-02-25|    Tuesday|2025-05-03|           1894|
|2026-02-25|  Wednesday|2025-05-03|           -298|
|2001-04-25|

In [9]:
# 6. Extract month number into `MonthNumber`
df = df.withColumn("MonthNumber", month(col("date")))
df.show()

+----------+-----------+----------+---------------+-----------+
|      date|day_of_week|     today|days_difference|MonthNumber|
+----------+-----------+----------+---------------+-----------+
|2014-03-25|    Tuesday|2025-05-03|           4057|          3|
|2020-03-25|  Wednesday|2025-05-03|           1865|          3|
|2015-02-25|  Wednesday|2025-05-03|           3720|          2|
|2019-02-25|     Monday|2025-05-03|           2259|          2|
|2010-03-25|   Thursday|2025-05-03|           5518|          3|
|2014-03-25|    Tuesday|2025-05-03|           4057|          3|
|2018-03-25|     Sunday|2025-05-03|           2596|          3|
|2002-03-25|     Monday|2025-05-03|           8440|          3|
|2008-03-25|    Tuesday|2025-05-03|           6248|          3|
|2012-03-25|     Sunday|2025-05-03|           4787|          3|
|2017-02-25|   Saturday|2025-05-03|           2989|          2|
|2013-03-25|     Monday|2025-05-03|           4422|          3|
|2001-03-25|     Sunday|2025-05-03|     

In [10]:
# 7. Add `MonthName` column with the name of the month
df = df.withColumn("MonthName", date_format(col("date"), "MMMM"))
df.show()

+----------+-----------+----------+---------------+-----------+---------+
|      date|day_of_week|     today|days_difference|MonthNumber|MonthName|
+----------+-----------+----------+---------------+-----------+---------+
|2014-03-25|    Tuesday|2025-05-03|           4057|          3|    March|
|2020-03-25|  Wednesday|2025-05-03|           1865|          3|    March|
|2015-02-25|  Wednesday|2025-05-03|           3720|          2| February|
|2019-02-25|     Monday|2025-05-03|           2259|          2| February|
|2010-03-25|   Thursday|2025-05-03|           5518|          3|    March|
|2014-03-25|    Tuesday|2025-05-03|           4057|          3|    March|
|2018-03-25|     Sunday|2025-05-03|           2596|          3|    March|
|2002-03-25|     Monday|2025-05-03|           8440|          3|    March|
|2008-03-25|    Tuesday|2025-05-03|           6248|          3|    March|
|2012-03-25|     Sunday|2025-05-03|           4787|          3|    March|
|2017-02-25|   Saturday|2025-05-03|   