In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("Test").master("local[*]").getOrCreate()
spark

In [0]:
data=[[1,'A','17-01-2023'],
      [2,'B','28-09-2021'],
      [3,'C','05-12-2016'],
      [4,'D',None]]
schema=['id','name','date']
df=spark.createDataFrame(data=data,schema=schema)
df.show()
df.printSchema()

+---+----+----------+
| id|name|      date|
+---+----+----------+
|  1|   A|17-01-2023|
|  2|   B|28-09-2021|
|  3|   C|05-12-2016|
|  4|   D|      null|
+---+----+----------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- date: string (nullable = true)



In [0]:
df_date=df.withColumn('date',to_date(col('date'),'dd-MM-yyyy'))
df_date.show()
df_date.printSchema()

+---+----+----------+
| id|name|      date|
+---+----+----------+
|  1|   A|2023-01-17|
|  2|   B|2021-09-28|
|  3|   C|2016-12-05|
|  4|   D|      null|
+---+----+----------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- date: date (nullable = true)



In [0]:
df_date2=df_date.withColumn(
    'date',when(col('date').isNull(),current_date())\
    .otherwise(col('date'))
)
df_date2.show()

+---+----+----------+
| id|name|      date|
+---+----+----------+
|  1|   A|2023-01-17|
|  2|   B|2021-09-28|
|  3|   C|2016-12-05|
|  4|   D|2024-09-19|
+---+----+----------+



In [0]:
df_date2

In [0]:
df_date_cleaned=df_date.dropna()
df_date_cleaned.show()

+---+----+----------+
| id|name|      date|
+---+----+----------+
|  1|   A|2023-01-17|
|  2|   B|2021-09-28|
|  3|   C|2016-12-05|
+---+----+----------+



In [0]:
df_date_cleaned2=df_date_cleaned.withColumn("WithTimestamp",current_timestamp())
df_date_cleaned2.show(truncate=False)

df_date_cleaned2.select(weekofyear(col('date'))).show()
df_date_cleaned2.select(next_day(col('date'),'Sunday')).show()

+---+----+----------+-----------------------+
|id |name|date      |WithTimestamp          |
+---+----+----------+-----------------------+
|1  |A   |2023-01-17|2024-09-19 05:46:23.647|
|2  |B   |2021-09-28|2024-09-19 05:46:23.647|
|3  |C   |2016-12-05|2024-09-19 05:46:23.647|
+---+----+----------+-----------------------+

+----------------+
|weekofyear(date)|
+----------------+
|               3|
|              39|
|              49|
+----------------+

+----------------------+
|next_day(date, Sunday)|
+----------------------+
|            2023-01-22|
|            2021-10-03|
|            2016-12-11|
+----------------------+



In [0]:
df_date_cleaned2.select(hour(col('WithTimestamp'))).show()
df_date_cleaned2.select(minute(col('WithTimestamp'))).show()
df_date_cleaned2.select(second(col('WithTimestamp'))).show()

+-------------------+
|hour(WithTimestamp)|
+-------------------+
|                  5|
|                  5|
|                  5|
+-------------------+

+---------------------+
|minute(WithTimestamp)|
+---------------------+
|                   55|
|                   55|
|                   55|
+---------------------+

+---------------------+
|second(WithTimestamp)|
+---------------------+
|                   26|
|                   26|
|                   26|
+---------------------+



In [0]:
df_date3=df_date2.withColumn('Days_Diff',datediff(current_date(),col('date')))
df_date4=df_date2.withColumn('Months_Diff',months_between(current_date(),col('date')))
df_date5=df_date2.withColumn('Years_Diff',months_between(current_date(),col('date'))/12)
df_date3.show()
df_date4.show()
df_date5.show()

+---+----+----------+---------+
| id|name|      date|Days_Diff|
+---+----+----------+---------+
|  1|   A|2023-01-17|      611|
|  2|   B|2021-09-28|     1087|
|  3|   C|2016-12-05|     2845|
|  4|   D|2024-09-19|        0|
+---+----+----------+---------+

+---+----+----------+-----------+
| id|name|      date|Months_Diff|
+---+----+----------+-----------+
|  1|   A|2023-01-17|20.06451613|
|  2|   B|2021-09-28|35.70967742|
|  3|   C|2016-12-05| 93.4516129|
|  4|   D|2024-09-19|        0.0|
+---+----+----------+-----------+

+---+----+----------+------------------+
| id|name|      date|        Years_Diff|
+---+----+----------+------------------+
|  1|   A|2023-01-17|1.6720430108333335|
|  2|   B|2021-09-28|2.9758064516666667|
|  3|   C|2016-12-05| 7.787634408333333|
|  4|   D|2024-09-19|               0.0|
+---+----+----------+------------------+



In [0]:
df_date_cleaned_curr_date=df_date_cleaned.withColumn('Today_date',current_date())
df_date01=df_date_cleaned_curr_date.withColumn('Years',year(current_date())-year(col('date')))

# Difference in months,ignoring full years (using modulo 12 to get remaining months)
df_date02=df_date01.withColumn('Months',expr(
        "abs(months_between(current_date(),date_add(date,Years*365))%12)"
    ).cast('int')
)

# Difference in days(remaining days after subtracting years and months)
df_date03=df_date02.withColumn('Days', 
    expr(
        "abs(datediff(current_date(),date_add(date_add(date,Years *365),Months*30)))"
    )
)

df_date03.select('date','Years','Months','Days').show(truncate=False)
df_date03_final=df_date03.withColumn('Months',expr("floor(Days/30)+Months"))\
                         .withColumn('Days',expr("floor(Days%30)"))
df_date03_final.show()

+----------+-----+------+----+
|date      |Years|Months|Days|
+----------+-----+------+----+
|2023-01-17|1    |8     |6   |
|2021-09-28|3    |0     |8   |
|2016-12-05|8    |2     |135 |
+----------+-----+------+----+

+---+----+----------+----------+-----+------+----+
| id|name|      date|Today_date|Years|Months|Days|
+---+----+----------+----------+-----+------+----+
|  1|   A|2023-01-17|2024-09-19|    1|     8|   6|
|  2|   B|2021-09-28|2024-09-19|    3|     0|   8|
|  3|   C|2016-12-05|2024-09-19|    8|     6|  15|
+---+----+----------+----------+-----+------+----+



In [0]:
df_date04=df_date03_final.withColumn("Exact",concat(col('Years'),lit('Y_'),col('Months'),lit('M_'),col('Days'),lit('D')))
df_date04.show()
df_date04.select(dayofmonth(col('date')).alias('DayOfThatMonth')).show()

+---+----+----------+----------+-----+------+----+---------+
| id|name|      date|Today_date|Years|Months|Days|    Exact|
+---+----+----------+----------+-----+------+----+---------+
|  1|   A|2023-01-17|2024-09-19|    1|     8|   6| 1Y_8M_6D|
|  2|   B|2021-09-28|2024-09-19|    3|     0|   8| 3Y_0M_8D|
|  3|   C|2016-12-05|2024-09-19|    8|     6|  15|8Y_6M_15D|
+---+----+----------+----------+-----+------+----+---------+

+--------------+
|DayOfThatMonth|
+--------------+
|            17|
|            28|
|             5|
+--------------+



In [0]:
#Filter between a year range
df_filter=df_date04.where(
    (year(col('date')) >= 2021) & (year(col('date')) <= 2023)
)
df_filter.select('id','name').show()

#year(col('date')): Extracts the year from the date column
#Condition: The conditions (year(col('date')) >= 2021) and (year(col('date')) <= 2023) are combined with &

+---+----+
| id|name|
+---+----+
|  1|   A|
|  2|   B|
+---+----+



In [0]:
#Filter between a date range

start_date='2017-01-01'
end_date='2023-12-31'

start_date_expr=to_date(lit(start_date))
end_date_expr=to_date(lit(end_date))

df_filtered=df_date04.filter(
    (col('date').isNotNull()) &
    (col('date')>=start_date_expr) &
    (col('date')<=end_date_expr)
)

df_filtered.select('id','name').show()

''' lit() is Necessary:
Column Operations: PySpark DataFrames operate with column expressions, so any value used in operations needs to be wrapped as a column.
Consistent Data Types: Using lit() ensures that constant values are treated as columns with appropriate data types, allowing them to be used correctly in expressions and comparisons.
'''

+---+----+
| id|name|
+---+----+
|  1|   A|
|  2|   B|
+---+----+



In [0]:
df_date04.show()

+---+----+----------+----------+-----+------+----+---------+
| id|name|      date|Today_date|Years|Months|Days|    Exact|
+---+----+----------+----------+-----+------+----+---------+
|  1|   A|2023-01-17|2024-09-19|    1|     8|   6| 1Y_8M_6D|
|  2|   B|2021-09-28|2024-09-19|    3|     0|   8| 3Y_0M_8D|
|  3|   C|2016-12-05|2024-09-19|    8|     6|  15|8Y_6M_15D|
+---+----+----------+----------+-----+------+----+---------+



In [0]:
df_date04.select(date_add(col('date'),1)).show()
df_date04.select(date_sub(col('date'),10)).show()
df_date04.select(add_months(col('date'),1)).show()

+-----------------+
|date_add(date, 1)|
+-----------------+
|       2023-01-18|
|       2021-09-29|
|       2016-12-06|
+-----------------+

+------------------+
|date_sub(date, 10)|
+------------------+
|        2023-01-07|
|        2021-09-18|
|        2016-11-25|
+------------------+

+-------------------+
|add_months(date, 1)|
+-------------------+
|         2023-02-17|
|         2021-10-28|
|         2017-01-05|
+-------------------+

