In [3]:
# Spark entry point plus the column-type and column-function namespaces used by the cells below.
from pyspark.sql import SparkSession
# NOTE(review): the two wildcard imports below place every public name of those modules in the
# notebook namespace; pyspark.sql.functions exports names such as sum/min/max that shadow the
# Python built-ins of the same name. Prefer explicit imports if this cell is ever reworked.
from pyspark.sql.types import *
from pyspark.sql.functions import *  # supplies col, lit, current_date, date_add, to_date, ... used in later cells
# Reuse the active SparkSession if there is one, otherwise start a new session.
spark = SparkSession.builder.getOrCreate()
from pyspark.sql.functions import regexp_replace, col  # both names are already provided by the wildcard import above
from google.colab import drive  # only referenced by the commented-out drive.mount(...) call below
# NOTE(review): os.truncate is not called in any visible cell (show(truncate=False) is a keyword
# argument, not this name) - presumably an accidental auto-import; confirm before removing.
from os import truncate
# Mount Google Drive with a longer timeout
# drive.mount('/content/drive', force_remount=True, timeout_ms=300000)

# df_employee_data = "/content/drive/MyDrive/Colab Notebooks/dataSet/employee_data.csv"
# employeeSechema = StructType([
#     StructField("ID",IntegerType() ,True),
#     StructField("Name",StringType() ,True),
#     StructField("Age",IntegerType() ,True),
#     StructField("Salary",FloatType() ,True),
#     StructField("Joining_Date",DateType() ,True),
#     StructField("Department",StringType() ,True),
#     StructField("Performance_Rating",IntegerType() ,True),
#     StructField("Email",StringType() ,True),
#     StructField("Address",StringType() ,True),
#     StructField("Phone",StringType() ,True)

# ])
# # Load the DataFrame with the defined schema
# #df = spark.read.csv(path=df_employee_data, header=True, schema=employeeSechema)
# df = spark.read.load(path="/content/drive/MyDrive/Colab Notebooks/dataSet/employee_data.csv", format="csv", header = True, schema=employeeSechema)
# df.printSchema()
# df.show(50)

## Date Function in Dataframe – Part 1

## Code Explanation with Notes
1. Creating a Spark Session:
  * We begin by creating a Spark session to run the PySpark operations.
2. Generating a DataFrame:
  * Using spark.range(10) creates a DataFrame with 10 rows and a single column (id) with numbers ranging from 0 to 9.
  * Two additional columns are added:
    * Today: Contains the current date using current_date().
    * Now: Contains the current timestamp using current_timestamp().
3. Date Manipulation Functions:
  * date_add: Adds a specified number of days to the date.
  * date_sub: Subtracts a specified number of days from the date.
  * datediff: Returns the difference in days between two dates.
  * months_between: Returns the number of months between two dates.





In [4]:
# Ten rows (id 0-9), each stamped with the current date ("Today") and the current timestamp ("Now").
dateDf = (
    spark.range(10)
    .withColumn("Today", current_date())
    .withColumn("Now", current_timestamp())
)
# truncate=False keeps the full microsecond timestamp visible in the printed table.
dateDf.show(truncate=False)

+---+----------+--------------------------+
|id |Today     |Now                       |
+---+----------+--------------------------+
|0  |2025-03-27|2025-03-27 16:25:33.293214|
|1  |2025-03-27|2025-03-27 16:25:33.293214|
|2  |2025-03-27|2025-03-27 16:25:33.293214|
|3  |2025-03-27|2025-03-27 16:25:33.293214|
|4  |2025-03-27|2025-03-27 16:25:33.293214|
|5  |2025-03-27|2025-03-27 16:25:33.293214|
|6  |2025-03-27|2025-03-27 16:25:33.293214|
|7  |2025-03-27|2025-03-27 16:25:33.293214|
|8  |2025-03-27|2025-03-27 16:25:33.293214|
|9  |2025-03-27|2025-03-27 16:25:33.293214|
+---+----------+--------------------------+



### 2. date_add and date_sub:

In [6]:
# Shift "Today" five days back and five days forward, then display both results side by side.
five_days_before = date_sub(col("Today"), 5).alias("date_sub_5_days")
five_days_after = date_add(col("Today"), 5).alias("date_add_5_days")
dateDf.select(five_days_before, five_days_after).show()

+---------------+---------------+
|date_sub_5_days|date_add_5_days|
+---------------+---------------+
|     2025-03-22|     2025-04-01|
|     2025-03-22|     2025-04-01|
|     2025-03-22|     2025-04-01|
|     2025-03-22|     2025-04-01|
|     2025-03-22|     2025-04-01|
|     2025-03-22|     2025-04-01|
|     2025-03-22|     2025-04-01|
|     2025-03-22|     2025-04-01|
|     2025-03-22|     2025-04-01|
|     2025-03-22|     2025-04-01|
+---------------+---------------+



### 3. Datediff:
  * datediff(end, start) returns end − start in days. The cell below passes week_ago (7 days before the current date) as end and the current date as start, so the result is -7.


In [7]:
# Day difference between "week_ago" (7 days before "Today") and "Today".
# datediff(end, start) returns end - start, so passing the earlier date first yields -7.
# The column is referenced as "Today", exactly as it was created, so the lookup does not
# depend on Spark's case-insensitive column resolution (spark.sql.caseSensitive=false).
(
    dateDf
    .withColumn("week_ago", date_sub(col("Today"), 7))
    .select(datediff(col("week_ago"), col("Today")).alias("days_difference"))
    .show()
)

+---------------+
|days_difference|
+---------------+
|             -7|
|             -7|
|             -7|
|             -7|
|             -7|
|             -7|
|             -7|
|             -7|
|             -7|
|             -7|
+---------------+



### 4. months_between:
* months_between(date1, date2) returns date1 − date2 in months. months_between(to_date(lit("2016-01-01")), to_date(lit("2017-01-01"))) therefore yields -12.0, because the first argument (January 1, 2016) is 12 months earlier than the second (January 1, 2017).

In [9]:
# Months separating two fixed dates. months_between(date1, date2) is date1 - date2,
# so an earlier first argument produces a negative value.
date_pair = dateDf.select(
    to_date(lit("2016-01-01")).alias("start_date"),
    to_date(lit("2017-01-01")).alias("end_date"),
)
month_gap = months_between(col("start_date"), col("end_date")).alias("months_between")
date_pair.select(month_gap).show()



+--------------+
|months_between|
+--------------+
|         -12.0|
|         -12.0|
|         -12.0|
|         -12.0|
|         -12.0|
|         -12.0|
|         -12.0|
|         -12.0|
|         -12.0|
|         -12.0|
+--------------+



## Date Function in Dataframe – Part 2

### 1. Default Date Parsing (to_date):
  * When using to_date(), the default date format is yyyy-MM-dd.
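  * If the string does not match this format (for example "2016-20-12", where 20 is not a valid month), to_date returns null instead of raising an error (with ANSI mode disabled, as in the output below).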

In [10]:
# Parse two string literals with to_date's default yyyy-MM-dd pattern:
# "2016-20-12" has no valid month in that position, "2025-12-11" does.
unparseable = to_date(lit("2016-20-12")).alias("incorrect_date")
parseable = to_date(lit("2025-12-11")).alias("correct_date")
dateDf.select(unparseable, parseable).show()

+--------------+------------+
|incorrect_date|correct_date|
+--------------+------------+
|          NULL|  2025-12-11|
|          NULL|  2025-12-11|
|          NULL|  2025-12-11|
|          NULL|  2025-12-11|
|          NULL|  2025-12-11|
|          NULL|  2025-12-11|
|          NULL|  2025-12-11|
|          NULL|  2025-12-11|
|          NULL|  2025-12-11|
|          NULL|  2025-12-11|
+--------------+------------+



### 2. Handling Custom Date Formats:
  * You can specify a custom date format using the to_date function by providing a format string, such as yyyy-dd-MM.
  * This allows PySpark to correctly parse the dates that deviate from the default format.

In [None]:
# Parse year-day-month strings by passing an explicit pattern to to_date.
# The original cell was unfinished: lit() and to_date() were called with no arguments
# (a TypeError at run time) and dateFormat was never used.
dateFormat = "yyyy-dd-MM"
cleanDateDf = spark.range(5).select(
    # "2016-20-12" is the literal that the default yyyy-MM-dd pattern could not parse;
    # read as yyyy-dd-MM it is day 20 of month 12.
    to_date(lit("2016-20-12"), dateFormat).alias("parsed_date"),
    # Read as yyyy-dd-MM this is day 11 of month 12.
    to_date(lit("2025-11-12"), dateFormat).alias("parsed_date_2"),
)
cleanDateDf.show()