In [0]:
from pyspark.sql.functions import *
from pyspark.sql import *
from pyspark.sql.types import StringType, IntegerType, StructField,StructType

In [0]:

# Reuse the active SparkSession if one already exists (managed notebook
# environments presumably pre-define `spark`); otherwise create one so this
# notebook also runs top-to-bottom on a fresh kernel where `spark` is undefined.
spark = SparkSession.builder.getOrCreate()

# Sample attendance-log rows, one tuple per log entry:
# (log_id, emp_id, work_date, check_in, check_out, timezone).
# All date/time values are plain strings at this point.
# Note: log_id 6 does not appear in this sample.
data = [
 (1, 'E001', '2024-12-01', '2024-12-01 09:10:15', '2024-12-01 18:05:40', 'Asia/Kolkata'),
 (2, 'E001', '2024-12-01', '2024-12-01 19:00:00', '2024-12-01 22:30:00', 'Asia/Kolkata'),
 (3, 'E002', '2024-12-02', '2024-12-02 09:45:10', '2024-12-02 17:15:00', 'UTC'),
 (4, 'E003', '2024-02-29', '2024-02-29 08:55:00', '2024-02-29 16:40:00', 'Asia/Kolkata'),
 (5, 'E004', '2024-11-30', '2024-11-30 23:50:00', '2024-12-01 08:10:00', 'UTC'),
 (7, 'E006', '2023-12-31', '2023-12-31 23:59:59', '2024-01-01 08:00:00', 'UTC'),
 (8, 'E007', '2024-06-15', '2024-06-15 07:30:00', '2024-06-15 12:45:30', 'Asia/Kolkata')
]

# Explicit schema: log_id is an integer, every other field is a string.
# All fields are declared nullable (third StructField argument is True).
mySchema = StructType([
    StructField("log_id", IntegerType(), True),
    StructField("emp_id", StringType(), True),
    StructField("work_date", StringType(), True),
    StructField("check_in", StringType(), True),
    StructField("check_out", StringType(), True),
    StructField("timezone", StringType(), True),
])

df = spark.createDataFrame(data, mySchema)

df.show()


In [0]:
# Check the data type of every column. At this point log_id is an integer and
# all other columns (including the date/time ones) are strings, per mySchema.

df.printSchema()

##### Convert string columns to date and timestamp types
- `to_date()`: `to_date("columnName", "yyyy-MM-dd")`, or `to_date("columnName")` with the format omitted
- `to_timestamp()`: `to_timestamp("colName", "yyyy-MM-dd HH:mm:ss")`, or `to_timestamp("colName")` with the format omitted

In [0]:
# Parse the string columns into typed columns, overwriting them in place:
# work_date becomes a date, check_in / check_out become timestamps.
# The explicit patterns match the layout of the sample strings.
df2 = (
    df
    .withColumn("work_date", to_date("work_date", "yyyy-MM-dd"))
    .withColumn("check_in", to_timestamp("check_in", "yyyy-MM-dd HH:mm:ss"))
    .withColumn("check_out", to_timestamp("check_out", "yyyy-MM-dd HH:mm:ss"))
)

df2.show()

In [0]:
# Print the schema of df2 to inspect the column types after the
# to_date / to_timestamp conversion.
df2.printSchema()

In [0]:
# If the timestamp strings may contain malformed / unparseable values, use the
# try_to_timestamp approach below: it returns NULL for invalid input instead of
# raising an error.

# from pyspark.sql.functions import to_date, expr

# df2 = df.withColumn(
#     "work_date",
#     to_date("work_date", "yyyy-MM-dd")
# ).withColumn(
#     "check_in",
#     expr("try_to_timestamp(check_in, 'yyyy-MM-dd HH:mm:ss')")
# ).withColumn(
#     "check_out",
#     expr("try_to_timestamp(check_out, 'yyyy-MM-dd HH:mm:ss')")
# )

# display(df2)

##### Adding columns for today's date and the current timestamp

In [0]:
# Extend df2 with two generated columns:
#   todays_date  - the current date
#   Current_time - the current timestamp
df3 = (
    df2
    .withColumn("todays_date", current_date())
    .withColumn("Current_time", current_timestamp())
)

df3.show()
df3.printSchema()

- Add days to and subtract days from today's date
- Difference in days between two dates (todays_date - work_date)
- Extract the year, month, and day of month from today's date

In [0]:
# Date arithmetic and field extraction, all based on todays_date:
#   5_day_after / 5_days_before - today shifted forward / backward by 5 days
#   date_difference             - days from work_date to todays_date
#                                 (todays_date minus work_date)
#   current_month / current_year / current_day - parts of today's date
df4 = (
    df3
    .withColumn("5_day_after", date_add("todays_date", 5))
    .withColumn("5_days_before", date_sub("todays_date", 5))
    .withColumn("date_difference", date_diff("todays_date", "work_date"))
    .withColumn("current_month", month("todays_date"))
    .withColumn("current_year", year("todays_date"))
    .withColumn("current_day", dayofmonth("todays_date"))
)

df4.show()


- Extract the month, day, year, hour, minute, and second from Current_time (a timestamp column)

In [0]:
# Break the Current_time timestamp into its calendar and clock components.
# The source column is referenced with the exact case it was created with
# ("Current_time"), so the lookup does not depend on Spark's case-insensitive
# column resolution (spark.sql.caseSensitive=false, the default).
df5 = (
    df3
    .withColumn("month", month("Current_time"))
    .withColumn("year", year("Current_time"))
    .withColumn("Day", dayofmonth("Current_time"))
    .withColumn("hour", hour("Current_time"))
    .withColumn("minutes", minute("Current_time"))
    .withColumn("second", second("Current_time"))
)

# truncate=False keeps the full timestamp value visible in the output.
df5.show(truncate=False)

- Find the quarter, day of week, week of year, week of month, and day of year for today's date
- Find the number of whole months between work_date and today

In [0]:
# Calendar position of today's date, plus whole months elapsed since work_date.
# Every derived column reads from the same "todays_date" column so the cell is
# internally consistent (week_of_year previously called current_date() directly).
df6 = (
    df3
    .withColumn("Quarter", quarter("todays_date"))
    # dayofweek numbers days 1 (Sunday) through 7 (Saturday).
    .withColumn("day_of_week", dayofweek("todays_date"))
    .withColumn("week_of_year", weekofyear("todays_date"))
    # Week of month by simple 7-day buckets: days 1-7 -> 1, 8-14 -> 2, ...
    .withColumn("week_of_month", ceil(dayofmonth("todays_date") / 7))
    .withColumn("day_of_year", dayofyear("todays_date"))
    # months_between returns a fractional value; floor keeps the whole months.
    .withColumn("months_between", floor(months_between("todays_date", "work_date")))
)

# check_in / check_out are dropped only to keep the displayed table narrow.
df6.drop("check_in", "check_out").show()

In [0]:
# Show the original DataFrame again; its check_in / check_out columns are
# still strings (they were only converted in df2).
df.show()

- To find a difference in hours, convert both values to seconds with `unix_timestamp()`, subtract them, and divide by 3600.
- `unix_timestamp()` converts a time value to seconds (Unix epoch seconds).

##### Q> The difference between check-in and check-out time in hours

In [0]:
# Hours between check-in and check-out, rounded to 2 decimal places.
# unix_timestamp() turns each value into epoch seconds, so the subtraction
# gives elapsed seconds and dividing by 3600 converts that to hours.
# `round` here is the Spark column function brought in by the star import,
# not Python's builtin round.
df7 = df.withColumn(
    "time_diff",
    round(
        (unix_timestamp("check_out") - unix_timestamp("check_in")) / 3600,
        2,
    ),
)
df7.show()