Problem Statement
Generate a report showing each employee's consecutive periods of presence and absence, as shown in the expected output.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, col
from pyspark.sql.functions import col, lag, sum, min, max
from pyspark.sql.window import Window

# Step 1: Obtain the Spark session (created on first use, reused afterwards)
spark = SparkSession.builder.appName("EmployeeStatus").getOrCreate()

# Sample attendance log: one (employee, date string, status) tuple per day
data = [
    # employee a1
    ("a1", "2024-01-01", "present"),
    ("a1", "2024-01-02", "present"),
    ("a1", "2024-01-03", "present"),
    ("a1", "2024-01-04", "Absent"),
    ("a1", "2024-01-05", "present"),
    ("a1", "2024-01-06", "present"),
    ("a1", "2024-01-07", "present"),
    ("a1", "2024-01-08", "Absent"),
    ("a1", "2024-01-09", "Absent"),
    ("a1", "2024-01-10", "present"),
    # employee a2
    ("a2", "2024-01-06", "present"),
    ("a2", "2024-01-07", "present"),
    ("a2", "2024-01-08", "Absent"),
    ("a2", "2024-01-09", "present"),
    ("a2", "2024-01-10", "Absent"),
]

# Column names, in the same order as the tuple fields above
schema = ["employee", "dates", "status"]

# Steps 2-3: Create the DataFrame and, in the same chain, parse the "dates"
# strings (format yyyy-MM-dd) into a date column
df = spark.createDataFrame(data, schema).withColumn(
    "dates", to_date(col("dates"), "yyyy-MM-dd")
)

# Render the DataFrame, now carrying the parsed date column
df.display()


employee,dates,status
a1,2024-01-01,present
a1,2024-01-02,present
a1,2024-01-03,present
a1,2024-01-04,Absent
a1,2024-01-05,present
a1,2024-01-06,present
a1,2024-01-07,present
a1,2024-01-08,Absent
a1,2024-01-09,Absent
a1,2024-01-10,present


PySpark

In [0]:
from pyspark.sql.functions import col, to_date, lag, when, sum, min, max
from pyspark.sql.window import Window

# Step 4: Group consecutive records with the same status.
#
# The helper columns are built on separate DataFrames instead of being written
# back onto `df`, so `df` keeps its original three columns for any later cell
# and re-running this cell has no side effects on it.
# Note: `sum`, `min` and `max` below are the pyspark.sql.functions column
# functions imported at the top of this cell, not the Python builtins.
# NOTE(review): runs are detected from status changes only; a gap in an
# employee's dates does not start a new period -- confirm that is intended.
window_spec = Window.partitionBy("employee").orderBy("dates")

# Flag rows whose status differs from the employee's previous row (1) vs. not (0).
# The first row per employee has no previous row, so lag() yields null, the
# comparison evaluates to null, and the row falls through to otherwise(0).
df_flagged = df.withColumn(
    "prev_status", lag("status").over(window_spec)
).withColumn(
    "status_changed",
    when(col("status") != col("prev_status"), 1).otherwise(0),
)

# Running total of the change flags: constant within a run of equal statuses,
# incremented at every status change, so it identifies each run per employee.
df_grouped = df_flagged.withColumn(
    "group", sum("status_changed").over(window_spec)
)

# Find the start and end dates for each group
result_df = df_grouped.groupBy("employee", "group", "status").agg(
    min("dates").alias("FROM_DATE"),
    max("dates").alias("TO_DATE")
).orderBy("employee", "FROM_DATE")

# Drop the internal group id; keep only the report columns
final_df = result_df.select("employee", "FROM_DATE", "TO_DATE", "status")

final_df.display()


employee,FROM_DATE,TO_DATE,status
a1,2024-01-01,2024-01-03,present
a1,2024-01-04,2024-01-04,Absent
a1,2024-01-05,2024-01-07,present
a1,2024-01-08,2024-01-09,Absent
a1,2024-01-10,2024-01-10,present
a2,2024-01-06,2024-01-07,present
a2,2024-01-08,2024-01-08,Absent
a2,2024-01-09,2024-01-09,present
a2,2024-01-10,2024-01-10,Absent


Spark SQL

In [0]:
# Register `df` as the session-scoped temporary view "emp_attendance" (replacing
# any existing view of that name) so the SQL cell below can query it. Spark SQL
# resolves identifiers case-insensitively by default, which is why the query
# can refer to the view as EMP_ATTENDANCE.
df.createOrReplaceTempView("emp_attendance")

In [0]:
%sql
-- status_flags: mark each row whose status differs from the previous row of the
--   same employee (ordered by date). 'OPOUI' is the LAG default used for an
--   employee's first row; it is a sentinel assumed not to match any real
--   status, so the first row is flagged 1.
-- status_runs: a running sum of the flags gives every run of equal statuses
--   its own number (GRP) within an employee.
-- Final query: one row per run with its first and last date.
WITH status_flags AS (
    SELECT
        *,
        CASE
            WHEN STATUS <> LAG(STATUS, 1, 'OPOUI') OVER (PARTITION BY EMPLOYEE ORDER BY DATES) THEN 1
            ELSE 0
        END AS FLAG
    FROM EMP_ATTENDANCE
),
status_runs AS (
    SELECT
        *,
        SUM(FLAG) OVER (PARTITION BY EMPLOYEE ORDER BY DATES) AS GRP
    FROM status_flags
)
SELECT
    EMPLOYEE,
    MIN(DATES) AS FROM_DATE,
    MAX(DATES) AS TO_DATE,
    STATUS
FROM status_runs
GROUP BY EMPLOYEE, STATUS, GRP
ORDER BY EMPLOYEE, FROM_DATE

EMPLOYEE,FROM_DATE,TO_DATE,STATUS
a1,2024-01-01,2024-01-03,present
a1,2024-01-04,2024-01-04,Absent
a1,2024-01-05,2024-01-07,present
a1,2024-01-08,2024-01-09,Absent
a1,2024-01-10,2024-01-10,present
a2,2024-01-06,2024-01-07,present
a2,2024-01-08,2024-01-08,Absent
a2,2024-01-09,2024-01-09,present
a2,2024-01-10,2024-01-10,Absent
