In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, TimestampType, IntegerType
from datetime import datetime
from pyspark.sql.functions import col, lag, lead, date_format, coalesce, expr
from pyspark.sql.functions import row_number, date_format, unix_timestamp
from pyspark.sql.window import Window
# Initialize Spark session
spark = SparkSession.builder \
    .appName("Insert into emp_tbl") \
    .getOrCreate()

# Define the schema for the emp_tbl table
schema = StructType([
    StructField("id", TimestampType(), True),
    StructField("empid", IntegerType(), True)
])

# Sample data for the emp_tbl table with datetime conversion
data = [
    (datetime.strptime('2024-01-13 09:25:00', '%Y-%m-%d %H:%M:%S'), 10),
    (datetime.strptime('2024-01-13 19:35:00', '%Y-%m-%d %H:%M:%S'), 10),
    (datetime.strptime('2024-01-16 09:10:00', '%Y-%m-%d %H:%M:%S'), 10),
    (datetime.strptime('2024-01-16 18:10:00', '%Y-%m-%d %H:%M:%S'), 10),
    (datetime.strptime('2024-02-11 09:07:00', '%Y-%m-%d %H:%M:%S'), 10),
    (datetime.strptime('2024-02-11 19:20:00', '%Y-%m-%d %H:%M:%S'), 10),
    (datetime.strptime('2024-02-17 08:40:00', '%Y-%m-%d %H:%M:%S'), 17),
    (datetime.strptime('2024-02-17 18:04:00', '%Y-%m-%d %H:%M:%S'), 17),
    (datetime.strptime('2024-03-23 09:20:00', '%Y-%m-%d %H:%M:%S'), 10),
    (datetime.strptime('2024-03-23 18:30:00', '%Y-%m-%d %H:%M:%S'), 10)
]

# Create a DataFrame from the data and schema
df = spark.createDataFrame(data, schema)

# Create the emp_tbl in Spark SQL
df.createOrReplaceTempView("emp_tbl")

df.display()


id,empid
2024-01-13T09:25:00.000+0000,10
2024-01-13T19:35:00.000+0000,10
2024-01-16T09:10:00.000+0000,10
2024-01-16T18:10:00.000+0000,10
2024-02-11T09:07:00.000+0000,10
2024-02-11T19:20:00.000+0000,10
2024-02-17T08:40:00.000+0000,17
2024-02-17T18:04:00.000+0000,17
2024-03-23T09:20:00.000+0000,10
2024-03-23T18:30:00.000+0000,10


In [0]:
# Spark SQL Query to calculate total weekend hours
spark.sql("""
with st1 AS
(
	select *, LAG(id,1) OVER(partition by empid order by id) AS login_time
	FROM
	(select *
	,ROW_NUMBER() OVER(PARTITION BY empid order by id) as rn
	from emp_tbl) as tbl
)

SELECT empid , sum(total_hrs) AS Weekend_working_hours
FROM
(
	select *,
    DATEDIFF(MINUTE,login_time, id)/60.0 as total_hrs
    FROM st1
    where rn%2=0
) as tbl 
GROUP BY empid;
""").show()



+-----+---------------------+
|empid|Weekend_working_hours|
+-----+---------------------+
|   10|            38.550001|
|   17|             9.400000|
+-----+---------------------+



In [0]:
# Define window specification for lag and row_number
window_spec = Window.partitionBy("empid").orderBy("id")

# Create st1 DataFrame with login_time and row_number
st1 = df.withColumn("login_time", lag("id", 1).over(window_spec)) \
        .withColumn("rn", row_number().over(window_spec))

# Filter out rows with even row number and calculate total hours
result = st1.filter(col("rn") % 2 == 0) \
            .withColumn("total_hrs", (unix_timestamp("id") - unix_timestamp("login_time")) / 3600.0) \
            .groupBy("empid") \
            .agg({"total_hrs": "sum"}) \
            .withColumnRenamed("sum(total_hrs)", "Weekend_working_hours")

# Show the results
result.show()

+-----+---------------------+
|empid|Weekend_working_hours|
+-----+---------------------+
|   10|                38.55|
|   17|                  9.4|
+-----+---------------------+

