In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("EmployeeTimesheet").getOrCreate()

In [0]:
# Data Ingestion & Schema Handling
# 1. Load the CSV using inferred schema.
df_inferred = spark.read.option("header", True).csv("/Volumes/workspace/ecommerce/csv_data/employee_timesheet.csv", inferSchema=True)
df_inferred.show()

# 2. Load the same file with schema explicitly defined.
schema = StructType([
    StructField("EmployeeID", StringType(), True),
    StructField("Name", StringType(), True),
    StructField("Department", StringType(), True),
    StructField("Project", StringType(), True),
    StructField("WorkHours", IntegerType(), True),
    StructField("WorkDate", DateType(), True),
    StructField("Location", StringType(), True),
    StructField("Mode", StringType(), True)
])

df = spark.read.option("header", True).schema(schema).csv("/Volumes/workspace/ecommerce/csv_data/employee_timesheet.csv")
df.show()

# 3. Add a new column Weekday extracted from WorkDate .
df = df.withColumn("Weekday", date_format("WorkDate", "EEEE"))
df.show()

+----------+-----+----------+-------+---------+----------+---------+------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate|Location |  Mode|
+----------+-----+----------+-------+---------+----------+---------+------+
|      E101|Anita|        IT|  Alpha|        8|01-05-2024|Bangalore|Remote|
|      E102|  Raj|        HR|   Beta|        7|01-05-2024|   Mumbai|Onsite|
|      E103| John|   Finance|  Alpha|        5|02-05-2024|    Delhi|Remote|
|      E101|Anita|        IT|  Alpha|        9|03-05-2024|Bangalore|Remote|
|      E104|Meena|        IT|  Gamma|        6|03-05-2024|Hyderabad|Onsite|
|      E102|  Raj|        HR|   Beta|        8|04-05-2024|   Mumbai|Remote|
+----------+-----+----------+-------+---------+----------+---------+------+

+----------+-----+----------+-------+---------+--------+---------+------+
|EmployeeID| Name|Department|Project|WorkHours|WorkDate| Location|  Mode|
+----------+-----+----------+-------+---------+--------+---------+------+
|      E101|Anita

In [0]:
# Aggregations & Grouping
# 4. Calculate total work hours by employee.
df.groupBy("EmployeeID", "Name").agg(sum("WorkHours").alias("TotalHours")).show()

# 5. Calculate average work hours per department.
df.groupBy("Department").agg(avg("WorkHours").alias("AvgHours")).show()

# 6. Get top 2 employees by total hours using window function.
window_spec = Window.orderBy(desc("TotalHours"))
df_total = df.groupBy("EmployeeID", "Name").agg(sum("WorkHours").alias("TotalHours"))
df_total.withColumn("Rank", rank().over(window_spec)).filter(col("Rank") <= 2).show()

+----------+-----+----------+
|EmployeeID| Name|TotalHours|
+----------+-----+----------+
|      E103| John|         5|
|      E104|Meena|         6|
|      E101|Anita|        17|
|      E102|  Raj|        15|
+----------+-----+----------+

+----------+-----------------+
|Department|         AvgHours|
+----------+-----------------+
|        HR|              7.5|
|        IT|7.666666666666667|
|   Finance|              5.0|
+----------+-----------------+





+----------+-----+----------+----+
|EmployeeID| Name|TotalHours|Rank|
+----------+-----+----------+----+
|      E101|Anita|        17|   1|
|      E102|  Raj|        15|   2|
+----------+-----+----------+----+



In [0]:
df.printSchema()

root
 |-- EmployeeID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Project: string (nullable = true)
 |-- WorkHours: integer (nullable = true)
 |-- WorkDate: date (nullable = true)
 |-- Location: string (nullable = true)
 |-- Mode: string (nullable = true)
 |-- Weekday: string (nullable = true)



In [0]:
# Date Operations
# 7. Filter entries where WorkDate falls on a weekend.
# 1 = Sunday, 7 = Saturday
from pyspark.sql.functions import dayofweek

df.filter(dayofweek("WorkDate").isin([1, 7])).show()

# 8. Calculate running total of hours per employee using window function.
from pyspark.sql.window import Window
from pyspark.sql.functions import sum

window_emp = Window.partitionBy("EmployeeID").orderBy("WorkDate").rowsBetween(Window.unboundedPreceding, Window.currentRow)
df = df.withColumn("RunningTotalHours", sum("WorkHours").over(window_emp))
df.show()


+----------+----+----------+-------+---------+--------+--------+----+-------+
|EmployeeID|Name|Department|Project|WorkHours|WorkDate|Location|Mode|Weekday|
+----------+----+----------+-------+---------+--------+--------+----+-------+
+----------+----+----------+-------+---------+--------+--------+----+-------+

+----------+-----+----------+-------+---------+--------+---------+------+-------+-----------------+
|EmployeeID| Name|Department|Project|WorkHours|WorkDate| Location|  Mode|Weekday|RunningTotalHours|
+----------+-----+----------+-------+---------+--------+---------+------+-------+-----------------+
|      E101|Anita|        IT|  Alpha|        8|    NULL|Bangalore|Remote|   NULL|                8|
|      E101|Anita|        IT|  Alpha|        9|    NULL|Bangalore|Remote|   NULL|               17|
|      E102|  Raj|        HR|   Beta|        7|    NULL|   Mumbai|Onsite|   NULL|                7|
|      E102|  Raj|        HR|   Beta|        8|    NULL|   Mumbai|Remote|   NULL|      

In [0]:
# Joining DataFrames
# 9. Create department_location.csv :
# Department,DeptHead
# IT,Anand
# HR,Shruti
# Finance,Kamal
df_dept = spark.read.option("header", True).csv("/Volumes/workspace/ecommerce/csv_data/department_location.csv", inferSchema=True)
df_dept.show()

# 10. Join with timesheet data and list all employees with their DeptHead.
df_joined = df.join(df_dept, on="Department", how="left")
df_joined.select("EmployeeID", "Name", "Department", "DeptHead").show()

+----------+--------+
|Department|DeptHead|
+----------+--------+
|        IT|   Anand|
|        HR|  Shruti|
|   Finance|   Kamal|
+----------+--------+

+----------+-----+----------+--------+
|EmployeeID| Name|Department|DeptHead|
+----------+-----+----------+--------+
|      E101|Anita|        IT|   Anand|
|      E102|  Raj|        HR|  Shruti|
|      E103| John|   Finance|   Kamal|
|      E101|Anita|        IT|   Anand|
|      E104|Meena|        IT|   Anand|
|      E102|  Raj|        HR|  Shruti|
+----------+-----+----------+--------+



In [0]:
# Pivot & Unpivot
# 11. Pivot table: total hours per employee per project.
df.groupBy("EmployeeID").pivot("Project").agg(sum("WorkHours")).show()

# 12. Unpivot example: Convert mode-specific hours into rows.
pivot_df = df.groupBy("EmployeeID", "Mode").agg(sum("WorkHours").alias("ModeHours"))
pivot_df.show()

+----------+-----+----+-----+
|EmployeeID|Alpha|Beta|Gamma|
+----------+-----+----+-----+
|      E104| NULL|NULL|    6|
|      E101|   17|NULL| NULL|
|      E102| NULL|  15| NULL|
|      E103|    5|NULL| NULL|
+----------+-----+----+-----+

+----------+------+---------+
|EmployeeID|  Mode|ModeHours|
+----------+------+---------+
|      E103|Remote|        5|
|      E101|Remote|       17|
|      E102|Onsite|        7|
|      E102|Remote|        8|
|      E104|Onsite|        6|
+----------+------+---------+



In [0]:
# UDF & Conditional Logic
# 13. Create a UDF to classify work hours:

# def workload_tag(hours):
# if hours >= 8: return "Full"
# elif hours >= 4: return "Partial"
# else: return "Light"
from pyspark.sql.functions import udf

def workload_tag(hours):
    if hours >= 8:
        return "Full"
    elif hours >= 4:
        return "Partial"
    else:
        return "Light"

workload_udf = udf(workload_tag, StringType())

# 14. Add a column WorkloadCategory using this UDF.
df = df.withColumn("WorkloadCategory", workload_udf("WorkHours"))
df.show()

+----------+-----+----------+-------+---------+--------+---------+------+-------+-----------------+----------------+
|EmployeeID| Name|Department|Project|WorkHours|WorkDate| Location|  Mode|Weekday|RunningTotalHours|WorkloadCategory|
+----------+-----+----------+-------+---------+--------+---------+------+-------+-----------------+----------------+
|      E101|Anita|        IT|  Alpha|        8|    NULL|Bangalore|Remote|   NULL|                8|            Full|
|      E101|Anita|        IT|  Alpha|        9|    NULL|Bangalore|Remote|   NULL|               17|            Full|
|      E102|  Raj|        HR|   Beta|        7|    NULL|   Mumbai|Onsite|   NULL|                7|         Partial|
|      E102|  Raj|        HR|   Beta|        8|    NULL|   Mumbai|Remote|   NULL|               15|            Full|
|      E103| John|   Finance|  Alpha|        5|    NULL|    Delhi|Remote|   NULL|                5|         Partial|
|      E104|Meena|        IT|  Gamma|        6|    NULL|Hyderaba

In [0]:
# Nulls and Cleanup
# 15. Introduce some nulls in Mode column.
from pyspark.sql.functions import when
df = df.withColumn("Mode", when(col("EmployeeID") == "E103", None).otherwise(col("Mode")))
df.show()

# 16. Fill nulls with "Not Provided".
df = df.fillna({"Mode": "Not Provided"})
df.show()

# 17. Drop rows where WorkHours < 4.
df = df.filter(col("WorkHours") >= 4)
df.show()

+----------+-----+----------+-------+---------+--------+---------+------+-------+-----------------+----------------+
|EmployeeID| Name|Department|Project|WorkHours|WorkDate| Location|  Mode|Weekday|RunningTotalHours|WorkloadCategory|
+----------+-----+----------+-------+---------+--------+---------+------+-------+-----------------+----------------+
|      E101|Anita|        IT|  Alpha|        8|    NULL|Bangalore|Remote|   NULL|                8|            Full|
|      E101|Anita|        IT|  Alpha|        9|    NULL|Bangalore|Remote|   NULL|               17|            Full|
|      E102|  Raj|        HR|   Beta|        7|    NULL|   Mumbai|Onsite|   NULL|                7|         Partial|
|      E102|  Raj|        HR|   Beta|        8|    NULL|   Mumbai|Remote|   NULL|               15|            Full|
|      E103| John|   Finance|  Alpha|        5|    NULL|    Delhi|  NULL|   NULL|                5|         Partial|
|      E104|Meena|        IT|  Gamma|        6|    NULL|Hyderaba

In [0]:
# Advanced Conditions
# 18. Use when-otherwise to mark employees as "Remote Worker" if >80% entries are
# Remote.
remote_counts = df.groupBy("EmployeeID").agg(
    (sum(when(col("Mode") == "Remote", 1).otherwise(0)) / count("*")).alias("RemoteRatio")
)
df = df.join(remote_counts, on="EmployeeID", how="left")
df = df.withColumn("WorkerType", when(col("RemoteRatio") > 0.8, "Remote Worker").otherwise("Hybrid/Onsite"))
df.show()

# 19. Add a new column ExtraHours where hours > 8.
df = df.withColumn("ExtraHours", when(col("WorkHours") > 8, col("WorkHours") - 8).otherwise(0))
df.show()

+----------+-----+----------+-------+---------+--------+---------+------------+-------+-----------------+----------------+-----------+-------------+
|EmployeeID| Name|Department|Project|WorkHours|WorkDate| Location|        Mode|Weekday|RunningTotalHours|WorkloadCategory|RemoteRatio|   WorkerType|
+----------+-----+----------+-------+---------+--------+---------+------------+-------+-----------------+----------------+-----------+-------------+
|      E101|Anita|        IT|  Alpha|        8|    NULL|Bangalore|      Remote|   NULL|                8|            Full|        1.0|Remote Worker|
|      E101|Anita|        IT|  Alpha|        9|    NULL|Bangalore|      Remote|   NULL|               17|            Full|        1.0|Remote Worker|
|      E102|  Raj|        HR|   Beta|        7|    NULL|   Mumbai|      Onsite|   NULL|                7|         Partial|        0.5|Hybrid/Onsite|
|      E102|  Raj|        HR|   Beta|        8|    NULL|   Mumbai|      Remote|   NULL|               15| 

In [0]:
print(df.columns)
print(len(df.columns))  # Should print 14


['EmployeeID', 'Name', 'Department', 'Project', 'WorkHours', 'WorkDate', 'Location', 'Mode', 'Weekday', 'RunningTotalHours', 'WorkloadCategory', 'RemoteRatio', 'WorkerType', 'ExtraHours']
14


In [0]:
# Union + Duplicate Handling
# 20. Append dummy intern record using unionByName()
intern_data = [
    ("E200", "Sara", "IT", "Gamma", 5, "2024-05-05", "Chennai", "Remote",
     "Sunday", 5, "Partial", 1.0, "Remote Worker", 0)
]

columns = df.columns  # Ensure column count and order matches
intern_df = spark.createDataFrame(intern_data, schema=columns)
df = df.unionByName(intern_df)
df.show()

# 21. Remove duplicate rows based on all columns.
df = df.dropDuplicates()
df.show()

+----------+-----+----------+-------+---------+----------+---------+------------+-------+-----------------+----------------+-----------+-------------+----------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|        Mode|Weekday|RunningTotalHours|WorkloadCategory|RemoteRatio|   WorkerType|ExtraHours|
+----------+-----+----------+-------+---------+----------+---------+------------+-------+-----------------+----------------+-----------+-------------+----------+
|      E101|Anita|        IT|  Alpha|        8|      NULL|Bangalore|      Remote|   NULL|                8|            Full|        1.0|Remote Worker|         0|
|      E101|Anita|        IT|  Alpha|        9|      NULL|Bangalore|      Remote|   NULL|               17|            Full|        1.0|Remote Worker|         1|
|      E102|  Raj|        HR|   Beta|        7|      NULL|   Mumbai|      Onsite|   NULL|                7|         Partial|        0.5|Hybrid/Onsite|         0|
|      E102|  Raj|        HR