In [0]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# Copy the workspace CSV into DBFS; the read below uses the /FileStore path.
dbutils.fs.cp(
    "file:/Workspace/Shared/employee_data.csv",
    "dbfs:/FileStore/employee_data.csv",
)

# Read the staged file, treating its first line as the column header.
# No schema is supplied or inferred, so Spark loads every column as a string.
csv_reader = spark.read.format("csv").option("header", "true")
employee_df = csv_reader.load("/FileStore/employee_data.csv")

# Task
# Preview the first 10 rows of the loaded data.
employee_df.show(10)

# Data Cleaning
# Keep only employees earning at least 55,000 (drops salaries below 55,000).
# The column is referenced as "Salary", matching the CSV header, so the
# lookup does not depend on Spark's case-insensitive column resolution.
df_cleaned = employee_df.filter(col("Salary") >= 55000)

# Parse JoiningDate (yyyy-MM-dd text) into a date so its year can be extracted.
df_cleaned = df_cleaned.withColumn(
    "JoiningDate", F.to_date(col("JoiningDate"), "yyyy-MM-dd")
)

# Keep the employees who joined after the year 2020, i.e. from 2021 onwards.
df_filtered = df_cleaned.filter(F.year(col("JoiningDate")) > 2020)
df_filtered.show()

# Data aggregation
# Both summaries are computed over employee_df (all loaded rows), not over
# the salary/date-filtered subset built above.
by_department = employee_df.groupBy("Department")

# Mean salary per department.
avg_salary_expr = F.avg("Salary").alias("AverageSalary")
df_avg_salary = by_department.agg(avg_salary_expr)
df_avg_salary.show()

# Number of employees per department (counts non-null EmployeeID values).
employee_count_expr = F.count("EmployeeID").alias("EmployeeCount")
df_employee_count = by_department.agg(employee_count_expr)
df_employee_count.show()

# Write data to csv
# Spark writes a directory of part files at this path, not a single CSV file.
# mode("overwrite") replaces any previous output so the cell can be re-run;
# the default save mode raises an error when the path already exists.
(
    df_filtered.write.format("csv")
    .option("header", "true")
    .mode("overwrite")
    .save("/FileStore/employee_filtered.csv")
)
 

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 65000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
+----------+-------------+----------+-----------+------+

+----------+------------+----------+-----------+------+
|EmployeeID|        Name|Department|JoiningDate|Salary|
+----------+------------+----------+-----------+------+
|      1001|    John Doe|        HR| 2021-01-15| 55000|
|      1005|David Wilson|        IT| 2021-06-25| 58000|
+----------+------------+----------