In [0]:
# Base location of the sample CSV files, exposed as a notebook widget so the
# path can be overridden per run (widget UI or job parameter) without editing
# code. The default keeps the original demo location.
dbutils.widgets.text(
    "base_file_path",
    "file:/Workspace/Users/yasodhashree91@gmail.com/oms-databricks/00_demo/sample_data/",
)
# Normalise to exactly one trailing slash so file names can be appended safely
# even when the supplied value omits it.
base_file_path = dbutils.widgets.get("base_file_path").rstrip("/") + "/"

# Read each branch file into its own DataFrame, treating the first CSV line as
# the header. No schema is supplied here; the typecasting cell further down
# casts the numeric and date columns explicitly.
employees_branch_a = spark.read.csv(base_file_path + "employees_branch_a.csv", header=True)
employees_branch_b = spark.read.csv(base_file_path + "employees_branch_b.csv", header=True)
job_history_branch_a = spark.read.csv(base_file_path + "job_history_branch_a.csv", header=True)
job_history_branch_b = spark.read.csv(base_file_path + "job_history_branch_b.csv", header=True)

In [0]:
# Preview the first five rows of every raw DataFrame, each under an HTML
# heading that carries the DataFrame's name.
for preview_name, preview_df in (
    ("employees_branch_a", employees_branch_a),
    ("employees_branch_b", employees_branch_b),
    ("job_history_branch_a", job_history_branch_a),
    ("job_history_branch_b", job_history_branch_b),
):
    displayHTML(f"<h2><b>{preview_name}:</b></h2>")
    display(preview_df.limit(5))

In [0]:
from pyspark.sql import Window
from pyspark.sql import functions as F

In [0]:
# Employees: cast age to int, salary to double and join_date to date.
# The same three casts are applied to both branches.
employees_branch_a = (
    employees_branch_a
    .withColumn("age", F.col("age").cast("int"))
    .withColumn("salary", F.col("salary").cast("double"))
    .withColumn("join_date", F.col("join_date").cast("date"))
)

employees_branch_b = (
    employees_branch_b
    .withColumn("age", F.col("age").cast("int"))
    .withColumn("salary", F.col("salary").cast("double"))
    .withColumn("join_date", F.col("join_date").cast("date"))
)

# Job history: cast both period boundaries (start_date, end_date) to date.
job_history_branch_a = (
    job_history_branch_a
    .withColumn("start_date", F.col("start_date").cast("date"))
    .withColumn("end_date", F.col("end_date").cast("date"))
)

job_history_branch_b = (
    job_history_branch_b
    .withColumn("start_date", F.col("start_date").cast("date"))
    .withColumn("end_date", F.col("end_date").cast("date"))
)


In [0]:
# Stack the two branches of each dataset into a single DataFrame.
# unionByName matches columns by NAME; plain union() matches by POSITION and
# would silently misalign values if the branch files list their columns in a
# different order. A genuine schema mismatch now fails loudly instead.
employees_union = employees_branch_a.unionByName(employees_branch_b)
job_history_union = job_history_branch_a.unionByName(job_history_branch_b)

In [0]:
# Attach job-history rows to their employee via an inner join on employee_id;
# employees without history (and history without an employee) are dropped.
joined_data = employees_union.join(job_history_union, ["employee_id"], "inner")

In [0]:
# Show ten rows of the joined result as a quick sanity check.
display(joined_data.limit(num=10))

In [0]:
# Keep only rows whose salary is strictly greater than 50000.
filtered_data = joined_data.where(F.col("salary") > 50000)

In [0]:
# Project the result down to the columns used by the rest of the notebook.
selected_data = filtered_data.select(
    [
        "employee_id",
        "name",
        "age",
        "salary",
        "department",
        "join_date",
        "gender",
        "role",
        "start_date",
        "end_date",
    ]
)

# Alternative: exclude the unwanted column with 'drop' instead of listing the
# ones to keep (may be more readable in this case).
# selected_data = filtered_data.drop("branch")

In [0]:
# Order rows by employee_id ascending, then start_date descending, so each
# employee's most recently started role is listed first.
sorted_data = selected_data.sort(
    F.col("employee_id").asc(),
    F.col("start_date").desc(),
)

In [0]:
# Keep one row per employee: the role with the most recent start_date.
# dropDuplicates() does NOT guarantee that the first row of a previously
# sorted DataFrame is the one kept, so the "latest" row is picked explicitly
# with a window ranked by start_date (descending) within each employee.
# NOTE(review): rows that tie on start_date are still chosen arbitrarily;
# add a tie-breaker column to the window ordering if that matters.
latest_role_window = Window.partitionBy("employee_id").orderBy(F.col("start_date").desc())
deduped_data = (
    sorted_data
    .withColumn("_role_rank", F.row_number().over(latest_role_window))
    .filter(F.col("_role_rank") == 1)
    .drop("_role_rank")  # helper column is not part of the output schema
)

In [0]:
# Whole years elapsed between join_date and today.
# months_between / 12 follows the calendar, whereas datediff / 365 ignores
# leap days and can credit a full year shortly before the actual anniversary
# (e.g. 1460 days across a leap year already yields 4).
data_with_years = deduped_data.withColumn(
    "years_in_service",
    F.floor(
        F.months_between(F.current_date(), F.to_date(F.col("join_date"))) / 12
    ),
)

In [0]:
# Null handling in one pass:
#   1. discard rows whose role is Null
#   2. substitute 0 for Null ages
cleaned_data = (
    data_with_years
    .na.drop(subset=["role"])
    .na.fill({"age": 0})
)

In [0]:
# Normalise the 'gender' column: 'M' -> 'Male', 'F' -> 'Female', anything else
# (including Null) -> 'Other'.
# This cell overwrites cleaned_data in place, so the already-normalised values
# 'Male'/'Female' are accepted too. Without that, re-running the cell would
# turn every previously mapped value into 'Other'.
cleaned_data = cleaned_data.withColumn(
    "gender",
    F.when(F.col("gender").isin("M", "Male"), "Male")
     .when(F.col("gender").isin("F", "Female"), "Female")
     .otherwise("Other"),
)

In [0]:
# Per-role salary summary: average, minimum and maximum salary.
aggregated_data = (
    cleaned_data
    .groupBy("role")
    .agg(
        F.avg(F.col("salary")).alias("avg_salary"),
        F.min(F.col("salary")).alias("min_salary"),
        F.max(F.col("salary")).alias("max_salary"),
    )
)

In [0]:
# The results are only displayed here; the final cell of the notebook shows
# how these DataFrames could be written to tables instead.
for result_name, result_df in (
    ("cleaned_data", cleaned_data),
    ("aggregated_data", aggregated_data),
):
    displayHTML(f"<h2><b>{result_name}:</b></h2>")
    display(result_df.limit(5))

Databricks visualization. Run in Databricks to view.

In [0]:
import matplotlib.pyplot as plt

# Bring at most five roles to the driver for plotting.
# limit() on an unordered DataFrame returns an arbitrary subset that can differ
# between runs, so the rows are ordered first: highest average salary first,
# with role name as a tie-breaker. The chart is then reproducible.
pdf = (
    aggregated_data
    .orderBy(F.col("avg_salary").desc(), F.col("role").asc())
    .limit(5)
    .toPandas()
)

if pdf.empty:
    # pandas raises when asked to plot a frame without rows; report instead.
    print("No aggregated salary data available to plot.")
else:
    # Explicit figure/axes instead of the implicit pyplot state machine.
    fig, ax = plt.subplots(figsize=(10, 5))
    pdf.plot(kind="bar", x="role", y="avg_salary", color="skyblue", rot=45, ax=ax)

    ax.set_xlabel("Role")
    ax.set_ylabel("Average Salary")
    ax.set_title("Average Salary by Role")
    fig.tight_layout()  # keep the rotated tick labels inside the figure
    plt.show()


In [0]:
# Save the cleaned employee data as a table in the oms_analytics.default schema
# cleaned_data \
#     .write \
#     .mode("overwrite") \
#     .saveAsTable("oms_analytics.default.employee_details")

# Save the aggregated employee role summary data as a table in the oms_analytics.default schema
# aggregated_data \
#     .write \
#     .mode("overwrite") \
#     .saveAsTable("oms_analytics.default.employee_role_summary")
