### 1. Scenario: Calculate the total salary per department and sort them by total salary in descending order.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum 

# Sample data
data = [
    ("Alice", "HR", 50000),
    ("Bob", "IT", 70000),
    ("Carol", "IT", 80000),
    ("Dave", "HR", 55000),
    ("Eve", "Finance", 60000),
    ("Frank", "Finance", 65000),
    ("Grace", "IT", 75000)
]

columns = ["name", "department", "salary"]

# Create DataFrame
emp = spark.createDataFrame(data, columns)

# Show the original DataFrame
emp.show()


+-----+----------+------+
| name|department|salary|
+-----+----------+------+
|Alice|        HR| 50000|
|  Bob|        IT| 70000|
|Carol|        IT| 80000|
| Dave|        HR| 55000|
|  Eve|   Finance| 60000|
|Frank|   Finance| 65000|
|Grace|        IT| 75000|
+-----+----------+------+



In [0]:
# Calculate total salary per department and sort descending
dept_salary_df = emp.groupBy("department").agg(
    sum("salary").alias("total_salary")
).orderBy("total_salary" , ascending = False)

# Show result
dept_salary_df.show()


+----------+------------+
|department|total_salary|
+----------+------------+
|        IT|      225000|
|   Finance|      125000|
|        HR|      105000|
+----------+------------+



### 2.Scenario: Find the highest and lowest salary per department You are given a dataset with employee details (name, department, salary). How would you find the highest and lowest salary in each department?



In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import max, min

# Initialize SparkSession
spark = SparkSession.builder.appName("SalaryStats").getOrCreate()

# Sample data
data = [
    ("Alice", "HR", 50000),
    ("Bob", "HR", 60000),
    ("Charlie", "IT", 70000),
    ("David", "IT", 90000),
    ("Eve", "Sales", 45000),
    ("Frank", "Sales", 55000)
]

# Create DataFrame
columns = ["name", "department", "salary"]
df = spark.createDataFrame(data, columns)

# Show original data
df.show()

# Find highest and lowest salary per department
result_df = df.groupBy("department").agg(
    max("salary").alias("max_salary"),
    min("salary").alias("min_salary")
)

# Show result
result_df.show()


+-------+----------+------+
|   name|department|salary|
+-------+----------+------+
|  Alice|        HR| 50000|
|    Bob|        HR| 60000|
|Charlie|        IT| 70000|
|  David|        IT| 90000|
|    Eve|     Sales| 45000|
|  Frank|     Sales| 55000|
+-------+----------+------+

+----------+----------+----------+
|department|max_salary|min_salary|
+----------+----------+----------+
|        HR|     60000|     50000|
|        IT|     90000|     70000|
|     Sales|     55000|     45000|
+----------+----------+----------+



### 3. Scenario: Count the number of employees in each department and identify departments with fewer than 5 employees.
### Question: Given a dataset with employee details (name, department, salary), how would you count the number of employees in each department and filter out the departments with fewer than 5 employees?



In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count


# Sample data
data = [
    ("Alice", "HR", 50000),
    ("Bob", "HR", 60000),
    ("Charlie", "IT", 70000),
    ("David", "IT", 90000),
    ("Eve", "Sales", 45000),
    ("Frank", "Sales", 55000),
    ("Grace", "Sales", 47000),
    ("Hank", "Sales", 52000),
    ("Ivy", "Sales", 49000),
    ("Jack", "IT", 75000)
]

# Create DataFrame
columns = ["name", "department", "salary"]
df = spark.createDataFrame(data, columns)

# Count employees per department
dept_counts = df.groupBy("department").agg(count("*").alias("employee_count"))

# Filter departments with fewer than 5 employees
filtered_depts = dept_counts.filter("employee_count < 5")

# Show result
filtered_depts.show()


+----------+--------------+
|department|employee_count|
+----------+--------------+
|        HR|             2|
|        IT|             3|
+----------+--------------+




### 4.Scenario: Find the average salary of employees in each department and compare it to the overall average salary.
### Question: You are given a dataset of employee details. How would you find the average salary in each department and then compare it to the overall average salary?





In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col

# Step 2: Sample employee data
data = [
    ("Alice", "HR", 50000),
    ("Bob", "HR", 60000),
    ("Charlie", "IT", 70000),
    ("David", "IT", 90000),
    ("Eve", "Sales", 45000),
    ("Frank", "Sales", 55000),
    ("Grace", "Sales", 47000),
    ("Hank", "Sales", 52000),
    ("Ivy", "Sales", 49000),
    ("Jack", "IT", 75000)
]

columns = ["name", "department", "salary"]
df = spark.createDataFrame(data, columns)

# Show the original DataFrame
df.show()

# Step 3: Calculate average salary per department
avg_salary_dept = df.groupBy("department").agg(avg("salary").alias("avg_salary_dept"))

# Step 4: Calculate overall average salary
overall_avg_salary_row = df.agg(avg("salary").alias("overall_avg_salary")).collect()[0]
overall_avg_salary = overall_avg_salary_row["overall_avg_salary"]

# Step 5: Add a column to compare with overall average
comparison_df = avg_salary_dept.withColumn("overall_avg_salary", col("avg_salary_dept") - overall_avg_salary)

# Optional: Add a column to indicate whether dept avg is above or below overall
comparison_df = comparison_df.withColumn(
    "above_or_below_avg",
    col("overall_avg_salary") > 0
)

# Show the final result
comparison_df.show()


+-------+----------+------+
|   name|department|salary|
+-------+----------+------+
|  Alice|        HR| 50000|
|    Bob|        HR| 60000|
|Charlie|        IT| 70000|
|  David|        IT| 90000|
|    Eve|     Sales| 45000|
|  Frank|     Sales| 55000|
|  Grace|     Sales| 47000|
|   Hank|     Sales| 52000|
|    Ivy|     Sales| 49000|
|   Jack|        IT| 75000|
+-------+----------+------+

+----------+-----------------+------------------+------------------+
|department|  avg_salary_dept|overall_avg_salary|above_or_below_avg|
+----------+-----------------+------------------+------------------+
|        HR|          55000.0|           -4300.0|             false|
|        IT|78333.33333333333| 19033.33333333333|              true|
|     Sales|          49600.0|           -9700.0|             false|
+----------+-----------------+------------------+------------------+



###  5.Scenario: Find departments with the maximum number of employees.
### Question: You are given a dataset with employee details. How would you find which department has the most employees?



In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count

# Initialize Spark session
spark = SparkSession.builder.appName("MaxEmployeesByDept").getOrCreate()

# Sample data
data = [
    ("Alice", "HR"),
    ("Bob", "IT"),
    ("Carol", "IT"),
    ("Dave", "HR"),
    ("Eve", "Finance"),
    ("Frank", "Finance"),
    ("Grace", "IT"),
    ("Hank", "IT")
]

columns = ["name", "department"]

# Create DataFrame
df = spark.createDataFrame(data, columns)

# Count employees per department and sort by count descending
dept_counts = df.groupBy("department").agg(
    count("name").alias("employee_count")
).orderBy("employee_count", ascending=False)

# Show result
dept_counts.show()


+----------+--------------+
|department|employee_count|
+----------+--------------+
|        IT|             4|
|        HR|             2|
|   Finance|             2|
+----------+--------------+

