#Problem Statement
We need to obtain the list of departments whose average salary is lower than the overall average salary of the company. Remember: when calculating the company's average salary, you must exclude the salaries of the department you are comparing it with. For instance, when comparing the average salary of the HR department with the company's average, the HR department's salaries shouldn't be taken into consideration when calculating the company average. Likewise, if you want to compare the average salary of the Finance department with the company's average, the company's average salary should not include the salaries of the Finance department, and so on. Essentially, the company's average salary will be dynamic for each department.

In [0]:
# Import the necessary schema types
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Employee table layout: one (column name, Spark type) pair per column.
# Every column is declared non-nullable.
schema = StructType(
    [
        StructField(column_name, column_type(), nullable=False)
        for column_name, column_type in (
            ("emp_id", IntegerType),
            ("emp_name", StringType),
            ("department_id", IntegerType),
            ("salary", IntegerType),
            ("manager_id", IntegerType),
            ("emp_age", IntegerType),
        )
    ]
)

# Sample rows, in schema order:
# (emp_id, emp_name, department_id, salary, manager_id, emp_age)
data = [
    (1, 'Ankit', 100, 10000, 4, 39),
    (2, 'Mohit', 100, 15000, 5, 48),
    (3, 'Vikas', 100, 10000, 4, 37),
    (4, 'Rohit', 100, 5000, 2, 16),
    (5, 'Mudit', 200, 12000, 6, 55),
    (6, 'Agam', 200, 12000, 2, 14),
    (7, 'Sanjay', 200, 9000, 2, 13),
    (8, 'Ashish', 200, 5000, 2, 12),
    (9, 'Mukesh', 300, 6000, 6, 51),
    (10, 'Rakesh', 300, 7000, 6, 50),
]

# Build the DataFrame from the rows and the explicit schema, then render it
# in the notebook.
df = spark.createDataFrame(data, schema)
df.display()

emp_id,emp_name,department_id,salary,manager_id,emp_age
1,Ankit,100,10000,4,39
2,Mohit,100,15000,5,48
3,Vikas,100,10000,4,37
4,Rohit,100,5000,2,16
5,Mudit,200,12000,6,55
6,Agam,200,12000,2,14
7,Sanjay,200,9000,2,13
8,Ashish,200,5000,2,12
9,Mukesh,300,6000,6,51
10,Rakesh,300,7000,6,50


In [0]:
# Expose the DataFrame to SQL under the name "emp".
# 1) Session-scoped temporary view (nothing is written to storage).
df.createOrReplaceTempView("emp")
# 2) Persistent table (requires a metastore / Hive support in Spark).
#    mode("overwrite") keeps this cell re-runnable: with the default mode
#    ("errorifexists") a second run fails because the table already exists.
df.write.mode("overwrite").saveAsTable("emp")
# Now SQL queries can be run against "emp".
# NOTE(review): the temp view and the saved table share the same name; an
# unqualified "emp" is expected to resolve to the temp view first - confirm
# if the persisted table is the one you intend to read.
spark.sql("SELECT * FROM emp").show()

+------+--------+-------------+------+----------+-------+
|emp_id|emp_name|department_id|salary|manager_id|emp_age|
+------+--------+-------------+------+----------+-------+
|     1|   Ankit|          100| 10000|         4|     39|
|     2|   Mohit|          100| 15000|         5|     48|
|     3|   Vikas|          100| 10000|         4|     37|
|     4|   Rohit|          100|  5000|         2|     16|
|     5|   Mudit|          200| 12000|         6|     55|
|     6|    Agam|          200| 12000|         2|     14|
|     7|  Sanjay|          200|  9000|         2|     13|
|     8|  Ashish|          200|  5000|         2|     12|
|     9|  Mukesh|          300|  6000|         6|     51|
|    10|  Rakesh|          300|  7000|         6|     50|
+------+--------+-------------+------+----------+-------+



In [0]:
from pyspark.sql.functions import col, avg, sum as spark_sum, count, when

# Company-wide salary total and head count, computed in a single aggregation
# so the data is scanned once instead of once per metric.
company_totals = df.agg(
    spark_sum(col("salary")).alias("total_salary"),
    count("*").alias("total_count"),
).first()
total_salary = company_totals["total_salary"]
total_count = company_totals["total_count"]

# Per-department average salary, salary total and head count
dept_avg_salary_df = df.groupBy("department_id").agg(
    avg("salary").alias("dept_avg_salary"),
    spark_sum("salary").alias("dept_total_salary"),
    count("emp_id").alias("dept_employee_count")
)

# Number of employees outside each department (the "rest of the company")
other_employee_count = total_count - col("dept_employee_count")

# Dynamic company average = average salary of everyone NOT in the department.
# when(...) without otherwise() yields NULL if no other employees exist (e.g.
# a single-department company), which avoids a divide-by-zero error under
# ANSI mode; the NULL row is then dropped by the filter below.
result_df = dept_avg_salary_df.withColumn(
    "dynamic_company_avg_salary",
    when(
        other_employee_count > 0,
        (total_salary - col("dept_total_salary")) / other_employee_count,
    ),
).filter(col("dept_avg_salary") < col("dynamic_company_avg_salary"))

# Keep only the columns of interest and show the result
result = result_df.select("department_id", "dept_avg_salary", "dynamic_company_avg_salary")
result.display()


department_id,dept_avg_salary,dynamic_company_avg_salary
300,6500.0,9750.0


In [0]:
%sql
-- Departments whose average salary is lower than the average salary of the
-- rest of the company (i.e. the company average with that department excluded).

-- Per-department average salary, salary total and head count
WITH dept_avg_salary AS (
    SELECT
        department_id,
        AVG(salary) AS dept_avg_salary,
        SUM(salary) AS dept_total_salary,
        COUNT(emp_id) AS dept_employee_count
    FROM emp
    GROUP BY department_id
),
-- Company-wide salary total and head count (always exactly one row)
company_totals AS (
    SELECT
        SUM(salary) AS total_salary,
        COUNT(emp_id) AS total_count
    FROM emp
),
-- Dynamic company average: subtract the department's own totals from the
-- company totals before dividing.
dynamic_company_avg_salary AS (
    SELECT
        d.department_id,
        d.dept_avg_salary,
        -- NULLIF turns a zero denominator (a department that contains every
        -- employee) into NULL, so the division yields NULL rather than a
        -- divide-by-zero error under ANSI mode.
        (c.total_salary - d.dept_total_salary)
            / NULLIF(c.total_count - d.dept_employee_count, 0) AS dynamic_company_avg_salary
    FROM
        dept_avg_salary d
        -- company_totals has a single row, so this attaches it to every department
        CROSS JOIN company_totals c
)
-- Keep departments whose average is below the dynamic company average
-- (rows with a NULL dynamic average are filtered out by the comparison)
SELECT
    department_id,
    dept_avg_salary,
    dynamic_company_avg_salary
FROM
    dynamic_company_avg_salary
WHERE
    dept_avg_salary < dynamic_company_avg_salary;

department_id,dept_avg_salary,dynamic_company_avg_salary
300,6500.0,9750.0
