In [32]:
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, coalesce, col, lit

# Point both the PySpark workers and the driver at the interpreter that is
# running this code, so they do not pick up a different Python from PATH.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

# Module-level session shared by everything below; getOrCreate() reuses an
# already-active session instead of starting a second one.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

class Emp:
    """Sample employee data plus two ways of filling in missing salaries.

    Both fill methods replace a null ``salary`` with the average salary taken
    over *every* row, where a null salary counts as 0 in that average.  For the
    sample data this gives (0 + 200 + 300 + 0 + 50) / 5 = 110.0.

    All methods use the module-level ``spark`` session.
    """

    def createData(self):
        """Return the five-row sample employee DataFrame.

        Columns are ``emp_id``, ``emp_name``, ``salary`` and ``department``;
        two of the rows have a null salary.
        """
        data = [
            ('1', 'A', None, 'X'),
            ('2', 'B', 200, 'Y'),
            ('3', 'C', 300, 'Z'),
            ('4', 'D', None, 'Y'),
            ('5', 'E', 50, 'Z'),
        ]
        columns = ['emp_id', 'emp_name', 'salary', 'department']
        return spark.createDataFrame(data, columns)

    def avg_salary_sql(self, input_df):
        """Fill null salaries with the average salary, using Spark SQL.

        Registers (or replaces) the temp view ``emp`` from ``input_df`` as a
        side effect.

        Args:
            input_df: DataFrame with ``emp_id``, ``emp_name``, ``salary`` and
                ``department`` columns.

        Returns:
            A DataFrame with the same four columns in which every null
            ``salary`` has been replaced by the average salary (nulls counted
            as 0).
        """
        input_df.createOrReplaceTempView('emp')
        # The average is computed by a scalar subquery inside the same
        # statement rather than being collected to the driver and formatted
        # into the SQL text.  That avoids an extra Spark job, keeps the query
        # valid when the view is empty (a Python None would otherwise be
        # spliced in as the bare token `None`), and stops the literal's SQL
        # type from depending on how the float happens to print.
        return spark.sql("""
            SELECT emp_id, emp_name,
                   COALESCE(
                       salary,
                       (SELECT AVG(COALESCE(salary, 0)) FROM emp)
                   ) AS salary,
                   department
            FROM emp
        """)

    def fill_missing_salary(self, input_df):
        """Fill null salaries with the average salary, using the DataFrame API.

        Args:
            input_df: DataFrame with a ``salary`` column.

        Returns:
            ``input_df`` with every null ``salary`` replaced by the average
            salary (nulls counted as 0); all other columns are unchanged.
        """
        # Null salaries are NOT excluded from the average: coalescing them to
        # 0 keeps them in the row count, which pulls the average down.
        avg_salary_df = input_df.select(
            avg(coalesce(col('salary'), lit(0))).alias('avg_salary')
        )
        # A global aggregate always yields exactly one row; its value is None
        # when input_df has no rows, which lit() turns into a SQL NULL.
        avg_salary = avg_salary_df.first()['avg_salary']
        return input_df.withColumn(
            'salary', coalesce(col('salary'), lit(avg_salary))
        )

# Build the sample data and display it before any filling is applied.
obj = Emp()
input_df = obj.createData()
input_df.show()

# Run the SQL-based fill first, then the DataFrame-API fill, showing each
# result; `output` ends up bound to the result of the last one.
for fill_method in (obj.avg_salary_sql, obj.fill_missing_salary):
    output = fill_method(input_df)
    output.show()

+------+--------+------+----------+
|emp_id|emp_name|salary|department|
+------+--------+------+----------+
|     1|       A|  null|         X|
|     2|       B|   200|         Y|
|     3|       C|   300|         Z|
|     4|       D|  null|         Y|
|     5|       E|    50|         Z|
+------+--------+------+----------+

Row(avg_salary=110.0)
+------+--------+------+----------+
|emp_id|emp_name|salary|department|
+------+--------+------+----------+
|     1|       A| 110.0|         X|
|     2|       B| 200.0|         Y|
|     3|       C| 300.0|         Z|
|     4|       D| 110.0|         Y|
|     5|       E|  50.0|         Z|
+------+--------+------+----------+

