In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.appName("NullHandlingDemo").getOrCreate()

# Every column is declared nullable so NULLs may appear in any field.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("country", StringType(), True)
    .add("department", StringType(), True)
    .add("salary", IntegerType(), True)
    .add("sales", IntegerType(), True)
    .add("date", DateType(), True)
)

# Sample rows in schema order: (id, country, department, salary, sales, date).
# NULLs are scattered across id/department/salary/sales; `date` is NULL on
# every row.
data = [
    (1,    "India", "IT", 50000, 100,  None),
    (2,    "India", None, None,  200,  None),
    (3,    "India", "HR", 40000, None, None),
    (4,    "USA",   "IT", 60000, 300,  None),
    (5,    "USA",   "HR", None,  100,  None),
    (None, "USA",   "HR", 45000, 150,  None),
]

df = spark.createDataFrame(data=data, schema=schema)

df.show()


+----+-------+----------+------+-----+----+
|  id|country|department|salary|sales|date|
+----+-------+----------+------+-----+----+
|   1|  India|        IT| 50000|  100|null|
|   2|  India|      null|  null|  200|null|
|   3|  India|        HR| 40000| null|null|
|   4|    USA|        IT| 60000|  300|null|
|   5|    USA|        HR|  null|  100|null|
|null|    USA|        HR| 45000|  150|null|
+----+-------+----------+------+-----+----+



1. Detecting NULL Values
Using isNull() and isNotNull() \
Use case: Data quality checks, validations

In [2]:
from pyspark.sql.functions import col

# Rows whose salary is missing ...
df.where(col("salary").isNull()).show()
# ... and the complementary set of rows whose salary is present.
df.where(col("salary").isNotNull()).show()

+---+-------+----------+------+-----+----+
| id|country|department|salary|sales|date|
+---+-------+----------+------+-----+----+
|  2|  India|      null|  null|  200|null|
|  5|    USA|        HR|  null|  100|null|
+---+-------+----------+------+-----+----+

+----+-------+----------+------+-----+----+
|  id|country|department|salary|sales|date|
+----+-------+----------+------+-----+----+
|   1|  India|        IT| 50000|  100|null|
|   3|  India|        HR| 40000| null|null|
|   4|    USA|        IT| 60000|  300|null|
|null|    USA|        HR| 45000|  150|null|
+----+-------+----------+------+-----+----+



2. Removing NULL Values \
A. Drop rows containing any NULL

In [3]:
# Drop a row as soon as ANY of its columns is NULL. In this sample `date` is
# NULL on every row, so the result is empty.
df.na.drop(how="any").show()

+---+-------+----------+------+-----+----+
| id|country|department|salary|sales|date|
+---+-------+----------+------+-----+----+
+---+-------+----------+------+-----+----+



B. Drop rows with NULLs in specific columns

In [4]:
# Only `salary` and `department` are inspected; a NULL in either one drops the
# row, while NULLs in the other columns are tolerated.
df.na.drop(how="any", subset=["salary", "department"]).show()

+----+-------+----------+------+-----+----+
|  id|country|department|salary|sales|date|
+----+-------+----------+------+-----+----+
|   1|  India|        IT| 50000|  100|null|
|   3|  India|        HR| 40000| null|null|
|   4|    USA|        IT| 60000|  300|null|
|null|    USA|        HR| 45000|  150|null|
+----+-------+----------+------+-----+----+



5. Replacing NULL Values \
A. Using fillna() (Most Common)

In [9]:
# A numeric fill value only replaces NULLs in numeric columns (id, salary,
# sales); the string and date columns keep their NULLs.
df.na.fill(0).show()

+---+-------+----------+------+-----+----+
| id|country|department|salary|sales|date|
+---+-------+----------+------+-----+----+
|  1|  India|        IT| 50000|  100|null|
|  2|  India|      null|     0|  200|null|
|  3|  India|        HR| 40000|    0|null|
|  4|    USA|        IT| 60000|  300|null|
|  5|    USA|        HR|     0|  100|null|
|  0|    USA|        HR| 45000|  150|null|
+---+-------+----------+------+-----+----+



B. Using coalesce() (Preferred in expressions)

In [10]:
from pyspark.sql.functions import coalesce, lit

# Append `salary_clean`: the salary when present, otherwise the literal 0.
# The original `salary` column is left untouched.
df.select(
    "*",
    coalesce(col("salary"), lit(0)).alias("salary_clean"),
).show()

+----+-------+----------+------+-----+----+------------+
|  id|country|department|salary|sales|date|salary_clean|
+----+-------+----------+------+-----+----+------------+
|   1|  India|        IT| 50000|  100|null|       50000|
|   2|  India|      null|  null|  200|null|           0|
|   3|  India|        HR| 40000| null|null|       40000|
|   4|    USA|        IT| 60000|  300|null|       60000|
|   5|    USA|        HR|  null|  100|null|           0|
|null|    USA|        HR| 45000|  150|null|       45000|
+----+-------+----------+------+-----+----+------------+



6. NULLs in Aggregations
Important Behaviors

sum(), avg(), min(), max() → Ignore NULLs

count(column) → Counts non-NULL values

count(*) → Counts all rows

In [11]:
from pyspark.sql.functions import sum, avg, count

# Whole-frame aggregation: sum/avg/count over `salary` skip NULL salaries,
# whereas count("*") counts every row.
df.agg(
    sum("salary"),
    avg("salary"),
    count("salary"),
    count("*"),
).show()

+-----------+-----------+-------------+--------+
|sum(salary)|avg(salary)|count(salary)|count(1)|
+-----------+-----------+-------------+--------+
|     195000|    48750.0|            4|       6|
+-----------+-----------+-------------+--------+



In [12]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, coalesce, lit


# One window per country, ordered by date.
window = Window.partitionBy("country").orderBy("date")

# NULL sales are treated as 0 before summing over the window.
# NOTE(review): every `date` in the sample data is NULL, so all rows of a
# country are ordering peers and each row receives the full partition total
# rather than a progressively growing running total.
df.select(
    "*",
    sum(coalesce(col("sales"), lit(0))).over(window).alias("running_total"),
).show()

+----+-------+----------+------+-----+----+-------------+
|  id|country|department|salary|sales|date|running_total|
+----+-------+----------+------+-----+----+-------------+
|   1|  India|        IT| 50000|  100|null|          300|
|   2|  India|      null|  null|  200|null|          300|
|   3|  India|        HR| 40000| null|null|          300|
|   4|    USA|        IT| 60000|  300|null|          550|
|   5|    USA|        HR|  null|  100|null|          550|
|null|    USA|        HR| 45000|  150|null|          550|
+----+-------+----------+------+-----+----+-------------+

