# Spark DataFrame Missing Data

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session named "Missing" against the master at spark://spark:7077
spark = (
    SparkSession.builder
    .master("spark://spark:7077")
    .appName("Missing")
    .getOrCreate()
)

In [None]:
# Load the CSV, treating the first line as column names and letting Spark infer column types
df = spark.read.csv(
    "/data/ContainsNull.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Preview the raw DataFrame before handling any missing values
df.show()

## Dropping Null Values

In [None]:
# Use .na.drop() with no arguments to drop every row that contains at least one null value
df.na.drop().show()

In [None]:
# Use .na.drop(thresh=<n>) to keep only rows with at least <n> non-null values
# (rows with fewer than <n> non-null values are dropped)
df.na.drop(thresh=2).show()

In [None]:
# Use .na.drop(how=<"any"|"all">): "any" drops a row if any of its values is null,
# "all" drops a row only if every one of its values is null
df.na.drop(how="all").show()


In [None]:
# Use .na.drop(subset=<[cols]>) to drop rows that have a null in any of the listed columns
df.na.drop(subset=["Sales"]).show()

In [None]:
# Filling Null Values

In [None]:
# Inspect the column types; the type of each column matters when choosing a fill value
df.printSchema()

In [None]:
# .na.fill() only fills columns whose type matches the fill value:
# a numeric value fills numeric columns and a string value fills string columns
df.na.fill(0).show()
# The string column is left unfilled because the fill value 0 is numeric

In [None]:
# Fill nulls only in the listed columns (here "Name") with the given value
df.na.fill("No Name", subset=["Name"]).show()

In [None]:
# Using computed mean value to fill column
from pyspark.sql.functions import mean

In [None]:
# Compute the mean of the Sales column and bring the result back to the driver
mean_val = df.select(mean(df["Sales"])).collect()

In [None]:
# .collect() returns a list of Row objects, so the mean is nested inside that list
mean_val

In [None]:
# Pull the scalar out of the first column of the first collected Row, then display it
mean_sales = mean_val[0][0]
print(mean_sales)

In [None]:
# Replace nulls in the Sales column with the previously computed mean_sales value
df.na.fill(mean_sales, subset=["Sales"]).show()