# Handling missing values in PySpark: isNull, isNotNull, na.drop, na.fill

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the active SparkSession if there is one; otherwise build a new one
# with the default configuration.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Load the house-price CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    "house_price.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the loaded data (20 rows is show()'s default limit); note the
# null entries in several columns.
df.show(n=20)

+-----------+-----------+---------+----------+---------+
|OverallQual|OverallCond|GrLivArea|GarageArea|SalePrice|
+-----------+-----------+---------+----------+---------+
|          7|          5|     1710|       548|   208500|
|          6|          8|     1262|       460|   181500|
|          7|          5|     1786|       608|   223500|
|          7|          5|     1717|       642|     null|
|          8|          5|     2198|       836|     null|
|          5|          5|     1362|       480|   143000|
|          8|          5|     null|       636|   307000|
|          7|          6|     null|       484|   200000|
|          7|          5|     1774|       468|   129900|
|          5|          6|     1077|       205|   118000|
|          5|          5|     1040|       384|   129500|
|          9|          5|     2324|      null|   345000|
|          5|       null|      912|      null|   144000|
|          7|       null|     1494|      null|   279500|
|          6|          5|     1

In [5]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- OverallQual: integer (nullable = true)
 |-- OverallCond: integer (nullable = true)
 |-- GrLivArea: integer (nullable = true)
 |-- GarageArea: integer (nullable = true)
 |-- SalePrice: integer (nullable = true)



In [6]:
# isNotNull: keep only the rows where SalePrice has a value.
df.where(df["SalePrice"].isNotNull()).show()

+-----------+-----------+---------+----------+---------+
|OverallQual|OverallCond|GrLivArea|GarageArea|SalePrice|
+-----------+-----------+---------+----------+---------+
|          7|          5|     1710|       548|   208500|
|          6|          8|     1262|       460|   181500|
|          7|          5|     1786|       608|   223500|
|          5|          5|     1362|       480|   143000|
|          8|          5|     null|       636|   307000|
|          7|          6|     null|       484|   200000|
|          7|          5|     1774|       468|   129900|
|          5|          6|     1077|       205|   118000|
|          5|          5|     1040|       384|   129500|
|          9|          5|     2324|      null|   345000|
|          5|       null|      912|      null|   144000|
|          7|       null|     1494|      null|   279500|
|          6|          5|     1253|       352|   157000|
|          7|          8|      854|       576|   132000|
|          6|          7|     1

In [7]:
# isNull: keep only the rows where SalePrice is missing.
df.where(df["SalePrice"].isNull()).show()

+-----------+-----------+---------+----------+---------+
|OverallQual|OverallCond|GrLivArea|GarageArea|SalePrice|
+-----------+-----------+---------+----------+---------+
|          7|          5|     1717|       642|     null|
|          8|          5|     2198|       836|     null|
+-----------+-----------+---------+----------+---------+



In [9]:
# Same null check on a different column: rows where OverallCond is missing.
df.where(df["OverallCond"].isNull()).show()

+-----------+-----------+---------+----------+---------+
|OverallQual|OverallCond|GrLivArea|GarageArea|SalePrice|
+-----------+-----------+---------+----------+---------+
|          5|       null|      912|      null|   144000|
|          7|       null|     1494|      null|   279500|
|       null|       null|     1296|       516|    90000|
+-----------+-----------+---------+----------+---------+



In [10]:
# Drop every row that contains a null in any column ("any" is the default
# mode). This returns a new DataFrame; `df` itself is not modified.
df.dropna(how="any").show()

+-----------+-----------+---------+----------+---------+
|OverallQual|OverallCond|GrLivArea|GarageArea|SalePrice|
+-----------+-----------+---------+----------+---------+
|          7|          5|     1710|       548|   208500|
|          6|          8|     1262|       460|   181500|
|          7|          5|     1786|       608|   223500|
|          5|          5|     1362|       480|   143000|
|          7|          5|     1774|       468|   129900|
|          5|          6|     1077|       205|   118000|
|          5|          5|     1040|       384|   129500|
|          6|          5|     1253|       352|   157000|
|          7|          8|      854|       576|   132000|
|          6|          7|     1004|       480|   149000|
|          5|          5|     1114|       576|   159000|
+-----------+-----------+---------+----------+---------+



In [11]:
# Replace nulls per column using a {column: replacement} mapping. Columns not
# listed (e.g. GrLivArea, OverallCond) keep their nulls. The result is a new
# DataFrame that is only displayed here, not stored.
df.fillna(
    {
        "GarageArea": 500,
        "SalePrice": 12345,
    }
).show()

+-----------+-----------+---------+----------+---------+
|OverallQual|OverallCond|GrLivArea|GarageArea|SalePrice|
+-----------+-----------+---------+----------+---------+
|          7|          5|     1710|       548|   208500|
|          6|          8|     1262|       460|   181500|
|          7|          5|     1786|       608|   223500|
|          7|          5|     1717|       642|    12345|
|          8|          5|     2198|       836|    12345|
|          5|          5|     1362|       480|   143000|
|          8|          5|     null|       636|   307000|
|          7|          6|     null|       484|   200000|
|          7|          5|     1774|       468|   129900|
|          5|          6|     1077|       205|   118000|
|          5|          5|     1040|       384|   129500|
|          9|          5|     2324|       500|   345000|
|          5|       null|      912|       500|   144000|
|          7|       null|     1494|       500|   279500|
|          6|          5|     1

In [15]:
# NOTE(review): this cell's execution count (15) is higher than that of the
# cell below it (13), which rebinds `df` to the filled frame. The saved output
# therefore shows the filled values (12345 / 500), not the raw data. On a
# fresh top-to-bottom run this cell would display the original nulls instead.
df.show(n=20)

+-----------+-----------+---------+----------+---------+
|OverallQual|OverallCond|GrLivArea|GarageArea|SalePrice|
+-----------+-----------+---------+----------+---------+
|          7|          5|     1710|       548|   208500|
|          6|          8|     1262|       460|   181500|
|          7|          5|     1786|       608|   223500|
|          7|          5|     1717|       642|    12345|
|          8|          5|     2198|       836|    12345|
|          5|          5|     1362|       480|   143000|
|          8|          5|     null|       636|   307000|
|          7|          6|     null|       484|   200000|
|          7|          5|     1774|       468|   129900|
|          5|          6|     1077|       205|   118000|
|          5|          5|     1040|       384|   129500|
|          9|          5|     2324|       500|   345000|
|          5|       null|      912|       500|   144000|
|          7|       null|     1494|       500|   279500|
|          6|          5|     1

In [13]:
# Persist the fill by rebinding `df` to the returned DataFrame (fill does not
# modify a frame in place). From here on `df` no longer refers to the raw CSV
# data, so any earlier cell re-run after this one will see the filled values.
df = df.fillna(
    {
        "GarageArea": 500,
        "SalePrice": 12345,
    }
)

In [14]:
# Confirm that GarageArea and SalePrice no longer contain nulls; the other
# columns were not part of the fill and may still have them.
df.show(n=20)

+-----------+-----------+---------+----------+---------+
|OverallQual|OverallCond|GrLivArea|GarageArea|SalePrice|
+-----------+-----------+---------+----------+---------+
|          7|          5|     1710|       548|   208500|
|          6|          8|     1262|       460|   181500|
|          7|          5|     1786|       608|   223500|
|          7|          5|     1717|       642|    12345|
|          8|          5|     2198|       836|    12345|
|          5|          5|     1362|       480|   143000|
|          8|          5|     null|       636|   307000|
|          7|          6|     null|       484|   200000|
|          7|          5|     1774|       468|   129900|
|          5|          6|     1077|       205|   118000|
|          5|          5|     1040|       384|   129500|
|          9|          5|     2324|       500|   345000|
|          5|       null|      912|       500|   144000|
|          7|       null|     1494|       500|   279500|
|          6|          5|     1