In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType

In [2]:
# Obtain the SparkSession explicitly instead of relying on kernel-injected
# `sc` / `spark` globals: getOrCreate() returns the already-active session when
# one exists and creates one otherwise, so the notebook also runs from a fresh
# kernel that does not predefine them.
spark = SparkSession.builder.getOrCreate()

# Sample server metrics used by every cell below; '101 Server' appears twice.
server_rows = [
    Row(server_name='101 Server', cpu_utilization=85, session_count=80),
    Row(server_name='101 Server', cpu_utilization=80, session_count=90),
    Row(server_name='102 Server', cpu_utilization=85, session_count=40),
    Row(server_name='103 Server', cpu_utilization=70, session_count=80),
    Row(server_name='104 Server', cpu_utilization=60, session_count=80),
]

# Column names and types are inferred from the Row objects.
df = spark.createDataFrame(server_rows)

In [3]:
# Print the sample frame. In the recorded output the columns are listed
# alphabetically rather than in the keyword order used when building the Rows
# (NOTE(review): presumably the Spark version in use sorted Row keyword fields;
# newer versions may keep insertion order -- confirm before relying on it).
df.show()

+---------------+-----------+-------------+
|cpu_utilization|server_name|session_count|
+---------------+-----------+-------------+
|             85| 101 Server|           80|
|             80| 101 Server|           90|
|             85| 102 Server|           40|
|             70| 103 Server|           80|
|             60| 104 Server|           80|
+---------------+-----------+-------------+



In [4]:
# Build a null literal cast to string so the new column gets a concrete
# StringType, then attach it to every row as 'na_col'.
null_string = lit(None).cast(StringType())
df_na = df.withColumn('na_col', null_string)

In [5]:
# Confirm the new column: na_col prints as null for all five rows.
df_na.show()

+---------------+-----------+-------------+------+
|cpu_utilization|server_name|session_count|na_col|
+---------------+-----------+-------------+------+
|             85| 101 Server|           80|  null|
|             80| 101 Server|           90|  null|
|             85| 102 Server|           40|  null|
|             70| 103 Server|           80|  null|
|             60| 104 Server|           80|  null|
+---------------+-----------+-------------+------+



In [6]:
# Replace nulls with the string 'A' and display the result. The filled frame
# is not assigned, so df_na itself still holds the nulls (the next cells reuse
# it that way). `na.fill` is the same operation as `fillna`.
df_na.na.fill('A').show()

+---------------+-----------+-------------+------+
|cpu_utilization|server_name|session_count|na_col|
+---------------+-----------+-------------+------+
|             85| 101 Server|           80|     A|
|             80| 101 Server|           90|     A|
|             85| 102 Server|           40|     A|
|             70| 103 Server|           80|     A|
|             60| 104 Server|           80|     A|
+---------------+-----------+-------------+------+



In [7]:
# Stack the filled rows on top of the untouched rows so the combined frame
# holds both non-null ('A') and null values in na_col.
df_filled = df_na.na.fill('A')
df2 = df_filled.union(df_na)

In [8]:
# Ten rows expected: the five filled rows ('A') followed by the five null rows.
df2.show()

+---------------+-----------+-------------+------+
|cpu_utilization|server_name|session_count|na_col|
+---------------+-----------+-------------+------+
|             85| 101 Server|           80|     A|
|             80| 101 Server|           90|     A|
|             85| 102 Server|           40|     A|
|             70| 103 Server|           80|     A|
|             60| 104 Server|           80|     A|
|             85| 101 Server|           80|  null|
|             80| 101 Server|           90|  null|
|             85| 102 Server|           40|  null|
|             70| 103 Server|           80|  null|
|             60| 104 Server|           80|  null|
+---------------+-----------+-------------+------+



In [9]:
# Discard every row that contains a null and display what is left; only the
# five rows filled with 'A' remain. `dropna` is the same operation as
# `na.drop`. df2 itself is left unchanged.
df2.dropna().show()

+---------------+-----------+-------------+------+
|cpu_utilization|server_name|session_count|na_col|
+---------------+-----------+-------------+------+
|             85| 101 Server|           80|     A|
|             80| 101 Server|           90|     A|
|             85| 102 Server|           40|     A|
|             70| 103 Server|           80|     A|
|             60| 104 Server|           80|     A|
+---------------+-----------+-------------+------+



In [10]:
# Register df2 under the name "na_table" so the cells below can query it with
# spark.sql; an existing view with the same name is replaced.
df2.createOrReplaceTempView("na_table")

In [11]:
# Read the whole temp view back through SQL: all ten rows, nulls included.
all_rows = spark.sql("SELECT * FROM na_table")
all_rows.show()

+---------------+-----------+-------------+------+
|cpu_utilization|server_name|session_count|na_col|
+---------------+-----------+-------------+------+
|             85| 101 Server|           80|     A|
|             80| 101 Server|           90|     A|
|             85| 102 Server|           40|     A|
|             70| 103 Server|           80|     A|
|             60| 104 Server|           80|     A|
|             85| 101 Server|           80|  null|
|             80| 101 Server|           90|  null|
|             85| 102 Server|           40|  null|
|             70| 103 Server|           80|  null|
|             60| 104 Server|           80|  null|
+---------------+-----------+-------------+------+



In [12]:
# Keep only the rows whose na_col is NULL (the five unfilled rows).
null_rows = spark.sql("SELECT * FROM na_table WHERE na_col IS NULL")
null_rows.show()

+---------------+-----------+-------------+------+
|cpu_utilization|server_name|session_count|na_col|
+---------------+-----------+-------------+------+
|             85| 101 Server|           80|  null|
|             80| 101 Server|           90|  null|
|             85| 102 Server|           40|  null|
|             70| 103 Server|           80|  null|
|             60| 104 Server|           80|  null|
+---------------+-----------+-------------+------+



In [13]:
# Keep only the rows whose na_col has a value (the five rows filled with 'A').
filled_rows = spark.sql("SELECT * FROM na_table WHERE na_col IS NOT NULL")
filled_rows.show()

+---------------+-----------+-------------+------+
|cpu_utilization|server_name|session_count|na_col|
+---------------+-----------+-------------+------+
|             85| 101 Server|           80|     A|
|             80| 101 Server|           90|     A|
|             85| 102 Server|           40|     A|
|             70| 103 Server|           80|     A|
|             60| 104 Server|           80|     A|
+---------------+-----------+-------------+------+

