In [18]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import lit
# lit() creates a column that holds a literal (constant) value
from pyspark.sql.types import StringType

In [19]:
# Obtain the session explicitly instead of relying on the `sc` / `spark`
# globals that only the pyspark shell injects. getOrCreate() returns the
# already-running session when there is one, so this is safe to re-run.
spark = SparkSession.builder.getOrCreate()

# Sample server metrics; note that '101 Server' appears twice.
server_rows = [
    Row(server_name='101 Server', cpu_utilization=85, session_count=80),
    Row(server_name='101 Server', cpu_utilization=80, session_count=90),
    Row(server_name='102 Server', cpu_utilization=85, session_count=40),
    Row(server_name='103 Server', cpu_utilization=70, session_count=80),
    Row(server_name='104 Server', cpu_utilization=60, session_count=80),
]

# Build the DataFrame straight from the Row list; column names and types are
# inferred from the Row fields, with no detour through an RDD.
df = spark.createDataFrame(server_rows)

In [20]:
# Display the DataFrame built from the five sample server rows.
df.show()

+-----------+---------------+-------------+
|server_name|cpu_utilization|session_count|
+-----------+---------------+-------------+
| 101 Server|             85|           80|
| 101 Server|             80|           90|
| 102 Server|             85|           40|
| 103 Server|             70|           80|
| 104 Server|             60|           80|
+-----------+---------------+-------------+



In [21]:
# Add a column `na_col` that holds a NULL literal cast to StringType.

In [22]:
# An untyped lit(None) would produce a NullType column, so the NULL literal is
# cast to a string column before being attached as `na_col`.
null_string_column = lit(None).cast(StringType())
df_na = df.withColumn('na_col', null_string_column)

In [23]:
# Display df_na: the original three columns plus the all-NULL string column `na_col`.
df_na.show()

+-----------+---------------+-------------+------+
|server_name|cpu_utilization|session_count|na_col|
+-----------+---------------+-------------+------+
| 101 Server|             85|           80|  NULL|
| 101 Server|             80|           90|  NULL|
| 102 Server|             85|           40|  NULL|
| 103 Server|             70|           80|  NULL|
| 104 Server|             60|           80|  NULL|
+-----------+---------------+-------------+------+



In [24]:
# Replace NULLs with 'A' through the DataFrameNaFunctions accessor (the same
# operation as DataFrame.fillna). A string fill value only touches
# string-typed columns, so the numeric columns are left alone.
df_na.na.fill('A').show()

+-----------+---------------+-------------+------+
|server_name|cpu_utilization|session_count|na_col|
+-----------+---------------+-------------+------+
| 101 Server|             85|           80|     A|
| 101 Server|             80|           90|     A|
| 102 Server|             85|           40|     A|
| 103 Server|             70|           80|     A|
| 104 Server|             60|           80|     A|
+-----------+---------------+-------------+------+



In [25]:
# Stack the NULL-filled rows on top of the original NULL rows so that df2
# contains both kinds; union() matches columns by position.
filled_rows = df_na.fillna('A')
df2 = filled_rows.union(df_na)

In [26]:
# Display df2: the rows with `na_col` filled, followed by the rows where it is still NULL.
df2.show()

+-----------+---------------+-------------+------+
|server_name|cpu_utilization|session_count|na_col|
+-----------+---------------+-------------+------+
| 101 Server|             85|           80|     A|
| 101 Server|             80|           90|     A|
| 102 Server|             85|           40|     A|
| 103 Server|             70|           80|     A|
| 104 Server|             60|           80|     A|
| 101 Server|             85|           80|  NULL|
| 101 Server|             80|           90|  NULL|
| 102 Server|             85|           40|  NULL|
| 103 Server|             70|           80|  NULL|
| 104 Server|             60|           80|  NULL|
+-----------+---------------+-------------+------+



In [27]:
# Drop every row containing at least one NULL (DataFrame.dropna is the same
# operation as na.drop), leaving only the rows whose `na_col` was filled.
df2.dropna().show()

+-----------+---------------+-------------+------+
|server_name|cpu_utilization|session_count|na_col|
+-----------+---------------+-------------+------+
| 101 Server|             85|           80|     A|
| 101 Server|             80|           90|     A|
| 102 Server|             85|           40|     A|
| 103 Server|             70|           80|     A|
| 104 Server|             60|           80|     A|
+-----------+---------------+-------------+------+



In [28]:
# Register df2 as the temporary view "na_table" so it can be queried through spark.sql.
df2.createOrReplaceTempView("na_table")

In [29]:
# Query the whole temp view: both the filled and the NULL rows come back.
all_rows = spark.sql("SELECT * FROM na_table")
all_rows.show()

+-----------+---------------+-------------+------+
|server_name|cpu_utilization|session_count|na_col|
+-----------+---------------+-------------+------+
| 101 Server|             85|           80|     A|
| 101 Server|             80|           90|     A|
| 102 Server|             85|           40|     A|
| 103 Server|             70|           80|     A|
| 104 Server|             60|           80|     A|
| 101 Server|             85|           80|  NULL|
| 101 Server|             80|           90|  NULL|
| 102 Server|             85|           40|  NULL|
| 103 Server|             70|           80|  NULL|
| 104 Server|             60|           80|  NULL|
+-----------+---------------+-------------+------+



In [31]:
# Keep only the rows whose `na_col` is NULL; SQL needs IS NULL here because
# `= NULL` never evaluates to true.
null_rows = spark.sql("SELECT * FROM na_table WHERE na_col IS NULL")
null_rows.show()

+-----------+---------------+-------------+------+
|server_name|cpu_utilization|session_count|na_col|
+-----------+---------------+-------------+------+
| 101 Server|             85|           80|  NULL|
| 101 Server|             80|           90|  NULL|
| 102 Server|             85|           40|  NULL|
| 103 Server|             70|           80|  NULL|
| 104 Server|             60|           80|  NULL|
+-----------+---------------+-------------+------+



In [32]:
# Complementary filter: keep only the rows whose `na_col` has a value.
non_null_rows = spark.sql("SELECT * FROM na_table WHERE na_col IS NOT NULL")
non_null_rows.show()

+-----------+---------------+-------------+------+
|server_name|cpu_utilization|session_count|na_col|
+-----------+---------------+-------------+------+
| 101 Server|             85|           80|     A|
| 101 Server|             80|           90|     A|
| 102 Server|             85|           40|     A|
| 103 Server|             70|           80|     A|
| 104 Server|             60|           80|     A|
+-----------+---------------+-------------+------+

